PPSA committed on
Commit
cb5cfea
·
verified ·
1 Parent(s): 023406a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +30 -0
  2. main.py +288 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from main import resume_parser, get_filtered_rows

with gr.Blocks() as demo:
    with gr.Tabs():
        # Tab 1: upload one resume file (or pick a folder) and parse it to JSON.
        with gr.Tab("Resume Parser"):
            gr.Markdown("### 📂 Upload Resume or Folder")
            with gr.Row():
                file_input = gr.File(label="Upload Resume (PDF, DOCX)")
                folder_input = gr.FileExplorer(label="Select Folder")
            parse_btn = gr.Button("Parse")
            output_json = gr.Code(label="Parsed JSON", language="json")

            parse_btn.click(fn=resume_parser,
                            inputs=[file_input, folder_input],
                            outputs=output_json)

        # Tab 2: filter the stored candidates by experience and/or skills.
        with gr.Tab("Resume Filter"):
            gr.Markdown("### 🔍 Resume Filter by Experience and Skills")
            with gr.Row():
                exp_input = gr.Number(label="Minimum Experience (Years)")
                skills_input = gr.Textbox(label="Required Skills (comma-separated)",
                                          placeholder="e.g. Python, SQL, AWS")

            submit_btn = gr.Button("Filter Resumes")
            output = gr.Dataframe(label="Matching Candidates")

            submit_btn.click(fn=get_filtered_rows,
                             inputs=[exp_input, skills_input],
                             outputs=output)

if __name__ == "__main__":
    demo.launch(inbrowser=True)
main.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---------------------------------------------------------------------------
# Imports (kept from the original file; several are currently unused here but
# are preserved in case other modules rely on re-exports).
# ---------------------------------------------------------------------------
import os
import json
import shutil
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import pytesseract
import fitz  # PyMuPDF
import chromadb
import pandas as pd
import regex as re
import gradio as gr
from PIL import Image
from openai import OpenAI
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.vectorstores.chroma import Chroma  # currently unused
from langchain.text_splitter import RecursiveCharacterTextSplitter  # currently unused
from langchain_ollama import ChatOllama, OllamaEmbeddings, OllamaLLM  # currently unused
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.documents import Document  # currently unused
from langchain_deepseek import ChatDeepSeek

# Prompt used to pull structured fields out of OCR'd resume text.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.

Input text:
{text}

Provide outputs in the following format:
{format_instructions}
"""

# One ResponseSchema per field the LLM must return.
name_schema = ResponseSchema(
    name="Name",
    description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(
    name="Job_role",
    description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(
    name="Skills",
    description="All the skill in resume text and output them as a comma separated Python list.",
    type='list')
exp_schema = ResponseSchema(
    name="Experience",
    description="How much experience in years he has in resume text which is a number",
    type='integer')
info_schema = ResponseSchema(
    name="Personal Information",
    description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.",
    type='dictionary')
prof_schema = ResponseSchema(
    name="Profile",
    description="What is his profile he is mentioned in text for the job application and summarize it.",
    type='string')
linkedin_schema = ResponseSchema(
    name="Linkedin",
    description="Linkedin link if available else unknown",
    type='string')

# CSV file acting as the candidate "database".
csv_path = './resumes.csv'

# SECURITY FIX: the original source committed a live DeepSeek API key (twice).
# That key must be rotated; the credential is now read from the environment.
_deepseek_key = os.environ.get("DEEPSEEK_API_KEY", "")

# OpenAI-compatible client pointed at DeepSeek (used for OCR correction).
client = OpenAI(
    api_key=_deepseek_key,
    base_url="https://api.deepseek.com/v1",
)

# Tesseract binary location; overridable via TESSERACT_CMD so the module is
# not hard-wired to one developer's Windows install (original default kept).
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe')

# LangChain chat model used for structured extraction (reads
# DEEPSEEK_API_KEY from the environment itself).
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,       # deterministic extraction
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chat_prompt = ChatPromptTemplate.from_template(review_template)

# All schemas the structured output parser should enforce.
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
106
def img_extract(img_file):
    """OCR a single image file and return the recognized text.

    Parameters
    ----------
    img_file : str | Path
        Path to an image readable by OpenCV (.png/.jpg/.jpeg).

    Returns
    -------
    str
        Raw text produced by Tesseract.

    Raises
    ------
    ValueError
        If OpenCV cannot read the file.  The original code let `cv2.imread`
        return None silently, which surfaced later as a cryptic pytesseract
        TypeError instead of a clear message.
    """
    img = cv2.imread(str(img_file))
    if img is None:
        raise ValueError(f"Could not read image file: {img_file}")
    return pytesseract.image_to_string(img)
111
+
112
def pdf_pages_image(page, matrix_scale=3):
    """Render one PDF page to a PIL RGB image at the given zoom factor.

    A larger `matrix_scale` yields a higher-resolution render, which
    generally improves downstream OCR accuracy.
    """
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
116
+
117
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered to images in parallel (rendering dominates the cost),
    then OCR'd sequentially with Tesseract.
    """
    pdf_doc = fitz.open(pdf_file)
    num_pages = pdf_doc.page_count
    # Render all pages concurrently at 5x zoom for better OCR accuracy.
    with ThreadPoolExecutor(max_workers=10) as executor:
        images = list(executor.map(lambda i: pdf_pages_image(pdf_doc[i], 5),
                                   range(num_pages)))
    # str.join avoids the quadratic cost of repeated `+=` concatenation and
    # replaces the original's debug prints.
    return "".join(pytesseract.image_to_string(image) for image in images)
128
+
129
def parsing(text):
    """Run the extraction prompt over `text` and return the parsed dict.

    Formats the chat prompt with the resume text plus the structured-output
    format instructions, invokes the LLM, and parses the reply as JSON.

    Changes from the original: removed the dead in-function import of
    OutputFixingParser (never used), the misleading debug print
    ("Expected delivery in days"), and prints that dumped the full parsed
    resume (PII) to the console.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions)
    response = get_completion(chat_prompt_message)
    # The model is instructed (via format_instructions) to answer in JSON;
    # JsonOutputParser tolerates markdown fences around the payload.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    return output_dict
146
+
147
+
148
+
149
def file_extract(file, extension):
    """Dispatch text extraction by file extension.

    Supports images (.png/.jpg/.jpeg) via OCR and PDFs via page rendering +
    OCR.  NOTE(review): the UI advertises DOCX uploads, but there is no DOCX
    branch here — such files silently yield an empty string; confirm whether
    DOCX support is actually required.

    The original printed the entire extracted resume text (PII) to the
    console; those debug prints are removed.
    """
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        return img_extract(file)
    if ext == '.pdf':
        return pdf_extract(file)
    # Unsupported extension: preserve the original silent-empty behavior.
    return ''
159
+
160
def get_completion(prompt):
    """Send a formatted prompt to the DeepSeek chat model; return its text."""
    return llm.invoke(prompt).content
163
+
164
+
165
def correct_OCR(text):
    """Ask DeepSeek to clean up noisy OCR output.

    Sends the raw OCR text to the chat API with a correction/reorganization
    system prompt.  Falls back to the uncorrected text if the API call fails
    for any reason (best-effort behavior preserved from the original).
    """
    system_msg = "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": f"Content:\n{text}"},
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
180
+
181
def processing(filepath):
    """Extract, correct, and parse one resume file; append a row to the CSV.

    Returns the parsed result as a pretty-printed JSON string for display.
    Duplicate candidates (same Name already present in the CSV) are skipped.
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)

    parsed_dict = parsing(corrected_text)
    json_output = json.dumps(parsed_dict, indent=2)  # for display only

    # Flatten nested keys ("Personal Information.*") into columns.
    df = pd.json_normalize(parsed_dict)

    # Attach the source filename to the row.
    df['filename'] = filename

    # Define a consistent column order; create any column the LLM omitted so
    # the CSV schema stays stable across appended rows.
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # BUG FIX: normalize Skills only after the column is guaranteed to exist —
    # the original applied this before the column-completion loop and raised
    # KeyError whenever the parser returned no "Skills" field.
    df['Skills'] = df['Skills'].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else x)

    # Append to the CSV only if this Name is not already stored.
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
223
+
224
+
225
def resume_parser(filepath, folder):
    """Gradio handler: parse one uploaded file, or every file in a folder.

    Returns the parsed JSON string(s) for display in the output component.

    Raises
    ------
    gr.Error
        When neither a file nor a folder was provided.
    """
    if filepath:
        json_output = processing(filepath)
        gr.Info('Data moved to database')
        return json_output
    elif folder:
        # NOTE(review): assumes FileExplorer returns the folder path first,
        # then the selected files — confirm against the Gradio version used.
        files = folder[1:]
        # BUG FIX: the original discarded every per-file result (it built an
        # unused `results` list and returned gr.Info(...)'s return value).
        results = [processing(Path(file_path)) for file_path in files]
        gr.Info('Files moved to database')
        return "\n".join(results)
    else:
        raise gr.Error('No file selected')
245
+
246
+
247
def preprocess_skills(skill_text):
    """Normalize a skills value into a list of lowercase tokens.

    Lists are lowercased element-wise.  Anything else is stringified and
    split on commas, slashes, pipes, ampersands, hyphens, or whitespace;
    empty fragments are dropped.
    """
    if isinstance(skill_text, list):
        return [entry.strip().lower() for entry in skill_text]
    normalized = []
    for token in re.split(r"[,/|&\-\s]+", str(skill_text)):
        cleaned = token.strip()
        if cleaned:
            normalized.append(cleaned.lower())
    return normalized
252
+
253
def get_filtered_rows(exp, skills_description):
    """Filter stored resumes by minimum experience and/or required skills.

    Parameters
    ----------
    exp : number | None
        Minimum years of experience.  BUG FIX: a 0-year minimum is now a
        valid filter — the original tested truthiness (`if exp`) and treated
        0 as "no filter at all".
    skills_description : str
        Comma-separated required skills; a candidate must match at least
        40% of them.

    Returns
    -------
    pandas.DataFrame
        Matching candidates sorted best-first, or a one-row Message/Error
        frame (e.g. when no input was given or the CSV is missing).
    """
    try:
        exp_filter = int(exp) if exp is not None and exp != "" else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        # Load the CSV "database" written by processing().
        df = pd.read_csv("./resumes.csv")

        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])

        if user_skills:
            def skill_match_ratio(candidate_skills):
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills)

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # require >= 40% skill overlap

        if exp_filter is not None:
            # Keep rows with unknown experience so they are not silently lost.
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        # BUG FIX: when no skills were given the original sorted by the
        # duplicate key list ["Experience", "Experience"].
        if "match_ratio" in df.columns:
            sort_keys = ["match_ratio", "Experience"]
        else:
            sort_keys = ["Experience"]
        df = df.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))
        return df.drop(columns=["match_ratio"], errors="ignore")

    except Exception as e:
        # Surface any failure (e.g. missing CSV) as a one-row DataFrame so
        # the Gradio Dataframe component can display it.
        return pd.DataFrame([{"Error": str(e)}])
requirements.txt ADDED
Binary file (66.5 kB). View file