Spaces:
Sleeping
Sleeping
| from langchain.output_parsers import ResponseSchema, StructuredOutputParser | |
| import gradio as gr | |
| import os, json | |
| import cv2, pytesseract | |
| import fitz | |
| from concurrent.futures import ThreadPoolExecutor | |
| from PIL import Image | |
| from langchain_ollama import ChatOllama | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain | |
| # Import required libraries | |
| from langchain_ollama import OllamaEmbeddings, OllamaLLM | |
| from langchain_core.output_parsers import JsonOutputParser | |
| # from langchain_text_splitters import CharacterTextSplitter | |
| import chromadb | |
| import shutil | |
| import json | |
| import pandas as pd | |
| from langchain_core.documents import Document | |
| from pathlib import Path | |
| import regex as re | |
| import tempfile | |
| import zipfile | |
| from openai import OpenAI | |
# Prompt template for structured resume extraction. {text} receives the
# OCR-corrected resume text and {format_instructions} receives the schema
# description rendered by StructuredOutputParser (see format_instructions below).
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.
Input text:
{text}
Provide outputs in the following format:
{format_instructions}
"""
# Projects Experience: Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.
# Projects Experience

# Field schemas handed to StructuredOutputParser: each one names an output key
# and tells the LLM what to extract for it. The `type` hints steer the model's
# JSON value type (list / integer / dictionary / string).
name_schema = ResponseSchema(name="Name",
                             description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(name="Job_role",
                                 description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(name="Skills",
                               description="All the skill in resume text and output them as a comma separated Python list.", type='list')
exp_schema = ResponseSchema(name="Experience",
                            description="How much experience in years he has in resume text which is a number", type='integer')
info_schema = ResponseSchema(name="Personal Information",
                             description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.", type='dictionary')
prof_schema = ResponseSchema(name="Profile",
                             description="What is his profile he is mentioned in text for the job application and summarize it.", type='string')
linkedin_schema = ResponseSchema(name="Linkedin",
                                 description="Linkedin link if available else unknown", type='string')
# proj_schema = ResponseSchema(name="Projects Experience",
#                              description="Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.",type='dictionary')
# Destination CSV for parsed resume rows (written/appended by processing()).
csv_path = './resumes.csv'

# Initialize the DeepSeek client (OpenAI-compatible endpoint).
# SECURITY(review): an API key was committed to source here — rotate it and
# supply DEEPSEEK_API_KEY via the environment. The literal is kept only as a
# backward-compatible fallback so existing deployments keep working.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY",
                           'sk-02f34bd0ea4849e8a4232bc656e28727'),
    base_url="https://api.deepseek.com/v1",
)
| # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe' | |
| # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( | |
| # encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0 | |
| # ) | |
from langchain_deepseek import ChatDeepSeek
import os

# SECURITY(review): hardcoded API key committed to source — rotate this key and
# inject it via the environment instead of assigning it here. Note this
# unconditionally overwrites any DEEPSEEK_API_KEY already present in the env.
os.environ["DEEPSEEK_API_KEY"]='sk-02f34bd0ea4849e8a4232bc656e28727'

# Chat model used by get_completion() / parsing() for structured extraction.
# temperature=0 keeps extraction deterministic; max_retries=2 bounds retries
# on transient API failures.
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)
# embedding = OllamaEmbeddings(model="deepseek-chat")

# Prompt template object shared by parsing(); formatted per-resume with the
# OCR text and the schema format instructions.
chat_prompt = ChatPromptTemplate.from_template(review_template)
# chat_prompt_message = chat_prompt.format_messages(text=pdf_text, format_instructions=format_instructions)

# Specify the response schema all the attribute we are expecting
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]
# proj_schema

# Parser that renders the schema list into the LLM-facing formatting
# instructions injected into the prompt above.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
def img_extract(img_file):
    """OCR a single image file and return the recognized text.

    Args:
        img_file: Path to an image readable by OpenCV (.png/.jpg/.jpeg).

    Returns:
        Text recognized by pytesseract.

    Raises:
        ValueError: If OpenCV cannot read the file. cv2.imread returns None
            instead of raising, which previously surfaced later as an opaque
            pytesseract/TypeError failure.
    """
    img = cv2.imread(img_file)
    if img is None:
        raise ValueError(f"Could not read image file: {img_file}")
    return pytesseract.image_to_string(img)
def pdf_pages_image(page, matrix_scale=3):
    """Rasterize one PyMuPDF page into a PIL RGB image.

    Args:
        page: A fitz (PyMuPDF) page object.
        matrix_scale: Zoom factor applied on both axes; higher values give
            higher-resolution renders for OCR.

    Returns:
        A PIL.Image built from the rendered pixmap.
    """
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    size = (pixmap.width, pixmap.height)
    return Image.frombytes("RGB", size, pixmap.samples)
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered in parallel (up to 10 threads) at 5x zoom via
    pdf_pages_image, then OCR'd sequentially with pytesseract.
    """
    doc = fitz.open(pdf_file)
    page_count = doc.page_count
    print(page_count)
    with ThreadPoolExecutor(max_workers=10) as executor:
        rendered = list(
            executor.map(lambda idx: pdf_pages_image(doc[idx], 5), range(page_count))
        )
    print(len(rendered))
    # Join per-page OCR output instead of accumulating with +=.
    return ''.join(pytesseract.image_to_string(img) for img in rendered)
def zip_extract(file):
    """Print the member names of a zip archive (debug/listing helper).

    Fixes a NameError in the original, which opened an undefined `zip_path`
    instead of the `file` parameter; also drops an unused `text` local.

    Args:
        file: Path to a .zip archive.

    Returns:
        None (output is printed only).
    """
    # Open the zip file
    with zipfile.ZipFile(file, 'r') as zip_ref:
        # Get list of all file names in the zip
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)
def parsing(text):
    """Run the extraction prompt over resume text and return the parsed dict.

    Formats the shared chat prompt with the resume text plus the schema
    format instructions, calls the LLM, and decodes the JSON payload of the
    response with JsonOutputParser (the StructuredOutputParser / 
    OutputFixingParser paths were dead, commented-out code and the
    OutputFixingParser import was unused — both removed).

    Args:
        text: OCR-corrected resume text.

    Returns:
        dict of extracted fields parsed from the model's JSON output.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions
    )
    # Calling the LLM
    response = get_completion(chat_prompt_message)
    print(response)
    # Convert the result into the expected output format
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict
def file_extract(file, extension):
    """Dispatch text extraction by file extension.

    Images (.png/.jpg/.jpeg) go through img_extract, PDFs through
    pdf_extract; any other extension yields an empty string.
    """
    ext = extension.lower()
    text = ''
    if ext in ('.png', '.jpg', '.jpeg'):
        text = img_extract(file)
    elif ext == '.pdf':
        print('pdf')
        text = pdf_extract(file)
        print(text)
    return text
def get_completion(prompt):
    """Invoke the shared DeepSeek chat model and return its text content."""
    return llm.invoke(prompt).content
def correct_OCR(text):
    """Ask DeepSeek to fix OCR mistakes and regroup text under headings.

    Best-effort: on any API failure the error is printed and the input text
    is returned unchanged.
    """
    system_msg = "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": f"Content:\n{text}"},
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
def processing(filepath):
    """Extract, correct, parse and persist a single resume file.

    Pipeline: OCR the file (image/PDF), LLM-correct the OCR text, parse it
    into a dict, flatten to one DataFrame row, and append it to csv_path
    unless a row with the same Name already exists.

    Args:
        filepath: Path to the resume file.

    Returns:
        Pretty-printed JSON string of the parsed fields (for UI display).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)
    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)  # dict
    json_output = json.dumps(parsed_dict, indent=2)  # For display purposes only
    # Flatten the nested JSON and convert to DataFrame
    df = pd.json_normalize(parsed_dict)
    # Ensure 'Skills' column is in string format for CSV.
    # FIX: the LLM may omit the Skills field entirely, which previously
    # raised KeyError here before the missing-column guard below ran.
    if 'Skills' in df.columns:
        df['Skills'] = df['Skills'].apply(
            lambda x: ', '.join(x) if isinstance(x, list) else x
        )
    # Add the filename to the DataFrame
    df['filename'] = filename
    # Define consistent column order so header-less appends stay aligned
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename"
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]
    # Append to CSV only if Name is unique
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
def resume_parser(filepath):
    """Gradio entry point: parse a single resume or a zip batch of resumes.

    Accepts .pdf/.docx (single resume) or .zip (batch extracted into a temp
    directory). Returns the parsed JSON string for a single file, or a
    filename->status JSON map for a zip; any other extension returns None.

    Raises:
        gr.Error: If no file was selected.
    """
    print(filepath)
    if not filepath:
        raise gr.Error('No file selected')
    # Normalize case so .PDF/.ZIP are handled too.
    ext = os.path.splitext(filepath)[1].lower()
    print(ext)  # Output: .jpg
    # BUG FIX: the original compared against 'docx' without the leading dot
    # that os.path.splitext always returns, so .docx files were rejected.
    if ext in ('.pdf', '.docx'):
        print(filepath)
        json_output = processing(filepath)
        # Push DataFrame into database as 'resume_data' table
        gr.Info('Data moved to database')
        return json_output
    elif ext == '.zip':
        zip_files = {}
        # TemporaryDirectory auto-cleans itself when the with-block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Extracting to temp dir: {temp_dir}")
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
                for file_info in zip_ref.infolist():
                    if file_info.is_dir():
                        continue
                    try:
                        extracted_file_path = os.path.join(temp_dir, file_info.filename)
                        print(f"Processing: {extracted_file_path}")
                        json_output = processing(extracted_file_path)
                        gr.Info(f'{file_info.filename} moved to database')
                        zip_files[file_info.filename] = 'processed'
                    except Exception as err:
                        # Best-effort batch: record the failure and continue.
                        print(str(err))
                        gr.Warning(f'{file_info.filename} not processed')
                        zip_files[file_info.filename] = 'not processed'
        return json.dumps(zip_files)
def preprocess_skills(skill_text):
    """Normalize a skills value into a list of lowercase tokens.

    A list input is lowercased/stripped element-wise; any other value is
    stringified and split on commas, slashes, pipes, ampersands, hyphens and
    whitespace, dropping empty tokens.
    """
    if isinstance(skill_text, list):
        return [entry.strip().lower() for entry in skill_text]
    raw_tokens = re.split(r"[,/|&\-\s]+", str(skill_text))
    return [tok.strip().lower() for tok in raw_tokens if tok.strip()]
def get_filtered_rows(exp, skills_description):
    """Filter saved resumes by minimum experience and/or desired skills.

    Args:
        exp: Minimum years of experience (string or number); falsy means
            "no experience filter".
        skills_description: Free-text / comma-separated desired skills; a
            candidate is kept when at least 40% of the requested skills match.

    Returns:
        A filtered, ranked DataFrame, or a single-row DataFrame carrying a
        "Message" (nothing to show) or "Error" (exception text) column.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []
        # Load CSV instead of DB
        df = pd.read_csv("./resumes.csv")
        # Return message if no input provided
        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])
        if user_skills:
            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills present for this candidate.
                candidate_list = preprocess_skills(candidate_skills)
                return len(set(user_skills) & set(candidate_list)) / len(user_skills)
            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match
        # Apply experience filter only if provided; rows with missing
        # Experience are kept rather than silently dropped.
        if exp_filter is not None:
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]
        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])
        # FIX: when no skills filter was applied the original sorted by
        # ["Experience", "Experience"] (duplicate key); sort each key once.
        if "match_ratio" in df.columns:
            sort_keys = ["match_ratio", "Experience"]
        else:
            sort_keys = ["Experience"]
        df = df.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])