"""Resume-parsing pipeline.

Extracts text from resume files (PDF / image / zip of files) via OCR,
cleans the OCR output with an LLM, parses it into a structured record,
and appends unique records to a local CSV (``./resumes.csv``).  A simple
skill/experience filter over that CSV is provided for candidate search.
"""

# --- standard library ---
import json
import os
import shutil
import tempfile
import zipfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# --- third-party ---
import chromadb
import cv2
import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import pytesseract
import regex as re
from PIL import Image
from openai import OpenAI

from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_deepseek import ChatDeepSeek
from langchain_ollama import ChatOllama, OllamaEmbeddings, OllamaLLM

# Prompt used to turn raw (OCR-corrected) resume text into the structured
# record described by `response_schemas` below.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.

Input text: {text}

Provide outputs in the following format:
{format_instructions}
"""

# Response schemas: one per field the LLM must extract from a resume.
name_schema = ResponseSchema(
    name="Name",
    description="Name of the person in resume text applying for job? Answer noun as string or unknown.",
)
job_role_schema = ResponseSchema(
    name="Job_role",
    description="What is the job role the person is applying for?",
)
skills_schema = ResponseSchema(
    name="Skills",
    description="All the skill in resume text and output them as a comma separated Python list.",
    type="list",
)
exp_schema = ResponseSchema(
    name="Experience",
    description="How much experience in years he has in resume text which is a number",
    type="integer",
)
info_schema = ResponseSchema(
    name="Personal Information",
    description=(
        "Extract the information of the person like Phone number, Address, City, "
        "Gender, Gmail and extract and save it in dictionary as key and values."
    ),
    type="dictionary",
)
prof_schema = ResponseSchema(
    name="Profile",
    description="What is his profile he is mentioned in text for the job application and summarize it.",
    type="string",
)
linkedin_schema = ResponseSchema(
    name="Linkedin",
    description="Linkedin link if available else unknown",
    type="string",
)

# Destination CSV for parsed resume records.
csv_path = "./resumes.csv"

# SECURITY: API keys are hard-coded here (and below in DEEPSEEK_API_KEY).
# They are preserved to keep runtime behavior identical, but they should be
# rotated and loaded from an environment variable / secrets manager instead
# of being committed to source control.
client = OpenAI(
    api_key="sk-02f34bd0ea4849e8a4232bc656e28727",  # TODO: move to env var
    base_url="https://api.deepseek.com/v1",
)

os.environ["DEEPSEEK_API_KEY"] = "sk-02f34bd0ea4849e8a4232bc656e28727"  # TODO: move to env var

# LLM used for the structured-extraction step (via LangChain).
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chat_prompt = ChatPromptTemplate.from_template(review_template)

# All fields we expect the LLM to return, and the parser/format-instructions
# derived from them.
response_schemas = [
    name_schema,
    job_role_schema,
    skills_schema,
    exp_schema,
    info_schema,
    prof_schema,
    linkedin_schema,
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()


def img_extract(img_file):
    """OCR a single image file and return the extracted text.

    Raises:
        ValueError: if the image cannot be read (cv2.imread silently
            returns None for missing/corrupt files, which would otherwise
            surface as an opaque pytesseract error).
    """
    img = cv2.imread(img_file)
    if img is None:
        raise ValueError(f"Could not read image file: {img_file}")
    return pytesseract.image_to_string(img)


def pdf_pages_image(page, matrix_scale=3):
    """Render one PDF page to a PIL RGB image at `matrix_scale`x zoom."""
    matrix = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=matrix)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)


def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rasterized in parallel, then OCR'd sequentially.
    NOTE(review): PyMuPDF documents are not documented as thread-safe;
    sharing `pdf_doc` across the pool workers should be confirmed.
    """
    pdf_doc = fitz.open(pdf_file)
    num_pages = pdf_doc.page_count
    print(num_pages)
    with ThreadPoolExecutor(max_workers=10) as executor:
        images = list(
            executor.map(lambda i: pdf_pages_image(pdf_doc[i], 5), range(num_pages))
        )
    print(len(images))
    # ''.join avoids quadratic string concatenation over many pages.
    return "".join(pytesseract.image_to_string(image) for image in images)


def zip_extract(file):
    """Diagnostic helper: list the entries of a zip archive.

    BUGFIX: the original referenced an undefined name `zip_path` instead of
    the `file` parameter, so every call raised NameError.  This helper only
    prints entry names; actual zip processing lives in `resume_parser`.
    """
    with zipfile.ZipFile(file, "r") as zip_ref:
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)


def parsing(text):
    """Run the structured-extraction prompt over `text`.

    Returns:
        dict: the LLM's JSON answer parsed into a Python dict.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions
    )
    response = get_completion(chat_prompt_message)
    print(response)
    # JsonOutputParser tolerates the markdown fencing the model sometimes
    # wraps around its JSON answer.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict


def file_extract(file, extension):
    """Dispatch OCR extraction by file extension; unknown types yield ''."""
    text = ""
    if extension.lower() in (".png", ".jpg", ".jpeg"):
        text = img_extract(file)
    elif extension.lower() == ".pdf":
        print("pdf")
        text = pdf_extract(file)
    print(text)
    return text


def get_completion(prompt):
    """Invoke the LangChain LLM and return the raw text content."""
    ai_msg = llm.invoke(prompt)
    return ai_msg.content


def correct_OCR(text):
    """Ask DeepSeek to fix OCR spelling/layout errors; fall back to input.

    Best-effort by design: on any API failure the original text is returned
    unchanged so the pipeline can continue.
    """
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text",
                },
                {"role": "user", "content": f"Content:\n{text}"},
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text


def processing(filepath):
    """Full pipeline for one file: OCR -> correct -> parse -> append to CSV.

    Duplicate candidates (same `Name` already in the CSV) are skipped.

    Returns:
        str: pretty-printed JSON of the parsed record (for display only).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)

    json_output = json.dumps(parsed_dict, indent=2)  # display only

    # Flatten nested keys ("Personal Information.Phone", ...) into columns.
    df = pd.json_normalize(parsed_dict)

    # Store list-valued skills as a comma-separated string for CSV.
    # Guarded: the LLM may omit the Skills field entirely.
    if "Skills" in df.columns:
        df["Skills"] = df["Skills"].apply(
            lambda x: ", ".join(x) if isinstance(x, list) else x
        )

    df["filename"] = filename

    # Fixed column order keeps appended CSV rows aligned with the header.
    desired_columns = [
        "Name",
        "Job_role",
        "Experience",
        "Skills",
        "Profile",
        "Linkedin",
        "Personal Information.Phone",
        "Personal Information.Gmail",
        "Personal Information.Address",
        "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # Append to CSV only if Name is unique.
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df["Name"].iloc[0] in existing_df["Name"].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output


def resume_parser(filepath):
    """Gradio entry point: process a single resume file or a zip of them.

    BUGFIX: the original compared the extension against 'docx' (no dot),
    but os.path.splitext returns '.docx', so Word files never matched.
    Comparisons are now also case-insensitive ('.PDF' etc.).
    """
    print(filepath)
    if not filepath:
        raise gr.Error("No file selected")

    ext = os.path.splitext(filepath)[1].lower()
    print(ext)

    if ext in (".pdf", ".docx"):
        print(filepath)
        json_output = processing(filepath)
        gr.Info("Data moved to database")
        return json_output

    if ext == ".zip":
        # Per-entry status report: filename -> 'processed' / 'not processed'.
        zip_files = {}
        # TemporaryDirectory auto-cleans the extracted files on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Extracting to temp dir: {temp_dir}")
            with zipfile.ZipFile(filepath, "r") as zip_ref:
                zip_ref.extractall(temp_dir)
                for file_info in zip_ref.infolist():
                    if file_info.is_dir():
                        continue
                    try:
                        extracted_file_path = os.path.join(temp_dir, file_info.filename)
                        print(f"Processing: {extracted_file_path}")
                        processing(extracted_file_path)
                        gr.Info(f"{file_info.filename} moved to database")
                        zip_files[file_info.filename] = "processed"
                    except Exception as err:
                        # Best-effort: one bad file must not abort the batch.
                        print(str(err))
                        gr.Warning(f"{file_info.filename} not processed")
                        zip_files[file_info.filename] = "not processed"
        return json.dumps(zip_files)

    raise gr.Error("No file selected")


def preprocess_skills(skill_text):
    """Normalize a skills value (list or delimited string) to a lowercase list."""
    if isinstance(skill_text, list):
        return [s.strip().lower() for s in skill_text]
    # Split on comma, slash, pipe, ampersand, hyphen, or whitespace.
    return [
        s.strip().lower()
        for s in re.split(r"[,/|&\-\s]+", str(skill_text))
        if s.strip()
    ]


def get_filtered_rows(exp, skills_description):
    """Filter stored resumes by minimum experience and/or skill overlap.

    A candidate matches when at least 40% of the requested skills appear in
    their Skills column; results are sorted by match ratio, then experience.
    Errors are returned as a one-row DataFrame rather than raised, so the
    UI always has something to render.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        df = pd.read_csv(csv_path)

        if not user_skills and exp_filter is None:
            return pd.DataFrame(
                [{"Message": "Please enter Experience and/or Skills to filter."}]
            )

        if user_skills:

            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills the candidate possesses.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills) if user_skills else 0

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match

        # Experience filter keeps rows with unknown (NaN) experience.
        if exp_filter is not None:
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        df = df.sort_values(
            by=["match_ratio" if "match_ratio" in df.columns else "Experience", "Experience"],
            ascending=[False, False],
        )
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])