# Update main.py (PPSA, commit 3013983 verified) — web-page header artifact
# converted to a comment so the module parses.
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
import gradio as gr
import os, json
import cv2, pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
# Import required libraries
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_core.output_parsers import JsonOutputParser
# from langchain_text_splitters import CharacterTextSplitter
import chromadb
import shutil
import json
import pandas as pd
from langchain_core.documents import Document
from pathlib import Path
import regex as re
import tempfile
import zipfile
from openai import OpenAI
# Prompt template for the extraction LLM call: {text} receives the OCR-corrected
# resume text and {format_instructions} receives the JSON-schema instructions
# produced by the StructuredOutputParser defined later in this module.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.
Input text:
{text}
Provide outputs in the following format:
{format_instructions}
"""
# Projects Experience: Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.
# Projects Experience
# --- Response schemas: one per field the LLM must extract from a resume text. ---
name_schema = ResponseSchema(name="Name",
                             description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(name="Job_role",
                                 description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(name="Skills",
                               description="All the skill in resume text and output them as a comma separated Python list.", type='list')
exp_schema = ResponseSchema(name="Experience",
                            description="How much experience in years he has in resume text which is a number", type='integer')
info_schema = ResponseSchema(name="Personal Information",
                             description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.", type='dictionary')
prof_schema = ResponseSchema(name="Profile",
                             description="What is his profile he is mentioned in text for the job application and summarize it.", type='string')
linkedin_schema = ResponseSchema(name="Linkedin",
                                 description="Linkedin link if available else unknown", type='string')
# proj_schema = ResponseSchema(name="Projects Experience",
#                              description="Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.",type='dictionary')
# CSV file that accumulates one row per parsed resume (shared by processing()
# and get_filtered_rows()).
csv_path = './resumes.csv'
# Initialize the DeepSeek client (OpenAI-compatible API surface), used by
# correct_OCR() for OCR cleanup.
# SECURITY NOTE(review): the API key is hard-coded in source and therefore
# leaked to anyone with repo access — move it to an environment variable or
# secret store and rotate the key.
client = OpenAI(
    api_key='sk-02f34bd0ea4849e8a4232bc656e28727',  # Replace with your DeepSeek API key
    base_url="https://api.deepseek.com/v1",
)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
# NOTE(review): mid-file imports — conventionally these belong at the top of
# the module with the other imports.
from langchain_deepseek import ChatDeepSeek
import os
# SECURITY NOTE(review): hard-coded API key (same key as the OpenAI client
# above) — move to a secret store and rotate.
os.environ["DEEPSEEK_API_KEY"] = 'sk-02f34bd0ea4849e8a4232bc656e28727'
# LLM used for structured extraction; temperature 0 for deterministic output.
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)
# embedding = OllamaEmbeddings(model="deepseek-chat")
# Prompt object built from the extraction template defined above.
chat_prompt = ChatPromptTemplate.from_template(review_template)
# chat_prompt_message = chat_prompt.format_messages(text=pdf_text, format_instructions=format_instructions)
# Specify the response schema all the attribute we are expecting
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]
# proj_schema
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
# JSON-format instructions injected into the prompt's {format_instructions} slot.
format_instructions = output_parser.get_format_instructions()
def img_extract(img_file):
    """Run Tesseract OCR on an image file and return the recognized text."""
    loaded = cv2.imread(img_file)
    return pytesseract.image_to_string(loaded)
def pdf_pages_image(page, matrix_scale=3):
    """Render a PyMuPDF page to a PIL RGB image, upscaled by ``matrix_scale``."""
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered to images concurrently (rendering is the slow part),
    then OCR'd sequentially with Tesseract, preserving page order.

    Args:
        pdf_file: Path to the PDF file.

    Returns:
        str: OCR text of all pages concatenated in page order.
    """
    pdf_text = ''
    # Fix: use a context manager so the fitz document is always closed
    # (the original leaked the open document handle).
    with fitz.open(pdf_file) as pdf_doc:
        num_pages = pdf_doc.page_count
        print(num_pages)
        # Render pages in parallel at 5x scale for better OCR accuracy.
        with ThreadPoolExecutor(max_workers=10) as executor:
            images = list(executor.map(lambda page: pdf_pages_image(pdf_doc[page], 5), range(num_pages)))
        print(len(images))
        for image in images:
            pdf_text += pytesseract.image_to_string(image)
    return pdf_text
def zip_extract(file):
    """Print the name and path of every member of a zip archive.

    Bug fix: the original referenced an undefined global ``zip_path`` and so
    raised NameError on every call; it now opens the ``file`` argument. The
    unused ``text`` accumulator was also dropped.

    Args:
        file: Path to a .zip archive.
    """
    # Open the zip file
    with zipfile.ZipFile(file, 'r') as zip_ref:
        # Get list of all file names in the zip
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)
def parsing(text):
    """Ask the LLM to extract structured resume fields from free text.

    Formats the module-level chat prompt with the resume text plus the
    structured-output format instructions, calls the LLM via
    ``get_completion``, and parses the response into a dict with
    ``JsonOutputParser``.

    Args:
        text: OCR-corrected resume text.

    Returns:
        dict: Extracted fields (Name, Job_role, Skills, Experience, ...).
    """
    chat_prompt_message = chat_prompt.format_messages(text=text, format_instructions=format_instructions)
    # Calling the LLM
    response = get_completion(chat_prompt_message)
    print(response)
    # Fix: removed the unused function-level OutputFixingParser import left
    # over from a commented-out experiment.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict
def file_extract(file, extension):
    """Dispatch OCR extraction by file extension.

    Images go through Tesseract directly; PDFs are rendered page-by-page
    first. Unsupported extensions yield an empty string.
    """
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        return img_extract(file)
    if ext == '.pdf':
        print('pdf')
        extracted = pdf_extract(file)
        print(extracted)
        return extracted
    return ''
def get_completion(prompt):
    """Invoke the module-level LLM on ``prompt`` and return its text content."""
    return llm.invoke(prompt).content
def correct_OCR(text):
    """Ask DeepSeek to clean up OCR output; fall back to the raw text on error.

    Sends the OCR text to the chat model with instructions to fix spelling
    and regroup content under its headings. Any API failure is logged and the
    original text is returned unchanged (best-effort correction).
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"},
        {"role": "user", "content": f"Content:\n{text}"},
    ]
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
def processing(filepath):
    """Extract, correct, and parse one resume file, then append it to the CSV.

    Pipeline: OCR text extraction -> LLM OCR cleanup -> structured field
    parsing -> flatten to a one-row DataFrame -> de-duplicate by Name ->
    append to the shared ``csv_path`` file.

    Args:
        filepath: Path to a resume file (.pdf / image).

    Returns:
        str: Pretty-printed JSON of the parsed fields (for UI display).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)
    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)  # dict
    json_output = json.dumps(parsed_dict, indent=2)  # For display purposes only
    # Flatten the nested JSON and convert to DataFrame
    df = pd.json_normalize(parsed_dict)
    # Robustness fix: the LLM may omit 'Skills' entirely, in which case
    # json_normalize produces no such column — guard before stringifying
    # the list so we don't raise KeyError.
    if 'Skills' in df.columns:
        # Ensure 'Skills' column is in string format for CSV
        df['Skills'] = df['Skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    # Add the filename to the DataFrame
    df['filename'] = filename
    # Define consistent column order so appended rows line up with the header
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]
    # Append to CSV only if Name is unique
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
def resume_parser(filepath):
    """Gradio handler: parse a single resume or a zip archive of resumes.

    Args:
        filepath: Path to an uploaded .pdf/.docx resume or a .zip archive.

    Returns:
        str: JSON of the parsed fields for a single file, or a JSON status map
        (filename -> 'processed'/'not processed') for a zip archive.

    Raises:
        gr.Error: If no file was selected.
    """
    print(filepath)
    if filepath:
        # Lower-case so '.PDF' / '.Docx' uploads are recognized too.
        ext = os.path.splitext(filepath)[1].lower()
        print(ext)  # Output: .jpg
        # Bug fix: the original compared against 'docx' without the leading
        # dot, which os.path.splitext never produces, so .docx uploads were
        # silently rejected.
        if ext in ('.pdf', '.docx'):
            print(filepath)
            json_output = processing(filepath)
            # # Push DataFrame into database as 'resume_data' table
            gr.Info('Data moved to database')
            return json_output
        elif ext == '.zip':
            zip_files = {}
            # Create temporary directory; it auto-cleans on exit.
            with tempfile.TemporaryDirectory() as temp_dir:
                print(f"Extracting to temp dir: {temp_dir}")
                with zipfile.ZipFile(filepath, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)
                    for file_info in zip_ref.infolist():
                        if file_info.is_dir():
                            continue
                        try:
                            extracted_file_path = os.path.join(temp_dir, file_info.filename)
                            print(f"Processing: {extracted_file_path}")
                            json_output = processing(extracted_file_path)
                            gr.Info(f'{file_info.filename} moved to database')
                            zip_files[file_info.filename] = 'processed'
                        except Exception as err:
                            # Best-effort per file: record the failure and keep going.
                            print(str(err))
                            gr.Warning(f'{file_info.filename} not processed')
                            zip_files[file_info.filename] = 'not processed'
            return json.dumps(zip_files)
    else:
        raise gr.Error('No file selected')
def preprocess_skills(skill_text):
    """Normalize skills into a lowercase token list.

    Lists are stripped/lowercased element-wise; anything else is stringified
    and split on commas, slashes, pipes, ampersands, hyphens, or whitespace.
    """
    if isinstance(skill_text, list):
        return [item.strip().lower() for item in skill_text]
    tokens = re.split(r"[,/|&\-\s]+", str(skill_text))
    return [token.strip().lower() for token in tokens if token.strip()]
def get_filtered_rows(exp, skills_description):
    """Filter resumes.csv by minimum experience and/or required skills.

    When skills are given, a candidate must match at least 40% of the
    requested skill tokens. When experience is given, rows with
    Experience >= exp are kept (rows with missing Experience are retained).
    Results are sorted best skill match first, then by experience.

    Args:
        exp: Minimum years of experience (string or int; falsy -> no filter).
        skills_description: Free-text / comma-separated list of skills.

    Returns:
        pd.DataFrame: Matching rows, or a one-row Message/Error frame.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []
        # Load CSV instead of DB
        df = pd.read_csv("./resumes.csv")
        # Return message if no input provided
        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])
        if user_skills:
            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills this candidate covers.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills) if user_skills else 0
            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match
        # Now apply experience filter only if provided
        if exp_filter is not None:
            # Robustness fix: Experience can deserialize as text ('3 years',
            # 'null'); coerce to numeric so the >= comparison cannot raise.
            experience = pd.to_numeric(df["Experience"], errors="coerce")
            df = df[experience.isna() | (experience >= exp_filter)]
        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])
        df = df.sort_values(
            by=["match_ratio" if "match_ratio" in df.columns else "Experience", "Experience"],
            ascending=[False, False],
        )
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])