# Update main.py (PPSA, commit 3013983 verified) — web-page header artifact
# converted to a comment so the module parses.
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
import gradio as gr
import os, json
import cv2, pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
# Import required libraries
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_core.output_parsers import JsonOutputParser
# from langchain_text_splitters import CharacterTextSplitter
import chromadb
import shutil
import json
import pandas as pd
from langchain_core.documents import Document
from pathlib import Path
import regex as re
import tempfile
import zipfile
from openai import OpenAI
# Prompt template for the extraction LLM call: {text} receives the OCR-corrected
# resume text and {format_instructions} receives the JSON-schema instructions
# produced by the StructuredOutputParser defined later in this module.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.
Input text:
{text}
Provide outputs in the following format:
{format_instructions}
"""
# Projects Experience: Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.
# Projects Experience
# --- Response schemas: one per field the LLM must extract from a resume text. ---
name_schema = ResponseSchema(name="Name",
                             description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(name="Job_role",
                                 description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(name="Skills",
                               description="All the skill in resume text and output them as a comma separated Python list.", type='list')
exp_schema = ResponseSchema(name="Experience",
                            description="How much experience in years he has in resume text which is a number", type='integer')
info_schema = ResponseSchema(name="Personal Information",
                             description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.", type='dictionary')
prof_schema = ResponseSchema(name="Profile",
                             description="What is his profile he is mentioned in text for the job application and summarize it.", type='string')
linkedin_schema = ResponseSchema(name="Linkedin",
                                 description="Linkedin link if available else unknown", type='string')
# proj_schema = ResponseSchema(name="Projects Experience",
#                              description="Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.",type='dictionary')
# CSV file that accumulates one row per parsed resume (shared by processing()
# and get_filtered_rows()).
csv_path = './resumes.csv'
# Initialize the DeepSeek client (OpenAI-compatible API surface), used by
# correct_OCR() for OCR cleanup.
# SECURITY NOTE(review): the API key is hard-coded in source and therefore
# leaked to anyone with repo access — move it to an environment variable or
# secret store and rotate the key.
client = OpenAI(
    api_key='sk-02f34bd0ea4849e8a4232bc656e28727',  # Replace with your DeepSeek API key
    base_url="https://api.deepseek.com/v1",
)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
# NOTE(review): mid-file imports — conventionally these belong at the top of
# the module with the other imports.
from langchain_deepseek import ChatDeepSeek
import os
# SECURITY NOTE(review): hard-coded API key (same key as the OpenAI client
# above) — move to a secret store and rotate.
os.environ["DEEPSEEK_API_KEY"] = 'sk-02f34bd0ea4849e8a4232bc656e28727'
# LLM used for structured extraction; temperature 0 for deterministic output.
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)
# embedding = OllamaEmbeddings(model="deepseek-chat")
# Prompt object built from the extraction template defined above.
chat_prompt = ChatPromptTemplate.from_template(review_template)
# chat_prompt_message = chat_prompt.format_messages(text=pdf_text, format_instructions=format_instructions)
# Specify the response schema all the attribute we are expecting
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]
# proj_schema
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
# JSON-format instructions injected into the prompt's {format_instructions} slot.
format_instructions = output_parser.get_format_instructions()
def img_extract(img_file):
    """Run Tesseract OCR on an image file and return the recognized text."""
    loaded = cv2.imread(img_file)
    return pytesseract.image_to_string(loaded)
def pdf_pages_image(page, matrix_scale=3):
    """Render a PyMuPDF page to a PIL RGB image, upscaled by ``matrix_scale``."""
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered to images concurrently (rendering is the slow part),
    then OCR'd sequentially with Tesseract, preserving page order.

    Args:
        pdf_file: Path to the PDF file.

    Returns:
        str: OCR text of all pages concatenated in page order.
    """
    pdf_text = ''
    # Fix: use a context manager so the fitz document is always closed
    # (the original leaked the open document handle).
    with fitz.open(pdf_file) as pdf_doc:
        num_pages = pdf_doc.page_count
        print(num_pages)
        # Render pages in parallel at 5x scale for better OCR accuracy.
        with ThreadPoolExecutor(max_workers=10) as executor:
            images = list(executor.map(lambda page: pdf_pages_image(pdf_doc[page], 5), range(num_pages)))
        print(len(images))
        for image in images:
            pdf_text += pytesseract.image_to_string(image)
    return pdf_text
def zip_extract(file):
    """Print the name and path of every member of a zip archive.

    Bug fix: the original referenced an undefined global ``zip_path`` and so
    raised NameError on every call; it now opens the ``file`` argument. The
    unused ``text`` accumulator was also dropped.

    Args:
        file: Path to a .zip archive.
    """
    # Open the zip file
    with zipfile.ZipFile(file, 'r') as zip_ref:
        # Get list of all file names in the zip
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)
def parsing(text):
    """Ask the LLM to extract structured resume fields from free text.

    Formats the module-level chat prompt with the resume text plus the
    structured-output format instructions, calls the LLM via
    ``get_completion``, and parses the response into a dict with
    ``JsonOutputParser``.

    Args:
        text: OCR-corrected resume text.

    Returns:
        dict: Extracted fields (Name, Job_role, Skills, Experience, ...).
    """
    chat_prompt_message = chat_prompt.format_messages(text=text, format_instructions=format_instructions)
    # Calling the LLM
    response = get_completion(chat_prompt_message)
    print(response)
    # Fix: removed the unused function-level OutputFixingParser import left
    # over from a commented-out experiment.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict
def file_extract(file, extension):
    """Dispatch OCR extraction by file extension.

    Images go through Tesseract directly; PDFs are rendered page-by-page
    first. Unsupported extensions yield an empty string.
    """
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        return img_extract(file)
    if ext == '.pdf':
        print('pdf')
        extracted = pdf_extract(file)
        print(extracted)
        return extracted
    return ''
def get_completion(prompt):
    """Invoke the module-level LLM on ``prompt`` and return its text content."""
    return llm.invoke(prompt).content
def correct_OCR(text):
    """Ask DeepSeek to clean up OCR output; fall back to the raw text on error.

    Sends the OCR text to the chat model with instructions to fix spelling
    and regroup content under its headings. Any API failure is logged and the
    original text is returned unchanged (best-effort correction).
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"},
        {"role": "user", "content": f"Content:\n{text}"},
    ]
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
def processing(filepath):
    """Extract, correct, and parse one resume file, then append it to the CSV.

    Pipeline: OCR text extraction -> LLM OCR cleanup -> structured field
    parsing -> flatten to a one-row DataFrame -> de-duplicate by Name ->
    append to the shared ``csv_path`` file.

    Args:
        filepath: Path to a resume file (.pdf / image).

    Returns:
        str: Pretty-printed JSON of the parsed fields (for UI display).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)
    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)  # dict
    json_output = json.dumps(parsed_dict, indent=2)  # For display purposes only
    # Flatten the nested JSON and convert to DataFrame
    df = pd.json_normalize(parsed_dict)
    # Robustness fix: the LLM may omit 'Skills' entirely, in which case
    # json_normalize produces no such column — guard before stringifying
    # the list so we don't raise KeyError.
    if 'Skills' in df.columns:
        # Ensure 'Skills' column is in string format for CSV
        df['Skills'] = df['Skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    # Add the filename to the DataFrame
    df['filename'] = filename
    # Define consistent column order so appended rows line up with the header
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]
    # Append to CSV only if Name is unique
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
def resume_parser(filepath):
    """Gradio handler: parse a single resume or a zip archive of resumes.

    Args:
        filepath: Path to an uploaded .pdf/.docx resume or a .zip archive.

    Returns:
        str: JSON of the parsed fields for a single file, or a JSON status map
        (filename -> 'processed'/'not processed') for a zip archive.

    Raises:
        gr.Error: If no file was selected.
    """
    print(filepath)
    if filepath:
        # Lower-case so '.PDF' / '.Docx' uploads are recognized too.
        ext = os.path.splitext(filepath)[1].lower()
        print(ext)  # Output: .jpg
        # Bug fix: the original compared against 'docx' without the leading
        # dot, which os.path.splitext never produces, so .docx uploads were
        # silently rejected.
        if ext in ('.pdf', '.docx'):
            print(filepath)
            json_output = processing(filepath)
            # # Push DataFrame into database as 'resume_data' table
            gr.Info('Data moved to database')
            return json_output
        elif ext == '.zip':
            zip_files = {}
            # Create temporary directory; it auto-cleans on exit.
            with tempfile.TemporaryDirectory() as temp_dir:
                print(f"Extracting to temp dir: {temp_dir}")
                with zipfile.ZipFile(filepath, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)
                    for file_info in zip_ref.infolist():
                        if file_info.is_dir():
                            continue
                        try:
                            extracted_file_path = os.path.join(temp_dir, file_info.filename)
                            print(f"Processing: {extracted_file_path}")
                            json_output = processing(extracted_file_path)
                            gr.Info(f'{file_info.filename} moved to database')
                            zip_files[file_info.filename] = 'processed'
                        except Exception as err:
                            # Best-effort per file: record the failure and keep going.
                            print(str(err))
                            gr.Warning(f'{file_info.filename} not processed')
                            zip_files[file_info.filename] = 'not processed'
            return json.dumps(zip_files)
    else:
        raise gr.Error('No file selected')
def preprocess_skills(skill_text):
    """Normalize skills into a lowercase token list.

    Lists are stripped/lowercased element-wise; anything else is stringified
    and split on commas, slashes, pipes, ampersands, hyphens, or whitespace.
    """
    if isinstance(skill_text, list):
        return [item.strip().lower() for item in skill_text]
    tokens = re.split(r"[,/|&\-\s]+", str(skill_text))
    return [token.strip().lower() for token in tokens if token.strip()]
def get_filtered_rows(exp, skills_description):
    """Filter resumes.csv by minimum experience and/or required skills.

    When skills are given, a candidate must match at least 40% of the
    requested skill tokens. When experience is given, rows with
    Experience >= exp are kept (rows with missing Experience are retained).
    Results are sorted best skill match first, then by experience.

    Args:
        exp: Minimum years of experience (string or int; falsy -> no filter).
        skills_description: Free-text / comma-separated list of skills.

    Returns:
        pd.DataFrame: Matching rows, or a one-row Message/Error frame.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []
        # Load CSV instead of DB
        df = pd.read_csv("./resumes.csv")
        # Return message if no input provided
        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])
        if user_skills:
            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills this candidate covers.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills) if user_skills else 0
            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match
        # Now apply experience filter only if provided
        if exp_filter is not None:
            # Robustness fix: Experience can deserialize as text ('3 years',
            # 'null'); coerce to numeric so the >= comparison cannot raise.
            experience = pd.to_numeric(df["Experience"], errors="coerce")
            df = df[experience.isna() | (experience >= exp_filter)]
        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])
        df = df.sort_values(
            by=["match_ratio" if "match_ratio" in df.columns else "Experience", "Experience"],
            ascending=[False, False],
        )
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])