# NOTE: lines removed here were residue from a Hugging Face Spaces page scrape
# ("Spaces:", "Sleeping", "File size: 12,614 Bytes") — not Python source.
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
import gradio as gr
import os, json
import cv2, pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
# Import required libraries
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_core.output_parsers import JsonOutputParser
# from langchain_text_splitters import CharacterTextSplitter
import chromadb
import shutil
import json
import pandas as pd
from langchain_core.documents import Document
from pathlib import Path
import regex as re
import tempfile
import zipfile
from openai import OpenAI
# Prompt template for structured extraction: {text} receives the OCR'd resume
# and {format_instructions} is filled in from the structured output parser below.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.
Input text:
{text}
Provide outputs in the following format:
{format_instructions}
"""
# Projects Experience: Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.
# Projects Experience

# One ResponseSchema per structured field the model must return; the 'type'
# hints steer the parser's format instructions.
name_schema = ResponseSchema(name="Name",
                             description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(name="Job_role",
                                 description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(name="Skills",
                               description="All the skill in resume text and output them as a comma separated Python list.", type='list')
exp_schema = ResponseSchema(name="Experience",
                            description="How much experience in years he has in resume text which is a number", type='integer')
info_schema = ResponseSchema(name="Personal Information",
                             description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.", type='dictionary')
prof_schema = ResponseSchema(name="Profile",
                             description="What is his profile he is mentioned in text for the job application and summarize it.", type='string')
linkedin_schema = ResponseSchema(name="Linkedin",
                                 description="Linkedin link if available else unknown", type='string')
# proj_schema = ResponseSchema(name="Projects Experience",
#                              description="Give me summary of projects in the format of dictionary format as keys as Project name, Time period he did the project, and summary of project in bullet points.",type='dictionary')
# CSV file where parsed resume rows are accumulated.
csv_path = './resumes.csv'

# DeepSeek exposes an OpenAI-compatible endpoint.
# SECURITY FIX: the previous revision committed a live API key in source;
# the key must come from the environment (and the leaked key rotated).
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
    base_url="https://api.deepseek.com/v1",
)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
# encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
from langchain_deepseek import ChatDeepSeek
import os

# SECURITY FIX: do not hard-code API keys. ChatDeepSeek reads
# DEEPSEEK_API_KEY from the environment — set it before launching the app.
# (A live key was previously written into os.environ here; rotate it.)
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,      # deterministic output for extraction
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)
# embedding = OllamaEmbeddings(model="deepseek-chat")
# Chat prompt built from the extraction template above.
chat_prompt = ChatPromptTemplate.from_template(review_template)
# chat_prompt_message = chat_prompt.format_messages(text=pdf_text, format_instructions=format_instructions)
# Specify the response schema all the attribute we are expecting
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]
# proj_schema
# Structured parser assembled from the schemas; its format instructions are
# injected into every prompt so the model replies in a parseable format.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
def img_extract(img_file):
    """Run Tesseract OCR on an image file and return the recognized text."""
    loaded = cv2.imread(img_file)
    return pytesseract.image_to_string(loaded)
def pdf_pages_image(page, matrix_scale=3):
    """Render one PyMuPDF page to a PIL RGB image, upscaled by matrix_scale."""
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered to images concurrently, then OCR'd sequentially.
    Fixes: the document handle is now closed via a context manager (the
    original leaked it), and page texts are joined instead of built with
    quadratic string concatenation.
    """
    with fitz.open(pdf_file) as pdf_doc:
        num_pages = pdf_doc.page_count
        print(num_pages)
        with ThreadPoolExecutor(max_workers=10) as executor:
            images = list(
                executor.map(lambda i: pdf_pages_image(pdf_doc[i], 5), range(num_pages))
            )
    print(len(images))
    return "".join(pytesseract.image_to_string(image) for image in images)
def zip_extract(file):
    """Print the entry names of a zip archive.

    Bug fix: the original referenced an undefined name ``zip_path``; the
    parameter ``file`` (path or file-like object) is what callers pass.

    NOTE(review): this helper only lists members and returns None — member
    text extraction is not implemented yet.
    """
    with zipfile.ZipFile(file, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)
def parsing(text):
    """Extract structured resume fields from text via the LLM.

    Formats the shared chat prompt with the parser's format instructions,
    calls the model, and parses the reply into a dict.
    Fix: removed an unused in-function import (OutputFixingParser) and the
    dead commented-out parsing paths.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions
    )
    # Call the LLM with the fully formatted messages.
    response = get_completion(chat_prompt_message)
    print(response)
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict
def file_extract(file, extension):
    """Dispatch OCR text extraction based on the file extension.

    Images go through Tesseract directly; PDFs are rendered page-by-page.
    Any other extension yields an empty string.
    """
    text = ''
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        text = img_extract(file)
    elif ext == '.pdf':
        print('pdf')
        text = pdf_extract(file)
    print(text)
    return text
def get_completion(prompt):
    """Send prompt messages to the configured LLM and return the reply text."""
    return llm.invoke(prompt).content
def correct_OCR(text):
    """Ask the DeepSeek chat API to clean up defective OCR output.

    On any failure the original text is returned unchanged (best-effort).
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"},
        {"role": "user", "content": f"Content:\n{text}"},
    ]
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
def processing(filepath):
    """Extract, correct, and parse one resume file, then append it to the CSV.

    Returns the parsed fields as a pretty-printed JSON string (display only).
    Rows whose Name already exists in the CSV are skipped.
    Fix: guard the 'Skills' column access — the model may return null for
    Skills, in which case json_normalize produces no such column and the
    original raised KeyError before the missing-column backfill ran.
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)
    parsed_dict = parsing(corrected_text)  # dict of extracted fields
    json_output = json.dumps(parsed_dict, indent=2)  # for display purposes only

    # Flatten nested keys ("Personal Information.Phone", ...) into columns.
    df = pd.json_normalize(parsed_dict)

    # Ensure 'Skills' is stored as a comma-separated string in the CSV.
    if 'Skills' in df.columns:
        df['Skills'] = df['Skills'].apply(
            lambda x: ', '.join(x) if isinstance(x, list) else x
        )

    # Record the source file name alongside the parsed data.
    df['filename'] = filename

    # Keep a fixed column order so appended rows line up with the header.
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # Append to CSV only if Name is not already present.
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
def resume_parser(filepath):
    """Gradio handler: parse a resume file (.pdf/.docx/.zip) into the CSV.

    For a zip archive every member file is processed individually and a JSON
    status map (filename -> 'processed'/'not processed') is returned.
    Raises gr.Error when no file was selected.
    Fixes: the original compared against 'docx' without the leading dot (so
    .docx files never matched), and the comparison was case-sensitive.
    """
    print(filepath)
    if not filepath:
        raise gr.Error('No file selected')

    ext = os.path.splitext(filepath)[1].lower()
    print(ext)  # e.g. '.pdf'

    if ext in ('.pdf', '.docx'):
        print(filepath)
        json_output = processing(filepath)
        gr.Info('Data moved to database')
        return json_output

    if ext == '.zip':
        zip_files = {}
        # TemporaryDirectory cleans itself up when the block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Extracting to temp dir: {temp_dir}")
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
                for file_info in zip_ref.infolist():
                    if file_info.is_dir():
                        continue
                    try:
                        extracted_file_path = os.path.join(temp_dir, file_info.filename)
                        print(f"Processing: {extracted_file_path}")
                        processing(extracted_file_path)
                        gr.Info(f'{file_info.filename} moved to database')
                        zip_files[file_info.filename] = 'processed'
                    except Exception as err:
                        print(str(err))
                        gr.Warning(f'{file_info.filename} not processed')
                        zip_files[file_info.filename] = 'not processed'
        return json.dumps(zip_files)
def preprocess_skills(skill_text):
    """Normalize a skills value into a list of lowercase tokens.

    A list input is lowercased element-wise; anything else is stringified
    and split on commas, slashes, pipes, ampersands, hyphens and whitespace,
    dropping empty tokens.
    """
    if isinstance(skill_text, list):
        return [item.strip().lower() for item in skill_text]
    raw_tokens = re.split(r"[,/|&\-\s]+", str(skill_text))
    return [tok.strip().lower() for tok in raw_tokens if tok.strip()]
def get_filtered_rows(exp, skills_description):
    """Filter stored resumes by minimum experience and/or a skills query.

    Candidates matching at least 40% of the requested skills are kept; rows
    with unknown experience are retained when an experience filter is given.
    Returns the matching DataFrame, or a single-row Message/Error frame.
    Fix: read from the shared ``csv_path`` constant instead of re-hard-coding
    "./resumes.csv" (keeps read and write paths consistent).
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        df = pd.read_csv(csv_path)

        # Nothing to filter on — prompt the user instead of returning all rows.
        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])

        if user_skills:
            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills present for this candidate.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills) if user_skills else 0

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # require a 40% skills overlap

        # Apply the experience filter only if provided; keep rows whose
        # experience is unknown rather than dropping them.
        if exp_filter is not None:
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        df = df.sort_values(
            by=["match_ratio" if "match_ratio" in df.columns else "Experience", "Experience"],
            ascending=[False, False],
        )
        return df.drop(columns=["match_ratio"], errors="ignore")
    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])