PPSA committed on
Commit
cb5cfea
·
verified ·
1 Parent(s): 023406a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +30 -0
  2. main.py +288 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from main import resume_parser, get_filtered_rows

with gr.Blocks() as demo:
    with gr.Tabs():
        # Tab 1: upload one resume file (or pick a folder) and parse it to JSON.
        with gr.Tab("Resume Parser"):
            gr.Markdown("### 📂 Upload Resume or Folder")
            with gr.Row():
                file_input = gr.File(label="Upload Resume (PDF, DOCX)")
                folder_input = gr.FileExplorer(label="Select Folder")
            parse_btn = gr.Button("Parse")
            output_json = gr.Code(label="Parsed JSON", language="json")

            parse_btn.click(fn=resume_parser,
                            inputs=[file_input, folder_input],
                            outputs=output_json)

        # Tab 2: filter the stored candidates by experience and/or skills.
        with gr.Tab("Resume Filter"):
            gr.Markdown("### 🔍 Resume Filter by Experience and Skills")
            with gr.Row():
                exp_input = gr.Number(label="Minimum Experience (Years)")
                skills_input = gr.Textbox(label="Required Skills (comma-separated)",
                                          placeholder="e.g. Python, SQL, AWS")

            submit_btn = gr.Button("Filter Resumes")
            output = gr.Dataframe(label="Matching Candidates")

            submit_btn.click(fn=get_filtered_rows,
                             inputs=[exp_input, skills_input],
                             outputs=output)

if __name__ == "__main__":
    demo.launch(inbrowser=True)
main.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---------------------------------------------------------------------------
# Imports (kept from the original file; several are currently unused here but
# are preserved in case other modules rely on re-exports).
# ---------------------------------------------------------------------------
import os
import json
import shutil
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import cv2
import pytesseract
import fitz  # PyMuPDF
import chromadb
import pandas as pd
import regex as re
import gradio as gr
from PIL import Image
from openai import OpenAI
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.vectorstores.chroma import Chroma  # currently unused
from langchain.text_splitter import RecursiveCharacterTextSplitter  # currently unused
from langchain_ollama import ChatOllama, OllamaEmbeddings, OllamaLLM  # currently unused
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.documents import Document  # currently unused
from langchain_deepseek import ChatDeepSeek

# Prompt used to pull structured fields out of OCR'd resume text.
review_template = """
You are extracting structured information from the given text.
ONLY use the information explicitly available in the text provided.
If a specific field cannot be extracted from the input text, respond with 'null'.

Input text:
{text}

Provide outputs in the following format:
{format_instructions}
"""

# One ResponseSchema per field the LLM must return.
name_schema = ResponseSchema(
    name="Name",
    description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(
    name="Job_role",
    description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(
    name="Skills",
    description="All the skill in resume text and output them as a comma separated Python list.",
    type='list')
exp_schema = ResponseSchema(
    name="Experience",
    description="How much experience in years he has in resume text which is a number",
    type='integer')
info_schema = ResponseSchema(
    name="Personal Information",
    description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.",
    type='dictionary')
prof_schema = ResponseSchema(
    name="Profile",
    description="What is his profile he is mentioned in text for the job application and summarize it.",
    type='string')
linkedin_schema = ResponseSchema(
    name="Linkedin",
    description="Linkedin link if available else unknown",
    type='string')

# CSV file acting as the candidate "database".
csv_path = './resumes.csv'

# SECURITY FIX: the original source committed a live DeepSeek API key (twice).
# That key must be rotated; the credential is now read from the environment.
_deepseek_key = os.environ.get("DEEPSEEK_API_KEY", "")

# OpenAI-compatible client pointed at DeepSeek (used for OCR correction).
client = OpenAI(
    api_key=_deepseek_key,
    base_url="https://api.deepseek.com/v1",
)

# Tesseract binary location; overridable via TESSERACT_CMD so the module is
# not hard-wired to one developer's Windows install (original default kept).
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe')

# LangChain chat model used for structured extraction (reads
# DEEPSEEK_API_KEY from the environment itself).
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,       # deterministic extraction
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chat_prompt = ChatPromptTemplate.from_template(review_template)

# All schemas the structured output parser should enforce.
response_schemas = [name_schema,
                    job_role_schema,
                    skills_schema,
                    exp_schema,
                    info_schema,
                    prof_schema,
                    linkedin_schema]

output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
106
def img_extract(img_file):
    """OCR a single image file and return the recognized text.

    Parameters
    ----------
    img_file : str | Path
        Path to an image readable by OpenCV (.png/.jpg/.jpeg).

    Returns
    -------
    str
        Raw text produced by Tesseract.

    Raises
    ------
    ValueError
        If OpenCV cannot read the file.  The original code let `cv2.imread`
        return None silently, which surfaced later as a cryptic pytesseract
        TypeError instead of a clear message.
    """
    img = cv2.imread(str(img_file))
    if img is None:
        raise ValueError(f"Could not read image file: {img_file}")
    return pytesseract.image_to_string(img)
111
+
112
def pdf_pages_image(page, matrix_scale=3):
    """Render one PDF page to a PIL RGB image at the given zoom factor.

    A larger `matrix_scale` yields a higher-resolution render, which
    generally improves downstream OCR accuracy.
    """
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pixmap = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
116
+
117
def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rendered to images in parallel (rendering dominates the cost),
    then OCR'd sequentially with Tesseract.
    """
    pdf_doc = fitz.open(pdf_file)
    num_pages = pdf_doc.page_count
    # Render all pages concurrently at 5x zoom for better OCR accuracy.
    with ThreadPoolExecutor(max_workers=10) as executor:
        images = list(executor.map(lambda i: pdf_pages_image(pdf_doc[i], 5),
                                   range(num_pages)))
    # str.join avoids the quadratic cost of repeated `+=` concatenation and
    # replaces the original's debug prints.
    return "".join(pytesseract.image_to_string(image) for image in images)
128
+
129
def parsing(text):
    """Run the extraction prompt over `text` and return the parsed dict.

    Formats the chat prompt with the resume text plus the structured-output
    format instructions, invokes the LLM, and parses the reply as JSON.

    Changes from the original: removed the dead in-function import of
    OutputFixingParser (never used), the misleading debug print
    ("Expected delivery in days"), and prints that dumped the full parsed
    resume (PII) to the console.
    """
    chat_prompt_message = chat_prompt.format_messages(
        text=text, format_instructions=format_instructions)
    response = get_completion(chat_prompt_message)
    # The model is instructed (via format_instructions) to answer in JSON;
    # JsonOutputParser tolerates markdown fences around the payload.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    return output_dict
146
+
147
+
148
+
149
def file_extract(file, extension):
    """Dispatch text extraction by file extension.

    Supports images (.png/.jpg/.jpeg) via OCR and PDFs via page rendering +
    OCR.  NOTE(review): the UI advertises DOCX uploads, but there is no DOCX
    branch here — such files silently yield an empty string; confirm whether
    DOCX support is actually required.

    The original printed the entire extracted resume text (PII) to the
    console; those debug prints are removed.
    """
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        return img_extract(file)
    if ext == '.pdf':
        return pdf_extract(file)
    # Unsupported extension: preserve the original silent-empty behavior.
    return ''
159
+
160
def get_completion(prompt):
    """Send a formatted prompt to the DeepSeek chat model; return its text."""
    return llm.invoke(prompt).content
163
+
164
+
165
def correct_OCR(text):
    """Ask DeepSeek to clean up noisy OCR output.

    Sends the raw OCR text to the chat API with a correction/reorganization
    system prompt.  Falls back to the uncorrected text if the API call fails
    for any reason (best-effort behavior preserved from the original).
    """
    system_msg = "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into respective headings to respective text"
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": f"Content:\n{text}"},
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error while correcting OCR: {e}")
        return text
180
+
181
def processing(filepath):
    """Extract, correct, and parse one resume file; append a row to the CSV.

    Returns the parsed result as a pretty-printed JSON string for display.
    Duplicate candidates (same Name already present in the CSV) are skipped.
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)

    parsed_dict = parsing(corrected_text)
    json_output = json.dumps(parsed_dict, indent=2)  # for display only

    # Flatten nested keys ("Personal Information.*") into columns.
    df = pd.json_normalize(parsed_dict)

    # Attach the source filename to the row.
    df['filename'] = filename

    # Define a consistent column order; create any column the LLM omitted so
    # the CSV schema stays stable across appended rows.
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename",
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # BUG FIX: normalize Skills only after the column is guaranteed to exist —
    # the original applied this before the column-completion loop and raised
    # KeyError whenever the parser returned no "Skills" field.
    df['Skills'] = df['Skills'].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else x)

    # Append to the CSV only if this Name is not already stored.
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
223
+
224
+
225
def resume_parser(filepath, folder):
    """Gradio handler: parse one uploaded file, or every file in a folder.

    Returns the parsed JSON string(s) for display in the output component.

    Raises
    ------
    gr.Error
        When neither a file nor a folder was provided.
    """
    if filepath:
        json_output = processing(filepath)
        gr.Info('Data moved to database')
        return json_output
    elif folder:
        # NOTE(review): assumes FileExplorer returns the folder path first,
        # then the selected files — confirm against the Gradio version used.
        files = folder[1:]
        # BUG FIX: the original discarded every per-file result (it built an
        # unused `results` list and returned gr.Info(...)'s return value).
        results = [processing(Path(file_path)) for file_path in files]
        gr.Info('Files moved to database')
        return "\n".join(results)
    else:
        raise gr.Error('No file selected')
245
+
246
+
247
def preprocess_skills(skill_text):
    """Normalize a skills value into a list of lowercase tokens.

    Lists are lowercased element-wise.  Anything else is stringified and
    split on commas, slashes, pipes, ampersands, hyphens, or whitespace;
    empty fragments are dropped.
    """
    if isinstance(skill_text, list):
        return [entry.strip().lower() for entry in skill_text]
    normalized = []
    for token in re.split(r"[,/|&\-\s]+", str(skill_text)):
        cleaned = token.strip()
        if cleaned:
            normalized.append(cleaned.lower())
    return normalized
252
+
253
def get_filtered_rows(exp, skills_description):
    """Filter stored resumes by minimum experience and/or required skills.

    Parameters
    ----------
    exp : number | None
        Minimum years of experience.  BUG FIX: a 0-year minimum is now a
        valid filter — the original tested truthiness (`if exp`) and treated
        0 as "no filter at all".
    skills_description : str
        Comma-separated required skills; a candidate must match at least
        40% of them.

    Returns
    -------
    pandas.DataFrame
        Matching candidates sorted best-first, or a one-row Message/Error
        frame (e.g. when no input was given or the CSV is missing).
    """
    try:
        exp_filter = int(exp) if exp is not None and exp != "" else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        # Load the CSV "database" written by processing().
        df = pd.read_csv("./resumes.csv")

        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])

        if user_skills:
            def skill_match_ratio(candidate_skills):
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills)

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # require >= 40% skill overlap

        if exp_filter is not None:
            # Keep rows with unknown experience so they are not silently lost.
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        # BUG FIX: when no skills were given the original sorted by the
        # duplicate key list ["Experience", "Experience"].
        if "match_ratio" in df.columns:
            sort_keys = ["match_ratio", "Experience"]
        else:
            sort_keys = ["Experience"]
        df = df.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))
        return df.drop(columns=["match_ratio"], errors="ignore")

    except Exception as e:
        # Surface any failure (e.g. missing CSV) as a one-row DataFrame so
        # the Gradio Dataframe component can display it.
        return pd.DataFrame([{"Error": str(e)}])
requirements.txt ADDED
Binary file (66.5 kB). View file