File size: 12,614 Bytes
074b364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3013983
 
074b364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3013983
 
 
 
 
 
 
 
 
 
074b364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3013983
 
074b364
3013983
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
074b364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb5cfea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
import gradio as gr
import os, json
import cv2, pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
# Import required libraries
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_core.output_parsers import JsonOutputParser
# from langchain_text_splitters import CharacterTextSplitter
import chromadb
import shutil
import json
import pandas as pd
from langchain_core.documents import Document
from pathlib import Path
import regex as re
import tempfile
import zipfile
from openai import OpenAI

# Prompt template for the extraction LLM call. {text} receives the OCR'd
# (and DeepSeek-corrected) resume text; {format_instructions} is generated
# by the StructuredOutputParser from the response schemas below.
review_template = """
You are extracting structured information from the given text. 
ONLY use the information explicitly available in the text provided. 
If a specific field cannot be extracted from the input text, respond with 'null'.

Input text:
{text}

Provide outputs in the following format:
{format_instructions}
"""


# Projects Experience: Give me summary of projects in the format of dictionary format  as keys as Project name, Time period he did the project, and summary of project in bullet points.
# Projects Experience
# ResponseSchema definitions: one per field the LLM must extract from a resume.
# NOTE(review): 'dictionary' and 'integer' are not among the type strings shown
# in LangChain's ResponseSchema examples ('string'/'list') — confirm the parser
# honors them, or the format instructions may be weaker than intended.
name_schema = ResponseSchema(name="Name",
                         description="Name of the person in resume text applying for job? Answer noun as string or unknown.")
job_role_schema = ResponseSchema(name="Job_role",
                                    description="What is the job role the person is applying for?")
skills_schema = ResponseSchema(name="Skills",
                                    description="All the skill in resume text  and output them as a comma separated Python list.",type='list')
exp_schema = ResponseSchema(name="Experience",
                                    description="How much experience in years he has in resume text which is a number",type='integer')
info_schema = ResponseSchema(name="Personal Information",
                                    description="Extract the information of the person like Phone number, Address, City, Gender, Gmail and extract and save it in dictionary as key and values.",type='dictionary')
prof_schema = ResponseSchema(name="Profile",
                                    description="What is his profile he is mentioned in text for the job application and summarize it.",type='string')
linkedin_schema = ResponseSchema(name="Linkedin",
                                    description="Linkedin link if available else unknown",type='string')
# proj_schema = ResponseSchema(name="Projects Experience",
#                                     description="Give me summary of projects in the format of dictionary format  as keys as Project name, Time period he did the project, and summary of project in bullet points.",type='dictionary')


# Destination CSV for parsed resume rows; appended to by processing().
csv_path='./resumes.csv'

# Initialize the DeepSeek client
# SECURITY: the API key below is hard-coded in source and is therefore
# compromised — rotate it and load it from an environment variable or a
# secrets manager instead of committing it.
client = OpenAI(
    api_key='sk-02f34bd0ea4849e8a4232bc656e28727',  # Replace with your DeepSeek API key
    base_url="https://api.deepseek.com/v1",
)


# pytesseract.pytesseract.tesseract_cmd = r'C:\Users\ashasrikar.paritala\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )



# NOTE(review): mid-file imports — consider moving to the top-of-file import block.
from langchain_deepseek import ChatDeepSeek
import os
# SECURITY: same hard-coded API key duplicated here; rotate the leaked key
# and read it from the environment instead of assigning it in source.
os.environ["DEEPSEEK_API_KEY"]='sk-02f34bd0ea4849e8a4232bc656e28727'
# LangChain chat model used by get_completion() for the structured extraction.
# temperature=0 for deterministic extraction output.
llm = ChatDeepSeek(
    model="deepseek-chat",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

# embedding = OllamaEmbeddings(model="deepseek-chat")

# Prompt template compiled once at import time; reused for every resume.
chat_prompt = ChatPromptTemplate.from_template(review_template)
# chat_prompt_message = chat_prompt.format_messages(text=pdf_text, format_instructions=format_instructions)

# Specify the response schema all the attribute we are expecting
response_schemas = [name_schema, 
                    job_role_schema,
                    skills_schema,
                   exp_schema,
                   info_schema,
                   prof_schema,
                   linkedin_schema]

# proj_schema

# Parser built from the schemas above; its format instructions are injected
# into {format_instructions} of review_template so the LLM emits matching JSON.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

def img_extract(img_file):
    """OCR a single image file with Tesseract and return the recognized text."""
    loaded = cv2.imread(img_file)
    return pytesseract.image_to_string(loaded)

def pdf_pages_image(page, matrix_scale=3):
    """Rasterize a PyMuPDF page into a PIL RGB image, upscaled by *matrix_scale*."""
    zoom = fitz.Matrix(matrix_scale, matrix_scale)
    pix = page.get_pixmap(matrix=zoom)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

def pdf_extract(pdf_file):
    """OCR every page of a PDF and return the concatenated text.

    Pages are rasterized in parallel (scale factor 5 for better OCR accuracy)
    and then run through Tesseract.

    Fixes vs. original: the fitz Document is now closed via a context manager
    (it was leaked), and the page texts are joined with str.join instead of
    quadratic += concatenation.
    """
    with fitz.open(pdf_file) as pdf_doc:
        num_pages = pdf_doc.page_count
        print(num_pages)
        with ThreadPoolExecutor(max_workers=10) as executor:
            # Rasterization releases the GIL inside native code, so threads help.
            images = list(executor.map(lambda page: pdf_pages_image(pdf_doc[page], 5), range(num_pages)))
        print(len(images))
        return ''.join(pytesseract.image_to_string(image) for image in images)

def zip_extract(file):
    """List the member paths of a zip archive.

    Fixes a NameError in the original, which referenced an undefined
    ``zip_path`` instead of its ``file`` parameter (so every call crashed).
    Also returns the collected member names (as ``pathlib.Path`` objects)
    instead of implicitly returning None.

    Args:
        file: path or file-like object of the zip archive.

    Returns:
        list[Path]: one entry per archive member, in archive order.
    """
    members = []
    # Open the zip file
    with zipfile.ZipFile(file, 'r') as zip_ref:
        # Get list of all file names in the zip
        for file_info in zip_ref.infolist():
            print(f"File name: {file_info.filename}")
            file_path = Path(file_info.filename)
            print(file_path)
            members.append(file_path)
    return members

def parsing(text):
    """Run the extraction prompt over *text* and return the parsed dict.

    Formats the chat prompt with the resume text plus the schema-derived
    format instructions, calls the LLM via get_completion(), then parses
    the JSON payload out of the model's response.

    Fixes vs. original: removed dead code — an unused mid-function import
    of OutputFixingParser and two commented-out parse paths.

    Args:
        text: corrected OCR text of one resume.

    Returns:
        dict: field name -> extracted value, as emitted by the LLM.
    """
    chat_prompt_message = chat_prompt.format_messages(text=text, format_instructions=format_instructions)
    # Calling the LLM
    response = get_completion(chat_prompt_message)
    print(response)
    # JsonOutputParser tolerates markdown-fenced JSON in the model output,
    # which StructuredOutputParser.parse sometimes chokes on.
    json_parser = JsonOutputParser()
    output_dict = json_parser.invoke(response)
    print(output_dict)
    print(type(output_dict))
    return output_dict
    


def file_extract(file, extension):
    """Dispatch OCR text extraction on file extension.

    Supports images (.png/.jpg/.jpeg) and PDFs; any other extension
    yields an empty string.
    """
    ext = extension.lower()
    if ext in ('.png', '.jpg', '.jpeg'):
        extracted = img_extract(file)
    elif ext == '.pdf':
        print('pdf')
        extracted = pdf_extract(file)
    else:
        extracted = ''
    print(extracted)
    return extracted

def get_completion(prompt):
    """Send *prompt* to the LangChain LLM and return the response text."""
    return llm.invoke(prompt).content


def correct_OCR(text):
    """Ask DeepSeek to fix OCR spelling/layout defects in *text*.

    Best-effort: any API failure is logged and the uncorrected text is
    returned unchanged.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that checks and corrects spelling mistakes in defective OCR text. Understanding layout of text and please reorganize the text into  respective headings to respective text"},
        {"role": "user", "content": f"Content:\n{text}"},
    ]
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Fall back to the raw OCR output rather than failing the pipeline.
        print(f"Error while correcting OCR: {e}")
        return text

def processing(filepath):
    """Extract, correct, and parse one resume file, then append it to the CSV.

    Pipeline: OCR (file_extract) -> DeepSeek cleanup (correct_OCR) ->
    structured extraction (parsing) -> flatten -> dedupe-by-Name -> CSV.

    Fixes vs. original: missing columns are now created BEFORE the 'Skills'
    list-to-string conversion — previously df['Skills'] raised KeyError
    whenever the LLM omitted that field.

    Args:
        filepath: path to a single resume file.

    Returns:
        str: pretty-printed JSON of the parsed fields (also returned when
        the row is skipped as a duplicate).
    """
    filename = os.path.basename(filepath)
    extension = os.path.splitext(filepath)[1]
    print(extension)

    extracted_text = file_extract(filepath, extension)
    corrected_text = correct_OCR(extracted_text)

    parsed_dict = parsing(corrected_text)  # dict
    json_output = json.dumps(parsed_dict, indent=2)  # For display purposes only

    # Flatten the nested JSON and convert to DataFrame
    df = pd.json_normalize(parsed_dict)

    # Add the filename to the DataFrame
    df['filename'] = filename

    # Define consistent column order and create any column the parser did
    # not return, so every append matches the CSV header written first.
    desired_columns = [
        "Name", "Job_role", "Experience", "Skills", "Profile", "Linkedin",
        "Personal Information.Phone", "Personal Information.Gmail",
        "Personal Information.Address", "Personal Information.City",
        "filename"
    ]
    for col in desired_columns:
        if col not in df.columns:
            df[col] = None
    df = df[desired_columns]

    # Ensure 'Skills' column is in string format for CSV
    df['Skills'] = df['Skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

    # Append to CSV only if Name is unique
    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        if df['Name'].iloc[0] in existing_df['Name'].values:
            print(f"Duplicate entry found for Name: {df['Name'].iloc[0]}. Skipping...")
            return json_output
        # Header already written on first creation; append data rows only.
        df.to_csv(csv_path, mode="a", index=False, header=False)
    else:
        df.to_csv(csv_path, mode="w", index=False, header=True)
    return json_output
        

def resume_parser(filepath):
    """Gradio handler: parse a single resume (.pdf/.docx) or a .zip batch.

    Fixes vs. original: the extension comparison was ``ext=='docx'`` (missing
    the leading dot), so .docx uploads silently fell through and returned None.

    Args:
        filepath: path of the uploaded file, or falsy when nothing selected.

    Returns:
        str | None: parsed-resume JSON for single files; a JSON map of
        filename -> 'processed'/'not processed' for zip archives; None for
        unsupported extensions (original behavior preserved).

    Raises:
        gr.Error: when no file was selected.
    """
    print(filepath)
    if not filepath:
        raise gr.Error('No file selected')

    ext = os.path.splitext(filepath)[1]
    print(ext)  # Output: .jpg
    if ext in ('.pdf', '.docx'):
        print(filepath)
        json_output = processing(filepath)
        # # Push DataFrame into database as 'resume_data' table
        gr.Info('Data moved to database')
        return json_output
    elif ext == '.zip':
        zip_files = {}

        # Create temporary directory (auto-cleaned on exit)
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Extracting to temp dir: {temp_dir}")

            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

                for file_info in zip_ref.infolist():
                    if file_info.is_dir():
                        continue

                    try:
                        extracted_file_path = os.path.join(temp_dir, file_info.filename)
                        print(f"Processing: {extracted_file_path}")

                        processing(extracted_file_path)
                        gr.Info(f'{file_info.filename} moved to database')
                        zip_files[file_info.filename] = 'processed'

                    except Exception as err:
                        # One bad file must not abort the rest of the batch.
                        print(str(err))
                        gr.Warning(f'{file_info.filename} not processed')
                        zip_files[file_info.filename] = 'not processed'

        return json.dumps(zip_files)


def preprocess_skills(skill_text):
    """Normalize a skills value into a list of lowercase, stripped tokens.

    Lists are lowercased element-wise; anything else is stringified and
    split on commas, slashes, pipes, ampersands, hyphens, and whitespace.
    """
    if isinstance(skill_text, list):
        return [item.strip().lower() for item in skill_text]
    tokens = re.split(r"[,/|&\-\s]+", str(skill_text))
    return [tok.strip().lower() for tok in tokens if tok.strip()]

def get_filtered_rows(exp, skills_description):
    """Filter resumes.csv by minimum experience and/or a skills query.

    Candidates matching >= 40% of the requested skills are kept, then the
    experience floor is applied (rows with missing Experience pass), and
    results are sorted best-match-first.

    Fixes vs. original: the "no input" guard now runs BEFORE reading the
    CSV, so an empty query no longer surfaces a FileNotFoundError row when
    resumes.csv does not exist yet.

    Args:
        exp: minimum years of experience ('' / None / 0 disables the filter).
        skills_description: free-text skills query ('' / None disables it).

    Returns:
        pd.DataFrame: matching rows, or a one-row frame with a 'Message'
        or 'Error' column when there is nothing to show.
    """
    try:
        exp_filter = int(exp) if exp else None
        user_skills = preprocess_skills(skills_description) if skills_description else []

        # Return message if no input provided — checked before touching the CSV.
        if not user_skills and exp_filter is None:
            return pd.DataFrame([{"Message": "Please enter Experience and/or Skills to filter."}])

        # Load CSV instead of DB
        df = pd.read_csv("./resumes.csv")

        if user_skills:
            def skill_match_ratio(candidate_skills):
                # Fraction of the requested skills present on this candidate.
                candidate_list = preprocess_skills(candidate_skills)
                matches = len(set(user_skills) & set(candidate_list))
                return matches / len(user_skills)

            df["match_ratio"] = df["Skills"].apply(skill_match_ratio)
            df = df[df["match_ratio"] >= 0.4]  # prioritize skills match

        # Now apply experience filter only if provided (NaN experience passes).
        if exp_filter is not None:
            df = df[(df["Experience"].isna()) | (df["Experience"] >= exp_filter)]

        if df.empty:
            return pd.DataFrame([{"Message": "No matching candidates found."}])

        primary_key = "match_ratio" if "match_ratio" in df.columns else "Experience"
        df = df.sort_values(by=[primary_key, "Experience"], ascending=[False, False])
        return df.drop(columns=["match_ratio"], errors="ignore")

    except Exception as e:
        return pd.DataFrame([{"Error": str(e)}])