Spaces:
Build error
Build error
| import streamlit as st | |
| from pymongo import MongoClient | |
| import fitz # PyMuPDF | |
| import ast | |
| import re | |
| from groq import Groq | |
| import concurrent.futures | |
| import pandas as pd | |
| import io | |
| import json | |
| import requests | |
import os

# --- Runtime configuration -------------------------------------------------
# Credentials are read from the environment instead of being committed with
# the source. Expected variables:
#   MONGODB_URI        - full MongoDB connection string (required for storage)
#   MONGODB_DB_NAME    - database name (defaults to 'akshansh_db')
#   GROQ_API_KEY       - API key passed to the Groq client
#   LLMSHERPA_API_URL  - document-parser endpoint (defaults to the URL below)
DB_NAME = os.environ.get('MONGODB_DB_NAME', 'akshansh_db')

# Pre-bind the handles so later code can test them instead of hitting a
# NameError when the connection setup below fails.
client = None
db = None
collection = None
try:
    mongo_uri = os.environ.get('MONGODB_URI')
    if not mongo_uri:
        raise RuntimeError("the MONGODB_URI environment variable is not set")
    client = MongoClient(mongo_uri)
    db = client[DB_NAME]
    collection = db['parsed_resume_streamlit']
    print("MongoDB connection established.")
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")

groq_api = os.environ.get('GROQ_API_KEY', '')

# The default endpoint is the one previously hard-coded here, minus the stray
# leading space that was part of the original string literal.
llmsherpa_api_url = os.environ.get(
    'LLMSHERPA_API_URL',
    "http://65.2.175.211:5010/api/parseDocument?renderFormat=all&applyOcr=yes",
).strip()
def sanitize_text(text):
    """Return *text* with a backslash inserted before every single quote.

    Only the apostrophe character is escaped; everything else is passed
    through unchanged.
    """
    quote = "'"
    escaped_quote = "\\" + quote
    return escaped_quote.join(text.split(quote))
def process_using_llm(text):
    """Ask the Groq-hosted model to turn raw resume text into a dict literal.

    Args:
        text: Plain text extracted from a resume.

    Returns:
        The raw text of the model's reply, which the prompt asks to be of the
        form ``extracted_content: {...}``, or ``None`` when ``text`` is empty
        or not a string, or when the request fails for any reason.
    """
    # Nothing to extract from: skip the API round-trip entirely.
    if not isinstance(text, str) or not text.strip():
        print("An error occurred in LLM part: no resume text to process")
        return None
    try:
        sanitized_text = sanitize_text(text)
        # Named llm_client so it does not shadow the module-level MongoDB client.
        llm_client = Groq(api_key=groq_api)
        prompt = f"""
1. Given is the text content of a resume, please extract information from it and output the result in a dictionary format which is defined below along with the expected data structure, strictly adhere to the dictionary format given below, if any field is not present leave it empty.
Note: 1. Do not skip any information and do not add any information which is not present in the input content.
2. In case of github urls, linkedin urls, email id, add only if the url is present else leave it empty.
3. For the work experience only the latest work experience is required that is the one which is presently being done or done at the last.
4. In the format of extracted_content, do not give any other things, like comments or anything
Input: {sanitized_text}
Expected output format: "extracted_content: {{
'name': 'String',
'email': 'String',
'phone': 'String',
'location': 'String',
'linkedin': 'String',
'github':'String',
'inter_personal_skills': [
'String'
],
'technical_skills': [
'String'
],
'soft_skills':[
'String'
],
'programming_languages':[
'String'
],
'linguistic_languages':[
'String'
],
'latest_work_experience':{{
'company': 'String',
'role': 'String',
'duration': 'String',
'work_location': 'String',
}},
'graduation_details':{{
'course':'String',
'institution':'String',
'course_type':'String',
'year_of_graduation':'String',
'percentage_or_cgpa':'String'
}},
'higher_secondary_education':{{
'institution':'String',
'education_board_type':'String',
'year_of_completion':'String',
'percentage_or_cgpa':'String'
}},
'secondary_education':{{
'institution':'String',
'education_board_type':'String',
'year_of_completion':'String',
'percentage_or_cgpa':'String'
}}
}}"
"""
        chat_completion = llm_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="llama3-70b-8192",
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        # Best-effort by design: the caller treats None as "this resume failed".
        print(f"An error occurred in LLM part: {e}")
        return None
def _parse_dict_literal(candidate):
    """Parse *candidate* as a Python dict literal, then as JSON; None on failure."""
    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Models often answer in JSON (null/true/false), which literal_eval rejects.
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            return None
    return parsed if isinstance(parsed, dict) else None


def extract(output):
    """Pull the ``extracted_content`` dictionary out of the LLM's reply.

    Args:
        output: Raw reply text. Expected to contain ``extracted_content:``
            (the key may be followed by a closing quote) and a ``{...}``
            dictionary written as a Python literal or as JSON.

    Returns:
        The parsed dictionary, or ``{}`` when the reply is empty, contains no
        such payload, or the payload cannot be parsed into a dict.
    """
    if not output:
        print("No extracted content found in parsing llm's output")
        return {}
    match = re.search(r'extracted_content["\']?\s*:\s*(\{.*\})', output, re.DOTALL)
    if not match:
        print("No extracted content found in parsing llm's output")
        return {}
    payload = match.group(1)
    # The greedy match runs to the LAST '}' in the reply, so it may include
    # trailing text or an outer wrapper's brace. Try the longest candidate
    # first and shrink to earlier closing braces until one parses.
    closing_positions = [i + 1 for i, ch in enumerate(payload) if ch == '}']
    for end in reversed(closing_positions):
        parsed = _parse_dict_literal(payload[:end])
        if parsed is not None:
            return parsed
    print("Extracted content could not be parsed into a dictionary")
    return {}
def _block_lines(block):
    """Yield the raw text pieces carried by one block of the parser response."""
    if block.get('tag') == "table":
        for row in block.get('table_rows') or []:
            cells = row.get('cells')
            if cells:
                for cell in cells:
                    value = cell.get('cell_value')
                    if isinstance(value, dict):
                        # A structured cell keeps its text under 'sentences'.
                        yield from value.get('sentences') or []
                    else:
                        yield value
            else:
                # Rows without cells may carry their text directly.
                yield row.get('cell_value')
    else:
        yield from block.get('sentences') or []


def process_resume(pdf_content, timeout=120):
    """Parse one resume PDF into a dictionary of extracted fields.

    The PDF bytes are posted to the document-parser service at
    ``llmsherpa_api_url``; the text of the returned blocks is concatenated
    (one piece per line) and handed to ``process_using_llm``, whose reply is
    parsed with ``extract``.

    Args:
        pdf_content: Raw bytes of the PDF file.
        timeout: Seconds to wait for the parser service before giving up.

    Returns:
        The extracted dictionary; ``{}`` when the LLM step produced nothing
        usable; ``None`` when the parser request failed, returned invalid
        JSON, or yielded no text. All of these are falsy, which is how the
        caller detects a failed resume.
    """
    try:
        response = requests.post(
            llmsherpa_api_url.strip(),
            files={'file': ('resume.pdf', pdf_content, 'application/pdf')},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # One unreachable/slow request should fail this resume only, not the batch.
        print(f"Request to the document parser failed: {e}")
        return None

    # Check if the response is valid JSON. ValueError covers the stdlib
    # decoder error as well as the decode error raised by requests itself.
    try:
        response_json = response.json()
    except ValueError:
        print("Failed to decode JSON response")
        return None

    return_dict = response_json.get('return_dict') if isinstance(response_json, dict) else None
    parsed_document = return_dict.get('result') if isinstance(return_dict, dict) else None
    if not isinstance(parsed_document, dict):
        print("Document parser response did not contain a result")
        return None

    lines = []
    for block in parsed_document.get('blocks') or []:
        for piece in _block_lines(block):
            # Skip empty pieces and nested containers; keep scalars as text.
            if piece is None or piece == '' or isinstance(piece, (dict, list)):
                continue
            lines.append(str(piece))
    content = "".join(line + '\n' for line in lines)
    if not content:
        return None

    processed_text = process_using_llm(content)
    if not processed_text:
        return {}
    try:
        return extract(processed_text)
    except (ValueError, TypeError, SyntaxError) as e:
        # An unparsable LLM reply counts as a failed resume.
        print(f"Could not parse the LLM output: {e}")
        return {}
def _join_values(value):
    """Render a list-valued resume field as one comma-separated string."""
    if value is None:
        return ""
    if isinstance(value, str):
        # A bare string must not be joined character by character.
        return value
    if isinstance(value, (list, tuple)):
        return ", ".join(str(v) for v in value if v is not None and v != "")
    return str(value)


def json_to_excel(data):  # data is a list of JSON
    """Flatten parsed resume dictionaries into a DataFrame ready for export.

    Args:
        data: Iterable of dictionaries in the shape requested from the LLM
            (top-level contact fields, nested education / work sections and
            list-valued skill fields).

    Returns:
        A ``pandas.DataFrame`` with one row per input item. Columns follow the
        fixed order below and only columns filled by at least one row are
        included. Returns ``None`` if the conversion fails.
    """
    # Define the specific order of columns
    column_order = [
        'Name', 'Phone', 'Location', 'Email', 'Linkedin', 'Github',
        'Graduation Details', 'Graduation Institution', 'Graduation Course Type',
        'Year of Graduation', 'Aggregate Percentage in Graduation',
        'Higher Secondary Institute Name', 'Higher Secondary Education Board Type',
        'Year of Completion of Higher Secondary Education',
        'Aggregate Percentage in Higher Secondary Education',
        'Secondary Education Institute Name', 'Secondary Education Board Type',
        'Year of Completion of Secondary Education', 'Aggregate Percentage in Secondary Education',
        'Current Working Organization', 'Current Designation', 'Current Work Duration',
        'Current Work Location', 'Inter Personal Skills', 'Technical Skills',
        'Soft Skills', 'Programming Languages', 'Languages',
    ]
    # Top-level scalar fields: (source key, column); written only when truthy.
    scalar_fields = (
        ('name', 'Name'),
        ('phone', 'Phone'),
        ('location', 'Location'),
        ('email', 'Email'),
        ('linkedin', 'Linkedin'),
        ('github', 'Github'),
    )
    # Nested sections: section key -> ((source key, column), ...).
    section_fields = {
        'graduation_details': (
            ('course', 'Graduation Details'),
            ('institution', 'Graduation Institution'),
            ('course_type', 'Graduation Course Type'),
            ('year_of_graduation', 'Year of Graduation'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Graduation'),
        ),
        'higher_secondary_education': (
            ('institution', 'Higher Secondary Institute Name'),
            ('education_board_type', 'Higher Secondary Education Board Type'),
            ('year_of_completion', 'Year of Completion of Higher Secondary Education'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Higher Secondary Education'),
        ),
        'secondary_education': (
            ('institution', 'Secondary Education Institute Name'),
            ('education_board_type', 'Secondary Education Board Type'),
            ('year_of_completion', 'Year of Completion of Secondary Education'),
            ('percentage_or_cgpa', 'Aggregate Percentage in Secondary Education'),
        ),
        'latest_work_experience': (
            ('company', 'Current Working Organization'),
            ('role', 'Current Designation'),
            ('duration', 'Current Work Duration'),
            ('work_location', 'Current Work Location'),
        ),
    }
    # List-valued fields: the column is written whenever the key is present.
    list_fields = (
        ('inter_personal_skills', 'Inter Personal Skills'),
        ('technical_skills', 'Technical Skills'),
        ('soft_skills', 'Soft Skills'),
        ('programming_languages', 'Programming Languages'),
        ('linguistic_languages', 'Languages'),
    )
    try:
        flat_data = []
        for item in data:
            if not isinstance(item, dict):
                # Keep the row so the output stays aligned with the input.
                item = {}
            flat_item = {}
            for key, column in scalar_fields:
                value = item.get(key)
                if value:
                    flat_item[column] = value
            for section_key, mapping in section_fields.items():
                section = item.get(section_key)
                if not isinstance(section, dict):
                    # The model may return None or a string for a missing section.
                    continue
                for key, column in mapping:
                    value = section.get(key)
                    if value:
                        flat_item[column] = value
            for key, column in list_fields:
                if key in item:
                    flat_item[column] = _join_values(item[key])
            flat_data.append(flat_item)
        # Keep only the columns that at least one row filled, in the fixed order.
        present_columns = [
            col for col in column_order if any(col in row for row in flat_data)
        ]
        return pd.DataFrame(flat_data, columns=present_columns)
    except Exception as e:
        # Callers rely on None (not an exception) to signal a failed export.
        print(f"Error occurred in converting JSON to Excel: {e}")
        return None
def _build_excel_bytes(resumes):
    """Return the XLSX workbook for *resumes* as bytes, or None if conversion failed."""
    # Convert the processed resume data to a pandas DataFrame
    df = json_to_excel(resumes)
    if df is None:
        return None
    # Create an Excel file in memory
    excel_file = io.BytesIO()
    with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name='Resumes')
    return excel_file.getvalue()


def _offer_download(resumes):
    """Render the XLSX download button; return False when no workbook could be built."""
    workbook = _build_excel_bytes(resumes)
    if workbook is None:
        st.error("Aw! Snap, could not process any of the resumes. Please try again later.")
        return False
    st.download_button(
        label="Download XLSX file",
        data=workbook,
        file_name="resume_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    return True


def _store_resumes(resumes):
    """Save each parsed resume to MongoDB, logging (not raising) on failure."""
    for resume in resumes:
        try:
            # Insert a copy so the driver's added '_id' does not leak into the
            # data that is later exported.
            collection.insert_one(dict(resume))
        except Exception as e:
            print(f"Error saving resume to MongoDB: {e}")


def main():
    """Streamlit entry point: upload resumes, parse them, offer an XLSX download.

    Parsed results are kept in ``st.session_state`` because the script is
    re-executed on every widget interaction and the "Process Resumes" button
    only reports True on the run caused by the click; without that, choosing
    a radio option or pressing the download button would discard the results.
    """
    results_key = 'resume_results'

    st.title('Resume Parser')
    # Allow the user to specify the maximum number of resumes to upload
    max_resumes = st.number_input("Maximum number of resumes to upload, limit: 5", min_value=1, max_value=5, value=1, step=1)
    # Allow the user to upload the resumes
    uploaded_files = st.file_uploader("Upload your resumes", type=["pdf"], accept_multiple_files=True)
    if not uploaded_files:
        return
    if len(uploaded_files) != max_resumes:
        st.warning(f"Please upload exactly {max_resumes} resumes.")
        return

    # Identifies the current upload so stale results are never shown for a
    # different set of files.
    upload_signature = tuple((f.name, f.size) for f in uploaded_files)

    if st.button("Process Resumes"):
        try:
            with st.spinner("Your resumes are being processed..."):
                # getvalue() is independent of the read position, so a second
                # click still sends the full file content.
                pdf_contents = [f.getvalue() for f in uploaded_files]
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    # Process each PDF content using the process_resume function
                    results = list(executor.map(process_resume, pdf_contents))
            successful_resumes = [result for result in results if result]
            _store_resumes(successful_resumes)
            st.session_state[results_key] = {
                'signature': upload_signature,
                'successful': successful_resumes,
                'failed': len(results) - len(successful_resumes),
                'total': len(results),
            }
        except Exception as e:
            if results_key in st.session_state:
                del st.session_state[results_key]
            st.error("Aw! Snap, could not process your resumes. Please try again later.")
            print(f"Error processing resumes: {e}")
            return

    if results_key not in st.session_state:
        return
    outcome = st.session_state[results_key]
    if outcome['signature'] != upload_signature:
        return

    successful_resumes = outcome['successful']
    failed_resumes_count = outcome['failed']
    try:
        if not successful_resumes:
            st.error("Aw! Snap, could not process any of the resumes. Please try again later.")
            return
        if failed_resumes_count > 0:
            st.warning(f"{failed_resumes_count} resumes could not be processed. Do you still want to download the successfully processed resumes?")
            user_response = st.radio("Please select:", ("Yes", "No"))
            if user_response == "Yes":
                _offer_download(successful_resumes)
            elif user_response == "No":
                st.info("Then try again after some time.")
            return
        if _offer_download(successful_resumes):
            st.success(f"Resumes processed successfully! {len(successful_resumes)} out of {outcome['total']} resumes processed.")
    except Exception as e:
        st.error("Aw! Snap, could not process your resumes. Please try again later.")
        print(f"Error processing resumes: {e}")


if __name__ == "__main__":
    main()