# Hugging Face Spaces app: resume parser built with Gradio, spaCy, and NLTK.
import gradio as gr
import spacy
import re
import pdfplumber
import docx
import nltk
from nltk.corpus import words
from spacy.cli import download

# Load the spaCy model, downloading it only when it is not already installed.
# The original downloaded unconditionally on every launch, which is slow and
# requires network access even when the model is present.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# English vocabulary used by extract_summary to score sentence fluency.
nltk.download('words', quiet=True)
english_words = set(words.words())
def extract_text(file):
    """Dispatch text extraction based on the uploaded file's extension.

    Returns the extracted text, "Unsupported file format" for unknown
    extensions, or an "Error extracting text: ..." message on failure.
    """
    try:
        # Compare case-insensitively so ".PDF" / ".Docx" uploads work too;
        # the original endswith check was case-sensitive and rejected them.
        name = file.name.lower()
        if name.endswith('.pdf'):
            return extract_text_from_pdf(file)
        elif name.endswith('.docx'):
            return extract_text_from_docx(file)
        else:
            return "Unsupported file format"
    except Exception as e:
        return f"Error extracting text: {str(e)}"
def extract_text_from_pdf(file):
    """Concatenate the text of every page of a PDF into one string."""
    with pdfplumber.open(file) as pdf:
        # extract_text() can return None (e.g. image-only pages); treat as ''.
        page_texts = (page.extract_text() or '' for page in pdf.pages)
        return ''.join(page_texts)
def extract_text_from_docx(file):
    """Return the full text of a DOCX file, one paragraph per line."""
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def extract_companies(text):
    """Return newline-separated ORG entities that carry a company suffix.

    An ORG entity counts as a company only if it contains a corporate
    suffix word (Inc, Corp, LLC, ...), matched case-insensitively.
    """
    suffix_re = re.compile(
        r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
    parsed = nlp(text)
    matches = [
        ent.text
        for ent in parsed.ents
        if ent.label_ == "ORG" and suffix_re.search(ent.text)
    ]
    return "\n".join(matches)
def extract_colleges(text):
    """Return newline-separated ORG entities containing education keywords."""
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
    parsed = nlp(text)
    found = []
    # Walk sentence by sentence, keeping ORG entities whose text mentions
    # any education-related keyword (case-insensitive substring match).
    for sentence in parsed.sents:
        for ent in sentence.ents:
            if ent.label_ != "ORG":
                continue
            lowered = ent.text.lower()
            if any(keyword in lowered for keyword in edu_keywords):
                found.append(ent.text)
    return "\n".join(found)
def extract_years_of_experience(text):
    """Sum every "<n> year(s)" / "<n> month(s)" mention into one duration.

    Months overflow into years (12 months = 1 year). Returns the string
    "<Y> years and <M> months", or "Not available" when no non-zero
    duration is mentioned anywhere in the text.
    """
    year_hits = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
    month_hits = re.findall(r'(\d+)\s+month[s]*', text, re.IGNORECASE)
    raw_months = sum(int(m) for m in month_hits)
    total_years = sum(int(y) for y in year_hits) + raw_months // 12
    leftover_months = raw_months % 12
    if not (total_years or leftover_months):
        return "Not available"
    return f"{total_years} years and {leftover_months} months"
| def extract_phone(text): | |
| phone_patterns = [ | |
| r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b', | |
| r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b' | |
| ] | |
| for pattern in phone_patterns: | |
| match = re.search(pattern, text) | |
| if match: | |
| return match.group() | |
| return "Not found" | |
def extract_email(text):
    """Return the first email address found in the text, or "Not found"."""
    # TLD class fixed to [A-Za-z]: the original [A-Z|a-z] contained a
    # literal '|', so a pipe character was accepted inside the suffix.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_summary(doc):
    """Pick up to three sentences that read as fluent English prose.

    A sentence qualifies when it has more than five words and over 70%
    of its words (lower-cased) appear in the NLTK English vocabulary.
    """
    chosen = []
    for sentence in doc.sents:
        tokens = sentence.text.split()
        if len(tokens) <= 5:
            continue
        recognized = sum(1 for token in tokens if token.lower() in english_words)
        if recognized / len(tokens) > 0.7:
            chosen.append(sentence.text)
            if len(chosen) == 3:  # Limit to 3 sentences
                break
    return " ".join(chosen)
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found, or "Not found"."""
    # Slug class fixed to [A-Za-z...]: the original [A-z...] range also
    # spans the ASCII punctuation [ \ ] ^ ` that sits between 'Z' and 'a'.
    linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?'
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
def parse_resume(file):
    """Extract all resume fields from an uploaded file.

    Always returns a 7-tuple (companies, colleges, years of experience,
    phone, email, summary, linkedin) to match the seven Gradio output
    components. The original returned a dict or a bare string on error
    paths, which crashes the interface with an output-count mismatch, so
    errors are replicated across all seven slots instead.
    """
    try:
        text = extract_text(file)
        if text.startswith("Error") or text == "Unsupported file format":
            return (text,) * 7
        doc = nlp(text)
        return (
            extract_companies(text),
            extract_colleges(text),
            extract_years_of_experience(text),
            extract_phone(text),
            extract_email(text),
            extract_summary(doc),
            extract_linkedin(text),
        )
    except Exception as e:
        import traceback
        message = (
            f"An error occurred while parsing the resume: {str(e)}"
            f"\n\nTraceback:\n{traceback.format_exc()}"
        )
        return (message,) * 7
# Build the Gradio UI: one file input, seven textbox outputs mirroring the
# 7-tuple produced by parse_resume, then launch with a public share link.
_output_boxes = [
    gr.Textbox(label="Companies Worked For", lines=10),
    gr.Textbox(label="Colleges Attended", lines=10),
    gr.Textbox(label="Years of Experience"),
    gr.Textbox(label="Phone Number"),
    gr.Textbox(label="Email ID"),
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="LinkedIn ID"),
]
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=_output_boxes,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)
iface.launch(share=True)