Spaces:

Prernas19
/

resume_parser

Sleeping

App Files Files Community

Prernas19 committed on Aug 10, 2024

Commit

66dea93

verified ·

1 Parent(s): 2928099

Create app.py

Browse files

Files changed (1) hide show

app.py +148 -0

app.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import gradio as gr
+import spacy
+import re
+import pdfplumber
+import docx
+import nltk
+from nltk.corpus import words
# Fetch the NLTK "words" corpus if needed, then keep its entries in a set so
# membership checks are cheap.
nltk.download('words', quiet=True)
english_words = set(words.words())

# Small English spaCy pipeline, loaded once at import time and shared by the
# extraction functions in this module.
nlp = spacy.load("en_core_web_sm")
def extract_text(file):
    """Return the plain text of an uploaded resume.

    Args:
        file: Either an upload object exposing a ``name`` attribute (the file
            path) or a plain path string.

    Returns:
        The extracted text on success. Problems are reported through sentinel
        strings instead of exceptions: ``"Unsupported file format"`` when the
        extension is neither ``.pdf`` nor ``.docx``, or a string starting with
        ``"Error extracting text: "`` when extraction raised.
    """
    try:
        # Fall back to the object itself so plain path strings work too, and
        # lower-case the name so "RESUME.PDF" is handled like "resume.pdf".
        filename = str(getattr(file, "name", file)).lower()
        if filename.endswith('.pdf'):
            return extract_text_from_pdf(file)
        if filename.endswith('.docx'):
            return extract_text_from_docx(file)
        return "Unsupported file format"
    except Exception as e:
        # Deliberately broad: callers rely on the sentinel string, not on an
        # exception, to detect a failed extraction.
        return f"Error extracting text: {str(e)}"
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one page after another.

    Args:
        file: Path or file object passed straight to ``pdfplumber.open``.

    Returns:
        The page texts joined with newlines. Pages for which
        ``extract_text()`` yields a falsy value contribute an empty string.
    """
    with pdfplumber.open(file) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Separate pages with a newline so the last word of one page does not fuse
    # with the first word of the next.
    return "\n".join(page_texts)
def extract_text_from_docx(file):
    """Return the text of all paragraphs of a DOCX file, one per line.

    Args:
        file: Path or file object passed straight to ``docx.Document``.
    """
    paragraphs = docx.Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def extract_companies(text):
    """List organisations in *text* whose name contains a company marker.

    The text is run through the module-level spaCy pipeline; every ``ORG``
    entity whose text contains a legal-form abbreviation or a typical company
    word (matched case-insensitively) is kept.

    Args:
        text: Resume text.

    Returns:
        The matching entity texts in document order, joined with newlines
        (an empty string when nothing matches).
    """
    # Boundaries are expressed with look-arounds rather than ``\b``: a ``\b``
    # placed right after an escaped period only matches when a word character
    # follows the dot, which would make "Acme Inc." impossible to match.
    # The leading look-behind also rejects a preceding period so that the
    # "S.A" inside "U.S.A." is not taken for the legal form "S.A.".
    company_pattern = re.compile(
        r'(?<![\w.])'
        r'(?:Inc\.?|Corp\.?|Ltd\.?|Pvt\.?|Co\.|S\.A\.|S\.L\.'
        r'|LLC|LLP|PLC|AG|GmbH'
        r'|Company|Group|Services|Technologies|Solutions|Consulting'
        r'|Associates|Enterprises|Partners|Holdings|Systems|Networks'
        r'|Ventures|International)'
        r'(?!\w)',
        re.IGNORECASE,
    )
    doc = nlp(text)
    companies = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and company_pattern.search(ent.text)
    ]
    # Join companies with new lines
    return "\n".join(companies)
def extract_colleges(text):
    """List organisations in *text* that look like educational institutions.

    The text is run through the module-level spaCy pipeline. Walking sentence
    by sentence, every ``ORG`` entity whose lower-cased text contains one of
    the education keywords (substring match) is kept.

    Args:
        text: Resume text.

    Returns:
        The matching entity texts in document order, joined with newlines.
    """
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
    colleges = []
    for sent in nlp(text).sents:
        for ent in sent.ents:
            if ent.label_ != "ORG":
                continue
            lowered = ent.text.lower()
            if any(keyword in lowered for keyword in edu_keywords):
                colleges.append(ent.text)
    # One institution per line
    return "\n".join(colleges)
def extract_years_of_experience(text):
    """Add up every "N year(s)" / "N month(s)" mention found in *text*.

    All mentions are summed, so a duration that is stated twice is counted
    twice. Numbers may be decimal ("2.5 years") and may carry a trailing plus
    sign ("5+ years"); matching is case-insensitive.

    Args:
        text: Resume text.

    Returns:
        ``"<Y> years and <M> months"`` with the months normalised to 0-11, or
        ``"Not available"`` when the total is zero.
    """
    # Capture the whole number, including a decimal part, so "2.5 years" is
    # read as 2.5 and not as "5 years"; tolerate "5+ years" and "5years".
    number = r'(\d+(?:\.\d+)?)\+?\s*'
    # The trailing \b keeps words such as "yearly" / "monthly" from counting.
    year_values = re.findall(number + r'years?\b', text, re.IGNORECASE)
    month_values = re.findall(number + r'months?\b', text, re.IGNORECASE)

    # Work in months so fractional years carry over into the month figure.
    total_months = round(
        sum(float(value) for value in year_values) * 12
        + sum(float(value) for value in month_values)
    )
    if not total_months:
        return "Not available"
    total_experience_years, total_experience_months = divmod(total_months, 12)
    return f"{total_experience_years} years and {total_experience_months} months"
def extract_phone(text):
    """Return the first North-American-style phone number found in *text*.

    Accepted shapes: an optional ``+1`` / ``1`` prefix, a three-digit area
    code with or without parentheses, then three and four digits; the groups
    may be separated by a hyphen, a dot or whitespace.

    Args:
        text: Resume text.

    Returns:
        The matched number exactly as written, or ``"Not found"``.
    """
    # A leading \b cannot match in front of "(" or "+" (both are non-word
    # characters), which would make "(555) 123-4567" unmatchable and drop the
    # "+" of "+1 ...". The look-behind only requires that the number does not
    # start in the middle of a word or a longer digit run.
    phone_pattern = (
        r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})'
        r'[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
def extract_email(text):
    """Return the first e-mail address found in *text*.

    Args:
        text: Resume text.

    Returns:
        The matched address, or ``"Not found"``.
    """
    # The top-level domain is letters only. Inside a character class "|" is a
    # literal pipe, not an alternation, so it must not appear there: otherwise
    # "a@b.com|555-1234" would yield "a@b.com|".
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_summary(doc):
    """Build a short summary from the first suitable sentences of *doc*.

    A sentence qualifies when it has more than five whitespace-separated
    tokens and more than 70% of them, lower-cased, are in ``english_words``.
    At most three sentences are collected, in document order.

    Args:
        doc: Object exposing ``sents``, an iterable of sentences that each
            have a ``text`` attribute.

    Returns:
        The selected sentences joined with single spaces (possibly empty).
    """
    picked = []
    for sent in doc.sents:
        if len(picked) >= 3:  # Limit to 3 sentences
            break
        tokens = sent.text.split()
        if len(tokens) <= 5:
            continue
        known = sum(1 for token in tokens if token.lower() in english_words)
        if known / len(tokens) > 0.7:
            picked.append(sent.text)
    return " ".join(picked)
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*.

    Matches ``linkedin.com/in/<handle>`` with an optional scheme
    (``http://``, ``https://`` or ``//``), an optional subdomain such as
    ``www.`` and an optional trailing slash. Matching is case-insensitive.

    Args:
        text: Resume text.

    Returns:
        The matched URL exactly as written, or ``"Not found"``.
    """
    # - The scheme is optional because resumes often print the bare
    #   "linkedin.com/in/..." form.
    # - The look-behind stops a match from starting inside another host name
    #   or e-mail address.
    # - The handle uses A-Za-z explicitly; the range "A-z" would also accept
    #   the punctuation characters that sit between "Z" and "a" in ASCII.
    linkedin_pattern = (
        r'(?<![\w.@-])(?:(?:https?:)?//)?(?:[\w-]+\.)?'
        r'linkedin\.com/in/[A-Za-z0-9_-]+/?'
    )
    match = re.search(linkedin_pattern, text, re.IGNORECASE)
    return match.group() if match else "Not found"
def parse_resume(file):
    """Extract all resume fields shown in the Gradio interface.

    Args:
        file: Uploaded resume, in any form accepted by ``extract_text``.

    Returns:
        A 7-tuple of strings ``(companies, colleges, years_of_experience,
        phone, email, summary, linkedin)``.

    Raises:
        gr.Error: If the text could not be extracted, the file format is
            unsupported, or one of the extraction steps failed. The function
            feeds an interface with seven output components, so a failure is
            raised for Gradio to display instead of being returned as a single
            value of the wrong shape.
    """
    text = extract_text(file)
    # extract_text signals failure through sentinel strings, not exceptions.
    # Compare against the full prefix so a resume that merely begins with the
    # word "Error" is not mistaken for a failure.
    if text.startswith("Error extracting text:") or text == "Unsupported file format":
        raise gr.Error(text)

    try:
        doc = nlp(text)
        return (
            extract_companies(text),
            extract_colleges(text),
            extract_years_of_experience(text),
            extract_phone(text),
            extract_email(text),
            extract_summary(doc),
            extract_linkedin(text),
        )
    except Exception as e:
        # Chain the original exception so its traceback is still attached
        # while the user only sees the short message.
        raise gr.Error(f"An error occurred while parsing the resume: {str(e)}") from e
# Output text boxes, listed in the order of the values returned by
# parse_resume: (label, extra keyword arguments for gr.Textbox).
_output_fields = [
    ("Companies Worked For", {"lines": 10}),
    ("Colleges Attended", {"lines": 10}),
    ("Years of Experience", {}),
    ("Phone Number", {}),
    ("Email ID", {}),
    ("Summary", {"lines": 3}),
    ("LinkedIn ID", {}),
]

# Single-function interface: one file upload in, one text box per field out.
iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=[gr.Textbox(label=label, **extra) for label, extra in _output_fields],
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# Start the app as soon as the module is executed, requesting a share link.
iface.launch(share=True)