Spaces:

bangaboy
/

resume_parser

Runtime error

App Files Community

bangaboy committed on Sep 17, 2024

Commit

ea88d49

verified ·

1 Parent(s): 4580e9c

Create app.py

Browse files

Files changed (1) hide show

app.py +221 -0

app.py ADDED Viewed

	@@ -0,0 +1,221 @@

# Environment setup (kept as comments on purpose).
#
# The original lines here were IPython/Colab magics ("!pip ...", "!python ...",
# "%%writefile ..."). They only work inside a notebook cell; in a plain Python
# file such as app.py they are a SyntaxError, so the script cannot start.
#
# Install the dependencies outside the script, e.g. via requirements.txt:
#   streamlit
#   google-generativeai
#   pymupdf
#   pyngrok
#   transformers
#   spacy
#   python-docx
#   nltk
#   dateparser
#
# One-time model / corpus downloads:
#   python -m spacy download en_core_web_sm
#   python -m nltk.downloader words
#
# In the notebook, the code from here down to the `main()` call was written to
# combined_resume_analyzer.py with `%%writefile combined_resume_analyzer.py`.
+import google.generativeai as genai
+import fitz  # PyMuPDF for PDF text extraction
+import streamlit as st
+import spacy
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+from docx import Document
+import re
+from nltk.corpus import words
+import dateparser
+from datetime import datetime
+from pyngrok import ngrok
+import os
# Hugging Face checkpoint used for named-entity recognition.
NER_MODEL_NAME = "Babelscape/wikineural-multilingual-ner"


@st.cache_resource
def _load_nlp_resources():
    """Load the SpaCy pipeline, the NER tokenizer/model/pipeline and the NLTK word list.

    Streamlit re-executes the whole script on every widget interaction, so
    loading these at module level would repeat the work on each rerun.
    ``st.cache_resource`` keeps one shared copy for the life of the server
    process.

    Returns:
        A tuple ``(nlp_spacy, tokenizer, model, nlp_ner, english_words)``.
    """
    spacy_pipeline = spacy.load('en_core_web_sm')
    ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
    ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
    ner_pipeline = pipeline(
        'ner',
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
    )
    vocabulary = set(words.words())
    return spacy_pipeline, ner_tokenizer, ner_model, ner_pipeline, vocabulary


# Same module-level names as before, so the functions below are unaffected.
nlp_spacy, tokenizer, model, nlp_ner, english_words = _load_nlp_resources()
+# Function to authenticate with Gemini API
def authenticate_gemini(api_key):
    """Configure the Gemini client and build the generative model.

    Args:
        api_key: Google Gemini API key passed to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for "gemini-1.5-flash-latest", or ``None``
        when configuration raised. Success or failure is reported on the
        Streamlit page either way.
    """
    try:
        genai.configure(api_key=api_key)
        gemini = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
        st.success("Gemini API successfully configured.")
    except Exception as exc:
        st.error(f"Error configuring Gemini API: {exc}")
        return None
    return gemini
+# Function to filter and refine extracted ORG entities
def refine_org_entities(entities):
    """Keep only the entity strings that look like company names.

    An entity is kept when it either ends with a known company suffix
    (``Inc``, ``LLC``, ``Corp`` ...) or starts with two capitalised words
    ("Acme Widgets").

    Args:
        entities: Iterable of candidate organisation names.

    Returns:
        A sorted list of the unique names that passed the filter, with
        surrounding whitespace removed.
    """
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    two_capitalised_words = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')

    refined_entities = set()
    for entity in entities:
        candidate = entity.strip()
        if not candidate:
            continue
        # Also test with trailing punctuation removed so "IBM Corp." and
        # "Acme Inc.," are recognised; the raw check keeps "Foo S.A." working.
        has_suffix = (
            candidate.endswith(company_suffixes)
            or candidate.rstrip('.,').endswith(company_suffixes)
        )
        if has_suffix or two_capitalised_words.match(candidate):
            refined_entities.add(candidate)
    # Sorted so the displayed order is stable between runs.
    return sorted(refined_entities)
+# Function to extract ORG entities using NER
def extract_orgs(text):
    """Return the company-like organisation names found in ``text``.

    Runs the NER pipeline, collects the distinct ``word`` values of entities
    whose group is ``ORG``, and passes them through ``refine_org_entities``.
    """
    org_names = {
        span['word']
        for span in nlp_ner(text)
        if span['entity_group'] == 'ORG'
    }
    return refine_org_entities(org_names)
+# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object; its full contents are read with
            ``pdf_file.read()`` and parsed in memory.

    Returns:
        The text of all pages joined in page order ("" for a PDF without text).
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf:
        return "".join(page.get_text() for page in pdf)
+# Extract text from DOCX
def extract_text_from_doc(doc_file):
    """Return the paragraph texts of a DOCX document, one per line.

    Args:
        doc_file: Path or file-like object handed to ``Document``.
    """
    paragraphs = Document(doc_file).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
+# Summary generation function
def generate_summary(text, model):
    """Ask ``model`` for a 100-word summary of ``text``.

    Args:
        text: Document text to summarise.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The response text, or an "Error generating summary: ..." message when
        the request or reading the response raised.
    """
    request = f"Can you summarize the following document in 100 words?\n\n{text}"
    try:
        reply = model.generate_content(request)
        return reply.text
    except Exception as exc:
        return f"Error generating summary: {exc}"
+# Additional resume parsing functions
def extract_experience(doc):
    """Estimate years of experience from the DATE entities of ``doc``.

    Each DATE entity's text is parsed with ``dateparser``; the result is the
    largest difference between the current year and a parsed year, never less
    than 0. Entities that cannot be parsed are ignored.

    Args:
        doc: Object with an ``ents`` iterable whose items expose ``label_``
            and ``text``.
    """
    longest_span = 0
    date_entities = (ent for ent in doc.ents if ent.label_ == "DATE")
    for date_entity in date_entities:
        parsed = dateparser.parse(date_entity.text)
        if not parsed:
            continue
        years_since = datetime.now().year - parsed.year
        if years_since > longest_span:
            longest_span = years_since
    return longest_span
def extract_phone(text):
    """Return the first North-American style phone number in ``text``.

    Accepts an optional ``+1``/``1`` country prefix, an area code with or
    without parentheses, and ``-``, ``.`` or whitespace as separators.

    Args:
        text: Text to search.

    Returns:
        The matched number exactly as written, or "Not found".
    """
    # A leading ``\b`` cannot match in front of "(" or "+" (both are non-word
    # characters), so "(555) 123-4567" was missed and "+1 ..." lost its "+".
    # A negative lookbehind expresses "not glued to a preceding word
    # character" without that limitation.
    phone_pattern = (
        r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
def extract_email(text):
    """Return the first e-mail address in ``text``, or "Not found".

    Args:
        text: Text to search.
    """
    # The TLD class is ``[A-Za-z]``; the previous ``[A-Z|a-z]`` also accepted
    # a literal "|" because "|" has no special meaning inside a character class.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_colleges(doc):
    """Return the distinct ORG entities of ``doc`` that name an educational institution.

    An entity qualifies when its lower-cased text contains "university",
    "college", "institute" or "school".

    Args:
        doc: Object with an ``ents`` iterable whose items expose ``label_``
            and ``text``.

    Returns:
        A list of unique entity texts (order not guaranteed).
    """
    edu_keywords = ("university", "college", "institute", "school")

    def looks_educational(name):
        lowered = name.lower()
        return any(keyword in lowered for keyword in edu_keywords)

    return list({
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and looks_educational(ent.text)
    })
def extract_linkedin(text):
    """Return the first LinkedIn profile URL in ``text``, or "Not found".

    Matches ``linkedin.com/in/<slug>`` with an optional ``http(s)://`` or
    ``//`` prefix, an optional subdomain such as ``www.`` and an optional
    trailing slash. Resumes often print the URL without a scheme, so the
    scheme is not required.

    Args:
        text: Text to search.
    """
    # The slug class is ``[A-Za-z0-9_-]``; the range ``A-z`` also covers the
    # ASCII punctuation between "Z" and "a" ("[", "\", "]", "^", "`").
    # The lookbehind stops the match from starting inside another host name
    # (e.g. "notlinkedin.com").
    linkedin_pattern = (
        r'(?<![\w.-])(?:(?:https?:)?//)?(?:[\w-]+\.)?'
        r'linkedin\.com/in/[A-Za-z0-9_-]+/?'
    )
    match = re.search(linkedin_pattern, text)
    return match.group() if match else "Not found"
+# Main function to process the resume and return the analysis
def main():
    """Render the resume-analyzer page and process an uploaded resume.

    Asks for a Gemini API key and a resume file. Once both are supplied, the
    resume text is extracted and run through the SpaCy pipeline, the NER-based
    company extractor, the regex extractors and the Gemini summary; the
    results are then written to the page. Any exception raised while
    processing is shown with ``st.error`` instead of propagating.
    """
    st.title("Comprehensive Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")

    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    # Only PDF and DOCX are offered: python-docx cannot open legacy binary
    # .doc files, so accepting them only produced an opaque processing error.
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    # Nothing to do until both inputs are present.
    if uploaded_file is None or not api_key:
        return

    try:
        model = authenticate_gemini(api_key)
        if model is None:
            return

        # Pick the extractor from the file extension.
        file_ext = uploaded_file.name.split('.')[-1].lower()
        if file_ext == 'pdf':
            resume_text = extract_text_from_pdf(uploaded_file)
        elif file_ext == 'docx':
            resume_text = extract_text_from_doc(uploaded_file)
        else:
            st.error("Unsupported file format.")
            return

        if not resume_text.strip():
            st.error("The resume appears to be empty.")
            return

        doc = nlp_spacy(resume_text)

        companies = extract_orgs(resume_text)
        summary = generate_summary(resume_text, model)
        experience = extract_experience(doc)
        phone = extract_phone(resume_text)
        email = extract_email(resume_text)
        colleges = extract_colleges(doc)
        linkedin = extract_linkedin(resume_text)

        st.subheader("Extracted Information")
        st.write(f"*Years of Experience:* {experience}")
        st.write("*Companies Worked For:*")
        # Show the same "Not found" placeholder the scalar fields use rather
        # than an empty line when nothing was extracted.
        st.write(", ".join(companies) if companies else "Not found")
        st.write(f"*Phone Number:* {phone}")
        st.write(f"*Email ID:* {email}")
        st.write("*Colleges Attended:*")
        st.write(", ".join(colleges) if colleges else "Not found")
        st.write(f"*LinkedIn ID:* {linkedin}")

        st.subheader("Generated Summary")
        st.write(summary)
    except Exception as e:
        # Top-level UI boundary: report the failure on the page.
        st.error(f"Error during processing: {e}")
# Script entry point for combined_resume_analyzer.py.
if __name__ == "__main__":
    main()

# The import below is the first line of the next notebook cell (the ngrok
# launcher). It had been fused onto the `main()` line when the cells were
# concatenated, which is a SyntaxError.
from pyngrok import ngrok
# --- ngrok launcher cell ----------------------------------------------------
# Opens an ngrok tunnel to port 8501 and then starts the Streamlit app stored
# in combined_resume_analyzer.py. Intended to be run as its own cell/script,
# separately from the Streamlit app itself.
import os
import subprocess

from pyngrok import ngrok

# Read the authtoken from the environment instead of hard-coding it.
# SECURITY: the token that was previously committed on this line is public
# and must be revoked/rotated in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_authtoken:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before launching.")
ngrok.set_auth_token(ngrok_authtoken)

# Stop any Streamlit process left over from a previous run. pkill exits
# non-zero when nothing matched, which is not an error here (check=False).
subprocess.run(["pkill", "-f", "streamlit"], check=False)

# Open the tunnel before starting the server: the `streamlit run` call below
# blocks until the server exits.
public_url = ngrok.connect(8501)
print("Public URL:", public_url)

# Launch Streamlit (argument list, no shell involved).
subprocess.run(["streamlit", "run", "combined_resume_analyzer.py"], check=False)
# --- ngrok launcher cell (second copy) ---------------------------------------
# Performs the same steps as the launcher cell before it: configure ngrok,
# stop any running Streamlit process, open a tunnel to port 8501 and start
# combined_resume_analyzer.py.
import os
import subprocess

from pyngrok import ngrok

# The authtoken comes from the environment, never from source control.
# SECURITY: the token previously hard-coded here is public and must be
# revoked/rotated in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_authtoken:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before launching.")
ngrok.set_auth_token(ngrok_authtoken)

# pkill returns non-zero when no process matched; that is fine (check=False).
subprocess.run(["pkill", "-f", "streamlit"], check=False)

# The tunnel is opened first because `streamlit run` blocks until it exits.
public_url = ngrok.connect(8501)
print("Public URL:", public_url)

subprocess.run(["streamlit", "run", "combined_resume_analyzer.py"], check=False)