Spaces:

bangaboy
/

pp

Sleeping

App Files Files Community

bangaboy committed on Sep 8, 2025

Commit

0132466

verified ·

1 Parent(s): 34860cb

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +128 -180

src/streamlit_app.py CHANGED Viewed

@@ -1,251 +1,199 @@
 import os
-os.environ["STREAMLIT_HOME"] = "/app/.streamlit"
 import streamlit as st
 import google.generativeai as genai
 from PIL import Image
 import fitz  # PyMuPDF
 from docx import Document
 import json
 from pathlib import Path
 from datetime import datetime
 import re
-import pytesseract
-import io
 def extract_text_from_pdf(pdf_file):
-    """Extract text from uploaded PDF file."""
     text_content = []
     try:
-        pdf_bytes = pdf_file.read()
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-        for page_num in range(len(doc)):
-            page = doc[page_num]
-            text_content.append(page.get_text())
         return "\n".join(text_content)
     except Exception as e:
-        st.error(f"Error in PDF extraction: {str(e)}")
         return ""
 def extract_text_from_docx(docx_file):
-    """Extract text from uploaded DOCX file."""
     try:
         doc = Document(docx_file)
-        text_content = []
-        for paragraph in doc.paragraphs:
-            text_content.append(paragraph.text)
-        return "\n".join(text_content)
     except Exception as e:
-        st.error(f"Error in DOCX extraction: {str(e)}")
         return ""
 def parse_date(date_str):
-    """Parse date from various formats."""
     try:
-        # Handle 'Present' or 'Current'
-        if date_str.lower() in ['present', 'current', 'now']:
             return datetime.now()
         date_str = date_str.strip()
-        formats = [
-            '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
-            '%Y/%m', '%Y-%m'
-        ]
         for fmt in formats:
             try:
                 return datetime.strptime(date_str, fmt)
-            except ValueError:
                 continue
-        year_match = re.search(r'\b20\d{2}\b', date_str)
         if year_match:
-            return datetime.strptime(year_match.group(), '%Y')
         return None
-    except Exception:
         return None
 def calculate_experience(work_history):
-    """Calculate total years of experience from work history."""
-    total_experience = 0
-    current_year = datetime.now().year
     for job in work_history:
-        duration = job.get('duration', '')
         if not duration:
             continue
-        parts = re.split(r'\s*-\s*|\s+to\s+', duration)
         if len(parts) != 2:
             continue
-        start_date = parse_date(parts[0])
-        end_date = parse_date(parts[1])
-        if start_date and end_date:
-            years = (end_date.year - start_date.year) + \
-                   (end_date.month - start_date.month) / 12
-            total_experience += max(0, years)
-    return round(total_experience, 1)
 def parse_resume(file_uploaded, api_key):
-    """Parse resume and extract information."""
     genai.configure(api_key=api_key)
-    model = genai.GenerativeModel('gemini-1.5-flash')
     prompt = """Extract the following information from this resume:
-    1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
-    2. Full Name
-    3. Email Address
-    4. Phone Number
-    5. Education History (including degree, institution, graduation year, and field of study)
-    6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
-    7. Skills
-    8. LinkedIn Profile URL
-    Return the information in this JSON format:
-    {
-        "summary": "",
-        "name": "",
-        "email": "",
-        "phone": "",
-        "education": [
-            {
-                "degree": "",
-                "institution": "",
-                "year": "",
-                "field": "",
-                "gpa": ""
-            }
-        ],
-        "work_experience": [
-            {
-                "company": "",
-                "position": "",
-                "duration": ""
-            }
-        ],
-        "skills": [],
-        "linkedin": ""
-    }
-    For skills include tools and technologies in output if present any in resume.
-    For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
-    Only return the JSON object, nothing else. If any field is not found, leave it empty."""
-    try:
-        file_extension = Path(file_uploaded.name).suffix.lower()
-        if file_extension == '.pdf':
-            text_content = extract_text_from_pdf(file_uploaded)
-        elif file_extension in ['.docx', '.doc']:
-            text_content = extract_text_from_docx(file_uploaded)
-        elif file_extension in ['.jpg', '.jpeg', '.png']:
-            image = Image.open(file_uploaded)
-            text_content = pytesseract.image_to_string(image)
-        else:
-            st.error(f"Unsupported file format: {file_extension}")
-            return None
         response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
-        try:
-            response_text = response.text
-            json_start = response_text.find('{')
-            json_end = response_text.rfind('}') + 1
-            json_str = response_text[json_start:json_end]
-            result = json.loads(json_str)
-            total_exp = calculate_experience(result.get('work_experience', []))
-            result['total_years_experience'] = total_exp
-            return result
-        except json.JSONDecodeError as e:
-            st.error(f"Error parsing response: {str(e)}")
-            return None
     except Exception as e:
-        st.error(f"Error processing resume: {str(e)}")
         return None
 def format_education(edu):
-    """Format education details for display."""
     parts = []
-    if edu.get('degree'):
-        parts.append(edu['degree'])
-    if edu.get('field'):
         parts.append(f"in {edu['field']}")
-    if edu.get('institution'):
         parts.append(f"from {edu['institution']}")
-    if edu.get('year'):
         parts.append(f"({edu['year']})")
-    if edu.get('gpa') and edu['gpa'].strip():
         parts.append(f"- GPA: {edu['gpa']}")
     return " ".join(parts)
 def main():
-    st.title("Resume Parser")
-    st.write("Upload a resume (PDF, DOCX, or Image) to extract information")
-    # Get API key from secrets or user input
     api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
-    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])
     if uploaded_file and api_key:
-        with st.spinner('Analyzing resume...'):
             result = parse_resume(uploaded_file, api_key)
-            if result:
-                st.subheader("Extracted Information")
-                # Display summary in a text area
-                st.text_area("Summary", result.get('summary', 'Not found'), height=100)
-                # Display personal information
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    st.write("**Name:**", result.get('name', 'Not found'))
-                with col2:
-                    st.write("**Email:**", result.get('email', 'Not found'))
-                with col3:
-                    st.write("**Phone:**", result.get('phone', 'Not found'))
-                # Display total experience
-                total_exp = result.get('total_years_experience', 0)
-                exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
-                st.write("**Total Experience:**", exp_text)
-                # Display education
-                st.subheader("Education")
-                if result.get('education'):
-                    for edu in result['education']:
-                        st.write(f"- {format_education(edu)}")
-                else:
-                    st.write("No education information found")
-                # Display work experience
-                st.subheader("Work Experience")
-                if result.get('work_experience'):
-                    for exp in result['work_experience']:
-                        duration = f" ({exp.get('duration', 'Duration not specified')})" if exp.get('duration') else ""
-                        st.write(f"- {exp.get('position', 'Role not found')} at {exp.get('company', 'Company not found')}{duration}")
-                else:
-                    st.write("No work experience found")
-                # Display Skills
-                st.subheader("Skills:")
-                if result.get('skills'):
-                    for skill in result['skills']:
-                        st.write(f"- {skill}")
-                else:
-                    st.write("- No skills found")
-                # Display LinkedIn profile
-                st.write("**LinkedIn Profile:**", result.get('linkedin', 'Not found'))
 if __name__ == "__main__":
-    main()

+# === src/streamlit_app.py ===
 import os
+os.environ["STREAMLIT_HOME"] = "/app/.streamlit"  # Must be first
 import streamlit as st
 import google.generativeai as genai
 from PIL import Image
 import fitz  # PyMuPDF
 from docx import Document
+import pytesseract
+import io
 import json
 from pathlib import Path
 from datetime import datetime
 import re
+# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
 def extract_text_from_pdf(pdf_file):
     """Extract text from a PDF, using OCR for pages that yield no text.

     Args:
         pdf_file: Binary file-like object whose ``read()`` returns the PDF bytes.

     Returns:
         The text of all pages joined with newlines, or ``""`` when reading,
         opening or OCR fails (the error is reported through ``st.error``).
     """
     text_content = []
     try:
         # Read inside the try block so a failing upload stream is reported
         # like any other extraction error instead of crashing the app.
         pdf_bytes = pdf_file.read()
         # The context manager closes the document even if a page fails.
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             for page in doc:
                 page_text = page.get_text()
                 if not page_text.strip():
                     # No extractable text: render the page and OCR the image.
                     pix = page.get_pixmap()
                     with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                         page_text = pytesseract.image_to_string(img)
                 text_content.append(page_text)
         return "\n".join(text_content)
     except Exception as e:
         st.error(f"PDF extraction error: {str(e)}")
         return ""
 def extract_text_from_docx(docx_file):
     """Extract text from a DOCX file, including text placed in tables.

     Args:
         docx_file: Path or binary file-like object accepted by ``Document``.

     Returns:
         Paragraph text followed by table rows (cells joined with ``" | "``),
         one item per line, or ``""`` when the file cannot be read (the error
         is reported through ``st.error``).
     """
     try:
         doc = Document(docx_file)
         lines = [p.text for p in doc.paragraphs]
         # Table text is not part of ``doc.paragraphs``; resumes that use a
         # table layout would otherwise lose most of their content.
         for table in doc.tables:
             for row in table.rows:
                 cells = []
                 for cell in row.cells:
                     text = cell.text.strip()
                     # A merged cell is returned once per grid column it
                     # spans, so skip consecutive repeats.
                     if text and (not cells or cells[-1] != text):
                         cells.append(text)
                 if cells:
                     lines.append(" | ".join(cells))
         return "\n".join(lines)
     except Exception as e:
         st.error(f"DOCX extraction error: {str(e)}")
         return ""
+def extract_text_from_image(image_file):
+    try:
+        image = Image.open(image_file)
+        return pytesseract.image_to_string(image)
+    except Exception as e:
+        st.error(f"Image extraction error: {str(e)}")
+        return ""
+# ---------------- DATE / EXPERIENCE CALCULATION ----------------
 def parse_date(date_str):
     """Parse a loosely formatted resume date into a ``datetime``.

     Args:
         date_str: Text such as ``"Jan 2020"``, ``"03/2019"``, ``"2018"`` or
             ``"Present"``. Surrounding whitespace is ignored.

     Returns:
         ``datetime.now()`` for "present"/"current"/"now" (case-insensitive);
         the parsed date (first day of the month or year) for a supported
         format; 1 January of the first 19xx/20xx year found in the text as a
         fallback; ``None`` when nothing can be parsed or the input is not a
         string.
     """
     if not isinstance(date_str, str):
         return None
     # Strip before the keyword check so " Present " is recognised too.
     date_str = date_str.strip()
     if date_str.lower() in ("present", "current", "now"):
         return datetime.now()
     formats = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
     for fmt in formats:
         try:
             return datetime.strptime(date_str, fmt)
         except ValueError:
             continue
     # Fallback for text such as "Summer 1998": keep at least the year.
     year_match = re.search(r"\b(?:19|20)\d{2}\b", date_str)
     if year_match:
         return datetime.strptime(year_match.group(), "%Y")
     return None
 def calculate_experience(work_history):
     """Sum the durations of the jobs in ``work_history``, in years.

     Args:
         work_history: Iterable of dicts with a ``"duration"`` string such as
             ``"Jan 2020 - Present"``, ``"2018-2020"`` or ``"2019 to 2021"``.
             ``None``, non-dict entries and unparseable durations are skipped.

     Returns:
         Total years rounded to one decimal. Each job contributes the
         month-granular difference between its start and end date (never
         negative); overlapping jobs are counted separately.
     """
     total_exp = 0
     for job in work_history or []:
         if not isinstance(job, dict):
             continue
         duration = job.get("duration", "")
         if not duration or not isinstance(duration, str):
             continue
         # Treat en/em dashes and the minus sign like a plain hyphen.
         duration = re.sub(r"[\u2012-\u2015\u2212]", "-", duration.strip())
         # Prefer a separator surrounded by whitespace so that hyphenated
         # dates ("05-2020 - 06-2021") are not cut in the middle.
         parts = re.split(r"\s+(?:-|to)\s+", duration, flags=re.IGNORECASE)
         if len(parts) != 2:
             # Compact ranges such as "2018-2020".
             parts = re.split(r"\s*-\s*|\s+to\s+", duration)
         if len(parts) != 2:
             continue
         start, end = parse_date(parts[0]), parse_date(parts[1])
         if start and end:
             years = (end.year - start.year) + (end.month - start.month) / 12
             total_exp += max(0, years)
     return round(total_exp, 1)
+# ---------------- RESUME PARSING ----------------
 def parse_resume(file_uploaded, api_key):
     """Extract structured resume data from an uploaded file using Gemini.

     The file's text is extracted according to its extension (PDF, DOCX/DOC
     or JPG/JPEG/PNG), sent to the model together with an extraction prompt,
     and the JSON object found in the reply is parsed.

     Args:
         file_uploaded: Uploaded file object with a ``name`` attribute; it is
             passed on to the matching ``extract_text_from_*`` helper.
         api_key: Gemini API key passed to ``genai.configure``.

     Returns:
         The parsed dict with an added ``"total_years_experience"`` entry, or
         ``None`` when the file type is unsupported, no text was found, or the
         model call or JSON parsing failed (each case is reported through
         ``st.error``).
     """
     genai.configure(api_key=api_key)
     model = genai.GenerativeModel("gemini-1.5-flash")
     prompt = """Extract the following information from this resume:
 1. Summarize in 100 words, focus on skills, experience, qualifications.
 2. Full Name
 3. Email
 4. Phone
 5. Education (degree, institution, year, field)
 6. Work experience with exact duration (e.g., Jan 2020 - Present)
 7. Skills
 8. LinkedIn URL
 Return as JSON:
 {
 "summary": "", "name": "", "email": "", "phone": "",
 "education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
 "work_experience": [{"company": "", "position": "", "duration": ""}],
 "skills": [], "linkedin": ""
 }"""
     # Extract text with the helper that matches the file extension.
     ext = Path(file_uploaded.name).suffix.lower()
     if ext == ".pdf":
         text_content = extract_text_from_pdf(file_uploaded)
     elif ext in [".docx", ".doc"]:
         text_content = extract_text_from_docx(file_uploaded)
     elif ext in [".jpg", ".jpeg", ".png"]:
         text_content = extract_text_from_image(file_uploaded)
     else:
         st.error(f"Unsupported file type: {ext}")
         return None
     if not text_content.strip():
         st.error("No text found in resume.")
         return None
     # Generate JSON from Gemini
     try:
         response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
         response_text = response.text
         st.text_area("Raw Response", response_text, height=200)  # Debugging
         # The reply may wrap the JSON in prose or a code fence, so keep only
         # the span from the first "{" to the last "}".
         json_start = response_text.find("{")
         json_end = response_text.rfind("}") + 1
         if json_start == -1 or json_end <= json_start:
             st.error("Error parsing resume: no JSON object found in the model response.")
             return None
         result = json.loads(response_text[json_start:json_end])
         # "work_experience" may be JSON null; fall back to an empty list so
         # the rest of the extracted data is still returned.
         result["total_years_experience"] = calculate_experience(result.get("work_experience") or [])
         return result
     except Exception as e:
         st.error(f"Error parsing resume: {str(e)}")
         return None
+# ---------------- FORMAT EDUCATION ----------------
 def format_education(edu):
     """Format one education entry as a single display line.

     Args:
         edu: Dict that may contain ``"degree"``, ``"field"``,
             ``"institution"``, ``"year"`` and ``"gpa"``. Values may be
             strings or numbers; missing, empty or whitespace-only values are
             omitted. A non-dict entry is rendered as its stripped string
             form (``""`` for ``None`` or other falsy values).

     Returns:
         For example ``"BSc in CS from MIT (2020) - GPA: 3.8"``; ``""`` when
         nothing is available.
     """
     if not isinstance(edu, dict):
         return str(edu).strip() if edu else ""

     def _clean(key):
         # Normalise to stripped text so numeric years/GPAs work and
         # whitespace-only values are dropped.
         value = edu.get(key)
         return str(value).strip() if value else ""

     templates = (
         ("degree", "{}"),
         ("field", "in {}"),
         ("institution", "from {}"),
         ("year", "({})"),
         ("gpa", "- GPA: {}"),
     )
     parts = []
     for key, template in templates:
         value = _clean(key)
         if value:
             parts.append(template.format(value))
     return " ".join(parts)
+# ---------------- MAIN APP ----------------
 def main():
     """Render the Streamlit page: collect the inputs, parse, show results.

     The API key is read from the ``GEMINI_API_KEY`` environment variable or,
     if unset, from a password input. Nothing is parsed until both a file
     and a key are available. Fields the model returned as JSON null are
     shown as empty instead of raising.
     """
     st.title("Resume Parser (PDF/DOCX/Image)")
     api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
     uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])
     if not (uploaded_file and api_key):
         return
     with st.spinner("Analyzing resume..."):
         result = parse_resume(uploaded_file, api_key)
     if not result:
         return

     def field(key):
         # ``dict.get`` returns None for a key present with JSON null, which
         # would break string concatenation below.
         value = result.get(key)
         return "" if value is None else str(value)

     st.subheader("Extracted Information")
     st.text_area("Summary", field("summary"), height=100)
     col1, col2, col3 = st.columns(3)
     col1.write("**Name:** " + field("name"))
     col2.write("**Email:** " + field("email"))
     col3.write("**Phone:** " + field("phone"))
     exp = result.get("total_years_experience") or 0
     # Show less than one year of experience in months.
     exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
     st.write("**Total Experience:**", exp_text)
     st.subheader("Education")
     for edu in result.get("education") or []:
         st.write("- " + format_education(edu))
     st.subheader("Work Experience")
     for w in result.get("work_experience") or []:
         if not isinstance(w, dict):
             continue
         dur = f" ({w.get('duration','')})" if w.get("duration") else ""
         st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")
     st.subheader("Skills")
     for s in result.get("skills") or []:
         # Skills are not guaranteed to be strings.
         st.write(f"- {s}")
     st.write("**LinkedIn:**", field("linkedin"))
 # Entry point: call main() when this module is executed as a script.
 if __name__ == "__main__":
+    main()