Spaces:

bangaboy
/

pp

Sleeping

File size: 6,888 Bytes

# === app.py ===

import os
# os.environ["STREAMLIT_HOME"] = "/pp/.streamlit"  # folder inside container with write permissions


import streamlit as st

import google.generativeai as genai
from PIL import Image
import fitz  # PyMuPDF
from docx import Document
import pytesseract
import io
import json
from pathlib import Path
from datetime import datetime
import re

# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, with OCR fallback for pages without text.

    Args:
        pdf_file: Binary file-like object; its whole content is consumed
            with ``read()``.

    Returns:
        The text of every page joined with newlines, or ``""`` when the
        document cannot be processed (the error is shown via ``st.error``).
    """
    text_content = []
    pdf_bytes = pdf_file.read()
    try:
        # Open through a context manager so the document is closed even when
        # a page fails midway; the original never released it.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text on this page: render it to a PNG and
                    # run OCR on the rendered image instead.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        # Best-effort boundary: report in the UI and fall back to no text.
        st.error(f"PDF extraction error: {str(e)}")
        return ""

def extract_text_from_docx(docx_file):
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Any failure is reported through ``st.error`` and produces ``""``.
    """
    try:
        document = Document(docx_file)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as exc:
        st.error(f"DOCX extraction error: {str(exc)}")
        return ""

def extract_text_from_image(image_file):
    """Run OCR on an image file and return the recognised text.

    Any failure is reported through ``st.error`` and produces ``""``.
    """
    try:
        ocr_text = pytesseract.image_to_string(Image.open(image_file))
    except Exception as exc:
        st.error(f"Image extraction error: {str(exc)}")
        return ""
    else:
        return ocr_text

# ---------------- DATE / EXPERIENCE CALCULATION ----------------
def parse_date(date_str):
    """Parse a resume-style date string into a ``datetime``.

    Recognised inputs, in order of precedence:
      * the keywords "present", "current" or "now" (case-insensitive),
        which map to the current local time;
      * the explicit formats ``%Y``, ``%b %Y``, ``%B %Y``, ``%m/%Y``,
        ``%m-%Y``, ``%Y/%m`` and ``%Y-%m``;
      * any string containing a four-digit year 19xx or 20xx, which maps
        to 1 January of that year.

    Args:
        date_str: The text to parse. Non-string values are rejected.

    Returns:
        A naive ``datetime``, or ``None`` when nothing could be parsed.
    """
    if not isinstance(date_str, str):
        return None

    # Strip first so padded keywords such as " Present " are still matched.
    cleaned = date_str.strip()
    if not cleaned:
        return None
    if cleaned.lower() in ("present", "current", "now"):
        return datetime.now()

    formats = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next.
            continue

    # Last resort: pick out a bare year. 19xx is accepted as well as 20xx so
    # that older positions (e.g. "Sept 1998") are not silently dropped.
    year_match = re.search(r"\b(?:19|20)\d{2}\b", cleaned)
    if year_match:
        return datetime(int(year_match.group()), 1, 1)
    return None

def calculate_experience(work_history):
    """Sum the years of experience over a list of job entries.

    Each entry is expected to be a dict whose ``"duration"`` value is a
    range such as ``"Jan 2020 - Present"`` or ``"2019 to 2021"``. Entries
    that are not dicts, have no string duration, or whose duration cannot
    be split into exactly two parseable dates are skipped.

    Args:
        work_history: Iterable of job dicts, or ``None``.

    Returns:
        Total experience in years, rounded to one decimal place. Ranges are
        simply added up; overlapping jobs are not merged.
    """
    total_exp = 0
    for job in work_history or []:
        if not isinstance(job, dict):
            continue
        duration = job.get("duration")
        if not duration or not isinstance(duration, str):
            continue

        # Treat Unicode hyphens, en/em dashes and the minus sign as "-".
        duration = re.sub(r"[\u2010-\u2015\u2212]", "-", duration).strip()

        # Prefer a separator surrounded by whitespace so that dates which
        # themselves contain a hyphen ("2020-01 - 2021-05") stay intact.
        parts = re.split(r"\s+(?:-|to)\s+", duration, flags=re.IGNORECASE)
        if len(parts) != 2:
            # Fall back to splitting on any hyphen ("2019-2021").
            parts = re.split(r"\s*-\s*|\s+to\s+", duration, flags=re.IGNORECASE)
        if len(parts) != 2:
            continue

        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            # An end date before the start date contributes nothing.
            total_exp += max(0, years)
    return round(total_exp, 1)

# ---------------- RESUME PARSING ----------------
def parse_resume(file_uploaded, api_key):
    """Extract structured resume data from an uploaded file using Gemini.

    The file's text is extracted according to its extension, sent to the
    model together with a fixed prompt, and the JSON object found in the
    reply is decoded. A ``"total_years_experience"`` key computed from the
    ``"work_experience"`` entries is added to the result.

    Args:
        file_uploaded: Uploaded file object exposing ``name`` and usable by
            the extraction helpers.
        api_key: Gemini API key passed to ``genai.configure``.

    Returns:
        The decoded dict, or ``None`` when the file type is unsupported, no
        text could be extracted, or the model reply could not be parsed
        (each case is reported through ``st.error``).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Extract text
    # NOTE(review): ".doc" is routed to the DOCX extractor; legacy binary
    # .doc files are presumably not readable by it - confirm.
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    # Generate JSON from Gemini
    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)  # Debugging

        # Extract JSON: take everything from the first "{" to the last "}"
        # so surrounding prose or code fences in the reply are ignored.
        json_start = response_text.find("{")
        json_end = response_text.rfind("}") + 1
        if json_start == -1 or json_end <= json_start:
            # Fail with a readable message instead of a json.loads error on
            # an empty slice.
            raise ValueError("model response did not contain a JSON object")
        result = json.loads(response_text[json_start:json_end])

        # The model may return null or a non-list here; only pass dict
        # entries on so a malformed field does not discard the whole result.
        work_history = result.get("work_experience")
        if not isinstance(work_history, list):
            work_history = []
        work_history = [job for job in work_history if isinstance(job, dict)]

        result["total_years_experience"] = calculate_experience(work_history)
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None

# ---------------- FORMAT EDUCATION ----------------
def format_education(edu):
    """Build a one-line description of an education entry.

    Fields are emitted in the fixed order degree, field, institution, year,
    gpa; any field that is missing or falsy is left out. The pieces are
    joined with single spaces.
    """
    renderers = (
        ("degree", lambda value: value),
        ("field", lambda value: f"in {value}"),
        ("institution", lambda value: f"from {value}"),
        ("year", lambda value: f"({value})"),
        ("gpa", lambda value: f"- GPA: {value}"),
    )
    pieces = [render(edu[key]) for key, render in renderers if edu.get(key)]
    return " ".join(pieces)

# ---------------- MAIN APP ----------------
def main():
    """Render the Streamlit page: collect inputs, parse the resume, show it.

    The API key is taken from the ``GEMINI_API_KEY`` environment variable
    or, failing that, from a password text input. Once both a key and an
    uploaded file are available the resume is parsed and each extracted
    section is displayed. Fields the model returned as ``null`` are shown
    as empty rather than raising.
    """
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    if not (uploaded_file and api_key):
        return

    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)

    if not result:
        return

    st.subheader("Extracted Information")
    st.text_area("Summary", result.get("summary") or "", height=100)

    # `or ""` covers both a missing key and an explicit JSON null, which
    # would otherwise break string concatenation.
    col1, col2, col3 = st.columns(3)
    col1.write(f"**Name:** {result.get('name') or ''}")
    col2.write(f"**Email:** {result.get('email') or ''}")
    col3.write(f"**Phone:** {result.get('phone') or ''}")

    exp = result.get("total_years_experience") or 0
    # Show sub-year totals in months so short histories stay readable.
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in result.get("education") or []:
        if isinstance(edu, dict):
            st.write("- " + format_education(edu))

    st.subheader("Work Experience")
    for w in result.get("work_experience") or []:
        if not isinstance(w, dict):
            continue
        dur = f" ({w.get('duration','')})" if w.get("duration") else ""
        st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")

    st.subheader("Skills")
    for s in result.get("skills") or []:
        # f-string formatting tolerates non-string skill entries.
        st.write(f"- {s}")

    st.write("**LinkedIn:**", result.get("linkedin") or "")

# Build the page only when this file runs as the main module, so importing
# it does not trigger the UI.
if __name__ == "__main__":
    main()