File size: 6,888 Bytes
0132466
 
e37763c
3772f08
8fc8191
0132466
3812a17
75e0714
76a67a2
 
 
 
0132466
 
76a67a2
 
 
 
3812a17
0132466
76a67a2
0132466
de06fc0
0132466
76a67a2
de06fc0
0132466
 
 
 
 
 
 
 
de06fc0
76a67a2
0132466
de06fc0
76a67a2
 
 
 
0132466
76a67a2
0132466
76a67a2
 
0132466
 
 
 
 
 
 
 
 
76a67a2
 
0132466
76a67a2
 
0132466
76a67a2
 
 
0132466
76a67a2
0132466
76a67a2
0132466
76a67a2
0132466
76a67a2
 
 
0132466
76a67a2
0132466
76a67a2
 
0132466
76a67a2
 
0132466
 
 
 
 
76a67a2
0132466
de06fc0
 
0132466
de06fc0
 
0132466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de06fc0
0132466
 
 
de06fc0
0132466
 
de06fc0
0132466
 
 
 
 
 
 
 
de06fc0
0132466
 
76a67a2
0132466
76a67a2
 
0132466
76a67a2
 
0132466
 
 
76a67a2
0132466
76a67a2
0132466
76a67a2
0132466
76a67a2
de06fc0
28843bf
0132466
76a67a2
0132466
0360d96
0132466
de06fc0
 
0132466
de06fc0
 
0132466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76a67a2
 
0132466
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# === app.py ===

import os
# os.environ["STREAMLIT_HOME"] = "/pp/.streamlit"  # folder inside container with write permissions


import streamlit as st

import google.generativeai as genai
from PIL import Image
import fitz  # PyMuPDF
from docx import Document
import pytesseract
import io
import json
from pathlib import Path
from datetime import datetime
import re

# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, with OCR fallback for pages that have no text.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that holds the PDF.

    Returns:
        The text of every page joined with newlines, or ``""`` when extraction
        fails (the failure is reported through ``st.error``).
    """
    try:
        # Read inside the ``try`` so a failing upload stream is reported like
        # any other extraction error instead of escaping to the caller.
        pdf_bytes = pdf_file.read()
        text_content = []
        # The context manager closes the document even if a page raises.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text on this page: render it to PNG and OCR it.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""

def extract_text_from_docx(docx_file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure is shown through ``st.error`` and ``""`` is returned instead.
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        lines = []
        for para in paragraphs:
            lines.append(para.text)
        return "\n".join(lines)
    except Exception as exc:
        st.error(f"DOCX extraction error: {str(exc)}")
        return ""

def extract_text_from_image(image_file):
    """OCR an image file and return the recognised text.

    Any failure is shown through ``st.error`` and ``""`` is returned instead.
    """
    try:
        return pytesseract.image_to_string(Image.open(image_file))
    except Exception as exc:
        st.error(f"Image extraction error: {str(exc)}")
        return ""

# ---------------- DATE / EXPERIENCE CALCULATION ----------------
def parse_date(date_str):
    """Parse a loosely formatted date string into a ``datetime``.

    Recognised inputs, tried in order:
      1. The keywords "present", "current" or "now" (any case, surrounding
         whitespace ignored) -> the current local time.
      2. Year or month/year forms: ``2020``, ``Jan 2020``, ``January 2020``,
         ``01/2020``, ``01-2020``, ``2020/01``, ``2020-01``.
      3. Any other text containing a four-digit 19xx/20xx year -> 1 January
         of that year.

    Args:
        date_str: The text to parse. Non-string values are rejected.

    Returns:
        A naive ``datetime``, or ``None`` when nothing date-like is found.
    """
    if not isinstance(date_str, str):
        return None

    # Treat "." and "," as spaces so "Mar. 2020" and "March, 2020" reach the
    # month formats instead of degrading to the year-only fallback.
    cleaned = re.sub(r"[.,]", " ", date_str).strip()
    if not cleaned:
        return None

    # Strip before the keyword test so " Present " is recognised too.
    if cleaned.lower() in ("present", "current", "now"):
        return datetime.now()

    formats = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    # Last resort: pull a plausible year out of free text ("Summer 1998").
    year_match = re.search(r"\b(?:19|20)\d{2}\b", cleaned)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None

def calculate_experience(work_history):
    """Sum the years covered by the ``duration`` field of each job entry.

    Each duration is expected to look like ``"<start> - <end>"`` or
    ``"<start> to <end>"``; hyphens, en dashes and em dashes are all accepted
    as the separator. Entries that are not dicts, have no string duration, do
    not split into exactly two parts, or whose dates cannot be parsed are
    skipped. Negative spans count as zero. Overlapping jobs are each counted
    in full.

    Args:
        work_history: Iterable of dicts with an optional ``"duration"`` key,
            or ``None``.

    Returns:
        Total experience in years, rounded to one decimal place.
    """
    # Tried in order. The first pattern requires whitespace around the
    # separator so ISO-style ranges such as "2020-01 - 2021-06" are not broken
    # on the hyphens inside the dates; the second also accepts a bare dash
    # ("2019-2021").
    separator_patterns = (
        r"\s+(?:[-\u2013\u2014]|to)\s+",
        r"\s*[-\u2013\u2014]\s*|\s+to\s+",
    )

    total_exp = 0
    for job in work_history or []:
        if not isinstance(job, dict):
            continue
        duration = job.get("duration", "")
        if not duration or not isinstance(duration, str):
            continue

        parts = []
        for pattern in separator_patterns:
            parts = re.split(pattern, duration.strip(), flags=re.IGNORECASE)
            if len(parts) == 2:
                break
        if len(parts) != 2:
            continue

        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            total_exp += max(0, years)
    return round(total_exp, 1)

# ---------------- RESUME PARSING ----------------
def _extract_json_object(response_text):
    """Decode the outermost ``{...}`` span found in *response_text*.

    Raises:
        ValueError: if no brace-delimited span exists or the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    json_start = response_text.find("{")
    json_end = response_text.rfind("}") + 1
    # ``find`` returns -1 when there is no "{"; slicing with that would
    # silently produce garbage, so fail with a clear message instead.
    if json_start == -1 or json_end <= json_start:
        raise ValueError("model response did not contain a JSON object")
    return json.loads(response_text[json_start:json_end])


def parse_resume(file_uploaded, api_key, model_name="gemini-1.5-flash"):
    """Extract structured résumé data from an uploaded file using Gemini.

    The file's text is extracted according to its extension (PDF, DOCX/DOC or
    JPG/JPEG/PNG), sent to the model together with an extraction prompt, and
    the JSON object in the reply is decoded. A ``total_years_experience`` key
    computed by ``calculate_experience`` is added to the result.

    Args:
        file_uploaded: Uploaded file object exposing ``name`` and the
            file-like interface the extractors need.
        api_key: Gemini API key.
        model_name: Name of the Gemini model to query.

    Returns:
        The decoded dict, or ``None`` on any failure (unsupported type, no
        text, model or JSON error); failures are reported via ``st.error``.
    """
    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Extract text first so the model client is only set up when there is
    # something to send.
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        # NOTE(review): python-docx documents support for .docx only; legacy
        # .doc uploads will likely surface as a DOCX extraction error.
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    # Generate JSON from Gemini
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)  # Debugging

        result = _extract_json_object(response_text)

        # The model may emit ``null`` for the list; treat that as "no jobs".
        result["total_years_experience"] = calculate_experience(
            result.get("work_experience") or []
        )
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None

# ---------------- FORMAT EDUCATION ----------------
def format_education(edu):
    """Render one education entry as a single human-readable line.

    Only truthy fields contribute, in the fixed order degree, field,
    institution, year, GPA; the pieces are joined with single spaces.
    """
    # (key, template) pairs for every field that is wrapped in extra text.
    decorated_fields = (
        ("field", "in {}"),
        ("institution", "from {}"),
        ("year", "({})"),
        ("gpa", "- GPA: {}"),
    )

    # The degree is emitted verbatim, so it is handled outside the table.
    pieces = [edu["degree"]] if edu.get("degree") else []
    for key, template in decorated_fields:
        if edu.get(key):
            pieces.append(template.format(edu[key]))
    return " ".join(pieces)

# ---------------- MAIN APP ----------------
def main():
    """Render the Streamlit page: collect a key and a file, then show results.

    The API key comes from the ``GEMINI_API_KEY`` environment variable or,
    failing that, from a password input. Once a file and a key are available
    the résumé is parsed with ``parse_resume`` and its fields are displayed.
    Fields the model returned as ``null`` are shown as empty rather than
    raising.
    """
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    if not (uploaded_file and api_key):
        return

    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)

    if not result:
        return

    # ``result.get(key) or default`` is used throughout because the model can
    # return JSON ``null``, which ``dict.get(key, default)`` passes through.
    st.subheader("Extracted Information")
    st.text_area("Summary", result.get("summary") or "", height=100)

    col1, col2, col3 = st.columns(3)
    col1.write(f"**Name:** {result.get('name') or ''}")
    col2.write(f"**Email:** {result.get('email') or ''}")
    col3.write(f"**Phone:** {result.get('phone') or ''}")

    exp = result.get("total_years_experience") or 0
    # Under one year is easier to read in months.
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in result.get("education") or []:
        if isinstance(edu, dict):
            st.write("- " + format_education(edu))

    st.subheader("Work Experience")
    for w in result.get("work_experience") or []:
        if not isinstance(w, dict):
            continue
        dur = f" ({w.get('duration') or ''})" if w.get("duration") else ""
        st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")

    st.subheader("Skills")
    for s in result.get("skills") or []:
        st.write(f"- {s}")

    st.write("**LinkedIn:**", result.get("linkedin") or "")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()