# pp / src / streamlit_app.py
# bangaboy's picture
# Update src/streamlit_app.py
# 3772f08 verified
# === app.py ===
import os
# os.environ["STREAMLIT_HOME"] = "/pp/.streamlit" # folder inside container with write permissions
import streamlit as st
import google.generativeai as genai
from PIL import Image
import fitz # PyMuPDF
from docx import Document
import pytesseract
import io
import json
from pathlib import Path
from datetime import datetime
import re
# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, with an OCR fallback for pages without text.

    Args:
        pdf_file: Binary file-like object (anything with ``read()``) holding
            the PDF data.

    Returns:
        The text of every page joined with newlines, or ``""`` when the
        document cannot be processed (the error is shown via ``st.error``).
    """
    pdf_bytes = pdf_file.read()
    try:
        # The context manager guarantees the document is closed even when a
        # page fails halfway through.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            text_content = []
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No extractable text on this page (e.g. a scan): render
                    # it to a PNG and run OCR on the image instead.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
def extract_text_from_docx(docx_file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure is reported through ``st.error`` and yields ``""``.
    """
    try:
        document = Document(docx_file)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as exc:
        st.error(f"DOCX extraction error: {str(exc)}")
        return ""
def extract_text_from_image(image_file):
    """Run OCR on an image file and return the recognised text.

    Any failure is reported through ``st.error`` and yields ``""``.
    """
    try:
        picture = Image.open(image_file)
        ocr_text = pytesseract.image_to_string(picture)
    except Exception as exc:
        st.error(f"Image extraction error: {str(exc)}")
        return ""
    return ocr_text
# ---------------- DATE / EXPERIENCE CALCULATION ----------------
def parse_date(date_str):
    """Parse a loosely formatted resume date into a ``datetime``.

    Args:
        date_str: Text such as ``"Jan 2020"``, ``"03/2019"``, ``"2021"`` or
            ``"Present"``. Surrounding whitespace and letter case of the
            "present" keywords are ignored.

    Returns:
        A naive ``datetime``: the current time for "present"/"current"/"now",
        otherwise the result of the first matching format (day 1; month 1
        when only a year is given). ``None`` when the input is not a string
        or no date can be recognised.
    """
    if not isinstance(date_str, str):
        return None
    # Strip before the keyword check so that " Present " is still recognised.
    date_str = date_str.strip()
    if date_str.lower() in ("present", "current", "now"):
        return datetime.now()
    formats = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Not this format; try the next one.
            continue
    # Last resort: pick a four-digit year (19xx or 20xx) out of free text
    # such as "Summer 2019".
    year_match = re.search(r"\b(?:19|20)\d{2}\b", date_str)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None
def calculate_experience(work_history):
    """Sum the length of every job in *work_history*, in years.

    Args:
        work_history: Iterable of dicts whose ``"duration"`` value looks like
            ``"Jan 2020 - Present"`` or ``"2018 to 2021"``. ``None`` is
            treated as an empty history. Entries that are not dicts, whose
            duration is missing or not a string, or whose duration cannot be
            split into exactly a start and an end, are skipped.

    Returns:
        Total years rounded to one decimal place. Every job is counted in
        full (overlaps are not merged) and negative spans count as zero.
    """
    total_exp = 0
    for job in work_history or []:
        if not isinstance(job, dict):
            continue
        duration = job.get("duration", "")
        if not duration or not isinstance(duration, str):
            continue
        duration = duration.strip()
        # Prefer separators surrounded by whitespace so that dates which
        # themselves contain a hyphen ("2020-01 - 2021-05") stay intact.
        # \u2013 / \u2014 are the en and em dash.
        parts = re.split(
            r"\s+(?:[-\u2013\u2014]|to)\s+", duration, flags=re.IGNORECASE
        )
        if len(parts) != 2:
            # Fall back to a bare dash: "2019-2021", "Jan 2020-Present".
            parts = re.split(r"\s*[-\u2013\u2014]\s*", duration)
        if len(parts) != 2:
            continue
        start = parse_date(parts[0].strip())
        end = parse_date(parts[1].strip())
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            total_exp += max(0, years)
    return round(total_exp, 1)
# ---------------- RESUME PARSING ----------------
def parse_resume(file_uploaded, api_key, model_name="gemini-1.5-flash"):
    """Extract structured data from an uploaded resume using Gemini.

    The file's text is pulled out according to its extension (PDF, DOCX/DOC
    or JPG/JPEG/PNG), sent to the model together with a JSON-producing
    prompt, and the JSON object found in the reply is decoded.

    Args:
        file_uploaded: Uploaded file object; must expose ``name`` and be
            readable by the matching ``extract_text_from_*`` helper.
        api_key: Gemini API key passed to ``genai.configure``.
        model_name: Name of the Gemini model to use.

    Returns:
        The decoded dict with an added ``"total_years_experience"`` entry,
        or ``None`` on any failure (the reason is shown via ``st.error``).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)
    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL
Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""
    # Extract text
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None
    if not text_content.strip():
        st.error("No text found in resume.")
        return None
    # Generate JSON from Gemini
    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)  # Debugging
        # The reply may wrap the JSON in prose or code fences, so keep only
        # the span from the first "{" to the last "}".
        json_start = response_text.find("{")
        json_end = response_text.rfind("}") + 1
        if json_start == -1 or json_end <= json_start:
            st.error("Error parsing resume: no JSON object found in the model response.")
            return None
        result = json.loads(response_text[json_start:json_end])
        if not isinstance(result, dict):
            st.error("Error parsing resume: model response is not a JSON object.")
            return None
        # "or []" also covers an explicit null for work_experience.
        result["total_years_experience"] = calculate_experience(
            result.get("work_experience") or []
        )
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
# ---------------- FORMAT EDUCATION ----------------
def format_education(edu):
    """Render one education entry as a single readable line.

    Args:
        edu: Dict with optional ``degree``, ``field``, ``institution``,
            ``year`` and ``gpa`` keys; missing or empty values are left out.
            An entry that is not a dict (for example a plain string) is
            returned in its string form instead of raising.

    Returns:
        For example ``"BSc in CS from MIT (2020) - GPA: 3.9"``; ``""`` when
        nothing is set.
    """
    if not isinstance(edu, dict):
        return str(edu) if edu else ""
    # (key, template) pairs in display order; "{}" receives the value, so
    # non-string values such as an integer year are rendered safely.
    templates = (
        ("degree", "{}"),
        ("field", "in {}"),
        ("institution", "from {}"),
        ("year", "({})"),
        ("gpa", "- GPA: {}"),
    )
    return " ".join(
        template.format(edu[key]) for key, template in templates if edu.get(key)
    )
# ---------------- MAIN APP ----------------
def main():
    """Streamlit entry point: upload a resume, parse it and show the result.

    The API key comes from the ``GEMINI_API_KEY`` environment variable or,
    failing that, from a password text input. Nothing is parsed until both a
    file and a key are available.
    """
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    # NOTE(review): "doc" is accepted here and routed to the DOCX extractor by
    # parse_resume; confirm that legacy .doc files are actually readable.
    uploaded_file = st.file_uploader(
        "Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"]
    )
    if not (uploaded_file and api_key):
        return
    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)
    if not result:
        return

    # The parsed values come from model-generated JSON, so any field may be
    # null or of an unexpected type; "or" defaults plus f-strings keep the
    # page rendering instead of raising TypeError on str + None.
    st.subheader("Extracted Information")
    st.text_area("Summary", result.get("summary") or "", height=100)
    col1, col2, col3 = st.columns(3)
    col1.write(f"**Name:** {result.get('name') or ''}")
    col2.write(f"**Email:** {result.get('email') or ''}")
    col3.write(f"**Phone:** {result.get('phone') or ''}")

    exp = result.get("total_years_experience") or 0
    # Under one year is shown in months.
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in result.get("education") or []:
        st.write(f"- {format_education(edu)}")

    st.subheader("Work Experience")
    for w in result.get("work_experience") or []:
        if not isinstance(w, dict):
            continue
        dur = f" ({w.get('duration','')})" if w.get("duration") else ""
        st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")

    st.subheader("Skills")
    for s in result.get("skills") or []:
        st.write(f"- {s}")

    st.write("**LinkedIn:**", result.get("linkedin") or "")


if __name__ == "__main__":
    main()