# === app.py ===
"""Streamlit resume parser.

Extracts text from an uploaded PDF, DOCX or image resume, asks Gemini to turn
that text into structured JSON, and renders the result.
"""
import io
import json
import os
import re
from datetime import datetime
from pathlib import Path

# os.environ["STREAMLIT_HOME"] = "/pp/.streamlit"  # folder inside container with write permissions
import fitz  # PyMuPDF
import google.generativeai as genai
import pytesseract
import streamlit as st
from docx import Document
from PIL import Image

# Words that mean "this job has no end date yet".
_PRESENT_WORDS = frozenset({"present", "current", "now", "today", "ongoing"})

# strptime formats tried, in order, for one end of a duration.
_DATE_FORMATS = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")

# Last-resort fallback: any four-digit year from 1900-2099 inside the string.
_YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")

# A range separator surrounded by whitespace ("Jan 2020 - Present",
# "01-2020 to 03-2021"). Tried first so hyphens *inside* a date survive.
_SPACED_RANGE_RE = re.compile(r"\s+(?:[-\u2013\u2014]|to)\s+", re.IGNORECASE)

# A bare dash with optional whitespace ("2019-2021"). Used only as a fallback.
_TIGHT_RANGE_RE = re.compile(r"\s*[-\u2013\u2014]\s*")

_RESUME_PROMPT = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
    "summary": "",
    "name": "",
    "email": "",
    "phone": "",
    "education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
    "work_experience": [{"company": "", "position": "", "duration": ""}],
    "skills": [],
    "linkedin": ""
}"""


# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, with OCR fallback for scanned pages.

    Args:
        pdf_file: Binary file-like object positioned at the start of the PDF.

    Returns:
        The text of all pages joined with newlines, or "" if extraction
        fails (the error is shown in the Streamlit UI).
    """
    pdf_bytes = pdf_file.read()
    try:
        page_texts = []
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No text layer on this page: rasterize it and OCR it.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                page_texts.append(page_text)
        return "\n".join(page_texts)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""


def extract_text_from_docx(docx_file):
    """Extract paragraph text from a DOCX file.

    Args:
        docx_file: Path or binary file-like object accepted by ``Document``.

    Returns:
        All paragraph texts joined with newlines, or "" on failure (the
        error is shown in the Streamlit UI).
    """
    # NOTE: only body paragraphs are read; text inside tables is not included.
    try:
        doc = Document(docx_file)
        return "\n".join(p.text for p in doc.paragraphs)
    except Exception as e:
        st.error(f"DOCX extraction error: {str(e)}")
        return ""


def extract_text_from_image(image_file):
    """OCR an image file.

    Args:
        image_file: Path or binary file-like object accepted by ``Image.open``.

    Returns:
        The recognized text, or "" on failure (the error is shown in the
        Streamlit UI).
    """
    try:
        image = Image.open(image_file)
        return pytesseract.image_to_string(image)
    except Exception as e:
        st.error(f"Image extraction error: {str(e)}")
        return ""


# ---------------- DATE / EXPERIENCE CALCULATION ----------------
def parse_date(date_str):
    """Parse one end of a job duration into a ``datetime``.

    Args:
        date_str: Text such as "Jan 2020", "03/2021", "2019" or "Present".

    Returns:
        A naive ``datetime`` (the current time for "present"-style words,
        otherwise the first day of the parsed month/year), or None when the
        value is not a string or no date can be recognized.
    """
    if not isinstance(date_str, str):
        return None

    # Strip before the keyword check so "Present " is still recognized.
    cleaned = date_str.strip()
    if not cleaned:
        return None
    if cleaned.lower() in _PRESENT_WORDS:
        return datetime.now()

    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue

    # Unrecognized layout (e.g. "Sept. 2020"): fall back to the year alone.
    year_match = _YEAR_RE.search(cleaned)
    if year_match:
        return datetime(int(year_match.group()), 1, 1)
    return None


def calculate_experience(work_history):
    """Sum the length of every job in *work_history*, in years.

    Args:
        work_history: List of dicts with a "duration" string such as
            "Jan 2020 - Present". Entries that are not dicts, or whose
            duration is missing or unparseable, are skipped. None or a
            non-list value is treated as an empty history.

    Returns:
        Total years rounded to one decimal. Jobs are summed independently,
        so overlapping jobs are counted more than once.
    """
    total_exp = 0.0
    if not isinstance(work_history, (list, tuple)):
        return total_exp

    for job in work_history:
        if not isinstance(job, dict):
            continue
        duration = job.get("duration", "")
        if not duration or not isinstance(duration, str):
            continue

        # Prefer a whitespace-delimited separator so "01-2020 - 03-2021"
        # splits between the dates rather than inside them.
        parts = _SPACED_RANGE_RE.split(duration.strip())
        if len(parts) != 2:
            parts = _TIGHT_RANGE_RE.split(duration.strip())
        if len(parts) != 2:
            continue

        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            # A reversed range contributes nothing rather than a negative.
            total_exp += max(0, years)
    return round(total_exp, 1)


# ---------------- RESUME PARSING ----------------
def _extract_json_object(response_text):
    """Return the dict encoded by the outermost ``{...}`` in *response_text*.

    Raises:
        ValueError: If the text contains no JSON object, or the decoded
            value is not a dict. (``json.JSONDecodeError`` is a subclass.)
    """
    json_start = response_text.find("{")
    json_end = response_text.rfind("}")
    if json_start == -1 or json_end < json_start:
        raise ValueError("model response did not contain a JSON object")
    result = json.loads(response_text[json_start:json_end + 1])
    if not isinstance(result, dict):
        raise ValueError("model response JSON is not an object")
    return result


def parse_resume(file_uploaded, api_key):
    """Extract structured resume data from an uploaded file using Gemini.

    Args:
        file_uploaded: Uploaded file object exposing ``name`` and ``read()``.
        api_key: Gemini API key.

    Returns:
        A dict with the fields requested in the prompt plus
        "total_years_experience", or None when the file type is unsupported,
        no text could be extracted, or the model call/JSON decoding fails
        (each failure is reported through ``st.error``).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Extract text
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in (".docx", ".doc"):
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in (".jpg", ".jpeg", ".png"):
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    # Generate JSON from Gemini
    try:
        response = model.generate_content(
            f"{_RESUME_PROMPT}\n\nResume Text:\n{text_content}"
        )
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)  # Debugging

        # The model may wrap the JSON in prose or code fences.
        result = _extract_json_object(response_text)
        result["total_years_experience"] = calculate_experience(
            result.get("work_experience")
        )
        return result
    except Exception as e:
        # UI boundary: report any API/decoding failure instead of crashing.
        st.error(f"Error parsing resume: {str(e)}")
        return None


# ---------------- FORMAT EDUCATION ----------------
def format_education(edu):
    """Format one education entry as a single human-readable line.

    Args:
        edu: Dict with optional "degree", "field", "institution", "year"
            and "gpa" keys. A non-dict value is rendered with ``str``.

    Returns:
        e.g. "BSc in Physics from MIT (2020) - GPA: 3.9"; empty fields are
        omitted.
    """
    if not isinstance(edu, dict):
        return "" if edu is None else str(edu)

    parts = []
    if edu.get("degree"):
        parts.append(str(edu["degree"]))
    if edu.get("field"):
        parts.append(f"in {edu['field']}")
    if edu.get("institution"):
        parts.append(f"from {edu['institution']}")
    if edu.get("year"):
        parts.append(f"({edu['year']})")
    if edu.get("gpa"):
        parts.append(f"- GPA: {edu['gpa']}")
    return " ".join(parts)


# ---------------- MAIN APP ----------------
def _as_text(value):
    """Return *value* as a display string, mapping None (JSON null) to ""."""
    return "" if value is None else str(value)


def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def main():
    """Render the Streamlit UI: collect the key and file, then show results."""
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input(
        "Enter Gemini API Key", type="password"
    )
    uploaded_file = st.file_uploader(
        "Choose a resume file",
        type=["pdf", "docx", "doc", "jpg", "jpeg", "png"],
    )
    if not (uploaded_file and api_key):
        return

    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)
    if not result:
        return

    # Fields come from model-generated JSON, so any of them may be null or
    # have an unexpected type; normalize before formatting.
    st.subheader("Extracted Information")
    st.text_area("Summary", _as_text(result.get("summary")), height=100)

    col1, col2, col3 = st.columns(3)
    col1.write("**Name:** " + _as_text(result.get("name")))
    col2.write("**Email:** " + _as_text(result.get("email")))
    col3.write("**Phone:** " + _as_text(result.get("phone")))

    exp = result.get("total_years_experience", 0)
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in _as_list(result.get("education")):
        st.write("- " + format_education(edu))

    st.subheader("Work Experience")
    for w in _as_list(result.get("work_experience")):
        if not isinstance(w, dict):
            continue
        dur = f" ({w.get('duration','')})" if w.get("duration") else ""
        position = _as_text(w.get("position"))
        company = _as_text(w.get("company"))
        st.write(f"- {position} at {company}{dur}")

    st.subheader("Skills")
    for s in _as_list(result.get("skills")):
        st.write("- " + _as_text(s))

    st.write("**LinkedIn:**", _as_text(result.get("linkedin")))


if __name__ == "__main__":
    main()