|
|
|
|
|
|
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
|
|
|
|
import google.generativeai as genai |
|
|
from PIL import Image |
|
|
import fitz |
|
|
from docx import Document |
|
|
import pytesseract |
|
|
import io |
|
|
import json |
|
|
from pathlib import Path |
|
|
from datetime import datetime |
|
|
import re |
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF upload, with an OCR fallback for scanned pages.

    Args:
        pdf_file: A file-like object (e.g. Streamlit UploadedFile); its
            entire contents are read into memory.

    Returns:
        str: Text of all pages joined by newlines, or "" on failure
        (the error is reported via st.error rather than raised).
    """
    text_content = []
    pdf_bytes = pdf_file.read()
    try:
        # Context manager guarantees the document handle is released even
        # if extraction/OCR raises mid-loop (the original leaked `doc`).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text layer: rasterize the page and OCR it.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
|
|
|
|
|
def extract_text_from_docx(docx_file):
    """Return the plain text of a DOCX upload, one paragraph per line.

    Returns "" on failure; the error is shown via st.error.
    """
    try:
        document = Document(docx_file)
        lines = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(lines)
    except Exception as e:
        st.error(f"DOCX extraction error: {str(e)}")
        return ""
|
|
|
|
|
def extract_text_from_image(image_file):
    """Run OCR over an uploaded image and return the recognized text.

    Returns "" on failure; the error is shown via st.error.
    """
    try:
        return pytesseract.image_to_string(Image.open(image_file))
    except Exception as e:
        st.error(f"Image extraction error: {str(e)}")
        return ""
|
|
|
|
|
|
|
|
def parse_date(date_str):
    """Parse a loosely formatted resume date string into a datetime.

    Accepts "Present"/"Current"/"Now" (returns the current time), several
    common month/year formats, or falls back to any standalone 4-digit year.

    Args:
        date_str: Date text such as "Jan 2020", "03/2019" or "Present".

    Returns:
        datetime, or None if nothing parseable was found.
    """
    try:
        # Strip BEFORE the sentinel check so "  Present " still matches
        # (the original compared the unstripped string and missed it).
        date_str = date_str.strip()
        if date_str.lower() in ["present", "current", "now"]:
            return datetime.now()
        formats = ["%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m"]
        for fmt in formats:
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue
        # Last resort: pull out any standalone 4-digit year.  The original
        # pattern only matched 20xx, silently dropping 19xx dates.
        year_match = re.search(r"\b(19|20)\d{2}\b", date_str)
        if year_match:
            return datetime.strptime(year_match.group(), "%Y")
        return None
    except (AttributeError, TypeError, ValueError):
        # Non-string input or an unparseable value: signal "no date".
        return None
|
|
|
|
|
def calculate_experience(work_history):
    """Sum total years of work experience across a list of jobs.

    Args:
        work_history: List of dicts, each with an optional "duration"
            string such as "Jan 2020 - Present" or "2018 to 2020".

    Returns:
        float: Total experience in years, rounded to one decimal.
        Overlapping jobs are not merged — durations simply add up.
    """
    total_exp = 0
    for job in work_history:
        duration = job.get("duration", "")
        if not duration:
            continue
        # Split on hyphen, en-dash, em-dash, or the word "to".  The
        # original only handled ASCII "-", so durations written with a
        # typographic dash ("Jan 2020 – Present") silently contributed 0.
        parts = re.split(r"\s*[-\u2013\u2014]\s*|\s+to\s+", duration)
        if len(parts) != 2:
            # Ambiguous or unparseable range (e.g. extra dashes) — skip.
            continue
        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            # Clamp negative spans (end before start) to zero.
            total_exp += max(0, years)
    return round(total_exp, 1)
|
|
|
|
|
|
|
|
def parse_resume(file_uploaded, api_key):
    """Extract structured resume data via the Gemini API.

    Dispatches on the upload's extension to the matching text extractor,
    sends the text to gemini-1.5-flash with a fixed JSON-schema prompt,
    and parses the model's reply.

    Args:
        file_uploaded: Streamlit UploadedFile (.pdf/.docx/.doc/.jpg/.jpeg/.png).
        api_key: Gemini API key.

    Returns:
        dict with summary/name/email/phone/education/work_experience/
        skills/linkedin plus a computed "total_years_experience" float,
        or None on any failure (errors are shown via st.error).
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL

Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""

    # Route by file extension to the appropriate extractor.
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None

    if not text_content.strip():
        st.error("No text found in resume.")
        return None

    try:
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)

        # The model may wrap the JSON in prose/markdown; slice out the
        # outermost brace pair.  Guard against a reply with no JSON at
        # all: previously find() == -1 / rfind()+1 == 0 produced a
        # garbage slice and a misleading JSONDecodeError.
        json_start = response_text.find("{")
        json_end = response_text.rfind("}") + 1
        if json_start == -1 or json_end == 0:
            raise ValueError("No JSON object found in model response")
        result = json.loads(response_text[json_start:json_end])

        # Derive total experience from the extracted durations.
        result["total_years_experience"] = calculate_experience(result.get("work_experience", []))
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
|
|
|
|
|
|
|
|
def format_education(edu):
    """Render one education entry dict as a single display line.

    Joins whichever of degree / field / institution / year / gpa are
    present, e.g. "BSc in CS from MIT (2020) - GPA: 3.9".

    Args:
        edu: Dict with optional "degree", "field", "institution",
            "year" and "gpa" keys.

    Returns:
        str: The assembled line ("" when no fields are present).
    """
    # (key, template) pairs in display order; falsy/missing keys are skipped.
    templates = [
        ("degree", "{}"),
        ("field", "in {}"),
        ("institution", "from {}"),
        ("year", "({})"),
        ("gpa", "- GPA: {}"),
    ]
    pieces = [tpl.format(edu[key]) for key, tpl in templates if edu.get(key)]
    return " ".join(pieces)
|
|
|
|
|
|
|
|
def main():
    """Streamlit entry point: collect an API key and a resume upload,
    parse the resume, and render the extracted fields."""
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])

    # Nothing to do until both inputs are available.
    if not (uploaded_file and api_key):
        return

    with st.spinner("Analyzing resume..."):
        parsed = parse_resume(uploaded_file, api_key)
    if not parsed:
        return

    st.subheader("Extracted Information")
    st.text_area("Summary", parsed.get("summary", ""), height=100)

    name_col, email_col, phone_col = st.columns(3)
    name_col.write(f"**Name:** {parsed.get('name', '')}")
    email_col.write(f"**Email:** {parsed.get('email', '')}")
    phone_col.write(f"**Phone:** {parsed.get('phone', '')}")

    # Show sub-year experience in months for readability.
    years = parsed.get("total_years_experience", 0)
    if years >= 1:
        exp_text = f"{years:.1f} years"
    else:
        exp_text = f"{years*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for entry in parsed.get("education", []):
        st.write(f"- {format_education(entry)}")

    st.subheader("Work Experience")
    for job in parsed.get("work_experience", []):
        duration = job.get("duration")
        suffix = f" ({duration})" if duration else ""
        st.write(f"- {job.get('position','')} at {job.get('company','')}{suffix}")

    st.subheader("Skills")
    for skill in parsed.get("skills", []):
        st.write(f"- {skill}")

    st.write("**LinkedIn:**", parsed.get("linkedin", ""))
|
|
|
|
|
# Entry point when the script is executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":
    main()
|
|
|