File size: 6,888 Bytes
0132466 e37763c 3772f08 8fc8191 0132466 3812a17 75e0714 76a67a2 0132466 76a67a2 3812a17 0132466 76a67a2 0132466 de06fc0 0132466 76a67a2 de06fc0 0132466 de06fc0 76a67a2 0132466 de06fc0 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 de06fc0 0132466 de06fc0 0132466 de06fc0 0132466 de06fc0 0132466 de06fc0 0132466 de06fc0 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 0132466 76a67a2 de06fc0 28843bf 0132466 76a67a2 0132466 0360d96 0132466 de06fc0 0132466 de06fc0 0132466 76a67a2 0132466 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# === app.py ===
import os
# os.environ["STREAMLIT_HOME"] = "/pp/.streamlit" # folder inside container with write permissions
import streamlit as st
import google.generativeai as genai
from PIL import Image
import fitz # PyMuPDF
from docx import Document
import pytesseract
import io
import json
from pathlib import Path
from datetime import datetime
import re
# ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF, with OCR fallback for pages lacking a text layer.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            raw PDF bytes.

    Returns:
        The text of every page joined with newlines, or ``""`` when the
        document cannot be read (the error is reported through ``st.error``).
    """
    try:
        # Read inside the try so a failing upload stream is reported the same
        # way as a malformed PDF instead of crashing the app.
        pdf_bytes = pdf_file.read()
        text_content = []
        # Context manager guarantees the document is closed even if rendering
        # or OCR raises part-way through.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_text = page.get_text()
                if not page_text.strip():
                    # No embedded text on this page: rasterize it and OCR the image.
                    pix = page.get_pixmap()
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    page_text = pytesseract.image_to_string(img)
                text_content.append(page_text)
        return "\n".join(text_content)
    except Exception as e:
        st.error(f"PDF extraction error: {str(e)}")
        return ""
def extract_text_from_docx(docx_file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure is shown through ``st.error`` and ``""`` is returned instead.
    """
    try:
        document = Document(docx_file)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts)
    except Exception as err:
        st.error(f"DOCX extraction error: {str(err)}")
        return ""
def extract_text_from_image(image_file):
    """Run OCR over an image file and return the recognized text.

    Any failure is shown through ``st.error`` and ``""`` is returned instead.
    """
    try:
        picture = Image.open(image_file)
        recognized = pytesseract.image_to_string(picture)
    except Exception as err:
        st.error(f"Image extraction error: {str(err)}")
        return ""
    return recognized
# ---------------- DATE / EXPERIENCE CALCULATION ----------------
def parse_date(date_str):
    """Parse a loosely formatted resume date into a ``datetime``.

    Recognizes "present" / "current" / "now" (any case, surrounding whitespace
    ignored), the month/year layouts listed in ``formats``, and, as a last
    resort, the first four-digit year (1900-2099) found in the text.

    Args:
        date_str: Date text such as ``"Jan 2020"``, ``"03/2019"`` or
            ``"Present"``.

    Returns:
        A ``datetime`` (missing month/day default to 1; the current time for
        "present"-style values), or ``None`` when the input is not a string or
        nothing can be parsed from it.
    """
    if not isinstance(date_str, str):
        return None
    # Strip before the keyword check so values like "Present " are recognized.
    date_str = date_str.strip()
    if date_str.lower() in ("present", "current", "now"):
        return datetime.now()
    formats = ("%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m")
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Layout does not match; try the next one.
            continue
    # Fallback for free-form text ("Summer 1998"): use the first plausible year.
    year_match = re.search(r"\b(?:19|20)\d{2}\b", date_str)
    if year_match:
        return datetime.strptime(year_match.group(), "%Y")
    return None
def calculate_experience(work_history):
    """Sum the years of experience across a list of job entries.

    Each job's ``"duration"`` is expected to read ``"<start> <sep> <end>"``,
    where the separator is a hyphen, an en/em dash, or the word "to". Entries
    that cannot be split into exactly two parts, or whose dates cannot be
    parsed by ``parse_date``, contribute nothing. Overlapping jobs are simply
    added together.

    Args:
        work_history: Iterable of dicts with an optional ``"duration"`` key.
            ``None`` and non-dict entries are tolerated and ignored.

    Returns:
        Total experience in years, rounded to one decimal place.
    """
    # A whitespace-delimited separator is tried first so hyphenated dates
    # ("2019-05 - 2020-06") are not cut apart; the bare-dash form is the
    # fallback for compact ranges such as "2019-2021" or "Jan 2020-Present".
    separators = (
        r"\s+(?:[-\u2013\u2014]|to)\s+",
        r"\s*[-\u2013\u2014]\s*",
    )
    total_exp = 0
    for job in work_history or []:
        duration = job.get("duration", "") if isinstance(job, dict) else ""
        if not duration or not isinstance(duration, str):
            continue
        for pattern in separators:
            parts = re.split(pattern, duration.strip(), flags=re.IGNORECASE)
            if len(parts) == 2:
                break
        else:
            # No separator produced a clean start/end pair.
            continue
        start, end = parse_date(parts[0]), parse_date(parts[1])
        if start and end:
            years = (end.year - start.year) + (end.month - start.month) / 12
            # Clamp reversed ranges to zero rather than subtracting experience.
            total_exp += max(0, years)
    return round(total_exp, 1)
# ---------------- RESUME PARSING ----------------
def _extract_json_object(response_text):
    """Decode the outermost ``{...}`` span found in *response_text*.

    Raises:
        ValueError: If the text holds no brace-delimited span or the span is
            not valid JSON (``json.JSONDecodeError`` subclasses ``ValueError``).
    """
    json_start = response_text.find("{")
    json_end = response_text.rfind("}")
    if json_start == -1 or json_end < json_start:
        raise ValueError("model response did not contain a JSON object")
    return json.loads(response_text[json_start:json_end + 1])


def parse_resume(file_uploaded, api_key):
    """Extract structured resume data from an uploaded file via Gemini.

    The file's text is pulled out according to its extension, sent to the
    model together with the extraction prompt, and the JSON object in the
    reply is decoded. A ``"total_years_experience"`` key computed by
    ``calculate_experience`` is added to the result.

    Args:
        file_uploaded: Uploaded file object with a ``name`` attribute; it is
            passed to the matching ``extract_text_from_*`` function.
        api_key: Gemini API key passed to ``genai.configure``.

    Returns:
        The decoded dict, or ``None`` when the file type is unsupported, no
        text is found, or the model call / JSON decoding fails (each case is
        reported through ``st.error``).
    """
    prompt = """Extract the following information from this resume:
1. Summarize in 100 words, focus on skills, experience, qualifications.
2. Full Name
3. Email
4. Phone
5. Education (degree, institution, year, field)
6. Work experience with exact duration (e.g., Jan 2020 - Present)
7. Skills
8. LinkedIn URL
Return as JSON:
{
"summary": "", "name": "", "email": "", "phone": "",
"education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
"work_experience": [{"company": "", "position": "", "duration": ""}],
"skills": [], "linkedin": ""
}"""
    # Extract text
    ext = Path(file_uploaded.name).suffix.lower()
    if ext == ".pdf":
        text_content = extract_text_from_pdf(file_uploaded)
    elif ext in [".docx", ".doc"]:
        text_content = extract_text_from_docx(file_uploaded)
    elif ext in [".jpg", ".jpeg", ".png"]:
        text_content = extract_text_from_image(file_uploaded)
    else:
        st.error(f"Unsupported file type: {ext}")
        return None
    if not text_content.strip():
        st.error("No text found in resume.")
        return None
    # Generate JSON from Gemini
    try:
        # Configure the client only once there is text to send, and inside the
        # try so configuration errors are reported like any other failure.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
        response_text = response.text
        st.text_area("Raw Response", response_text, height=200)  # Debugging
        result = _extract_json_object(response_text)
        # The model may emit null for the list; treat that as "no jobs" instead
        # of failing the whole parse.
        result["total_years_experience"] = calculate_experience(result.get("work_experience") or [])
        return result
    except Exception as e:
        st.error(f"Error parsing resume: {str(e)}")
        return None
# ---------------- FORMAT EDUCATION ----------------
def format_education(edu):
    """Build a one-line description of an education entry.

    Only the truthy fields among degree, field, institution, year and gpa are
    included, in that order, separated by single spaces.
    """
    renderers = (
        ("degree", lambda value: value),
        ("field", lambda value: f"in {value}"),
        ("institution", lambda value: f"from {value}"),
        ("year", lambda value: f"({value})"),
        ("gpa", lambda value: f"- GPA: {value}"),
    )
    pieces = [render(edu[key]) for key, render in renderers if edu.get(key)]
    return " ".join(pieces)
# ---------------- MAIN APP ----------------
def main():
    """Render the Streamlit page: collect inputs, parse the resume, show results.

    The API key comes from the ``GEMINI_API_KEY`` environment variable or,
    failing that, a password text input. Nothing is parsed until both a file
    and a key are available. Fields the model returned as null or omitted are
    displayed as empty instead of raising.
    """
    st.title("Resume Parser (PDF/DOCX/Image)")
    api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
    uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])
    if not (uploaded_file and api_key):
        return
    with st.spinner("Analyzing resume..."):
        result = parse_resume(uploaded_file, api_key)
    if not result:
        return

    st.subheader("Extracted Information")
    st.text_area("Summary", result.get("summary") or "", height=100)
    col1, col2, col3 = st.columns(3)
    # f-strings with an ``or ''`` fallback: the model may return JSON null,
    # and "text" + None would raise TypeError.
    col1.write(f"**Name:** {result.get('name') or ''}")
    col2.write(f"**Email:** {result.get('email') or ''}")
    col3.write(f"**Phone:** {result.get('phone') or ''}")

    exp = result.get("total_years_experience") or 0
    # Show sub-year totals in months so short careers do not read "0.x years".
    exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
    st.write("**Total Experience:**", exp_text)

    st.subheader("Education")
    for edu in result.get("education") or []:
        if isinstance(edu, dict):
            st.write("- " + format_education(edu))
        else:
            # Model returned a plain value instead of an object; show it as-is.
            st.write(f"- {edu}")

    st.subheader("Work Experience")
    for w in result.get("work_experience") or []:
        if not isinstance(w, dict):
            st.write(f"- {w}")
            continue
        dur = f" ({w.get('duration', '')})" if w.get("duration") else ""
        st.write(f"- {w.get('position') or ''} at {w.get('company') or ''}{dur}")

    st.subheader("Skills")
    for s in result.get("skills") or []:
        st.write(f"- {s}")

    st.write("**LinkedIn:**", result.get("linkedin") or "")
# Invoke main() only when this module is the entry point, not when it is imported.
if __name__ == "__main__":
    main()
|