# NOTE(review): the original file began with a "Spaces: Sleeping" Hugging Face
# Spaces status banner captured during scraping — not source code; removed.
| import requests | |
| import re | |
| import json | |
| from typing import List, Dict, Any | |
| from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle | |
| from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
| from reportlab.lib.pagesizes import A4 | |
| from reportlab.lib.units import inch | |
| from reportlab.lib.enums import TA_LEFT | |
| import google.generativeai as genai | |
| from dotenv import load_dotenv | |
| from pydantic import BaseModel | |
| from typing import List, Dict, Optional | |
| import pdfplumber | |
| import fitz # PyMuPDF | |
| import tempfile | |
| import os | |
| load_dotenv() | |
class JobCrawler:
    """Crawl a job-posting URL through the Jina.ai reader proxy and extract
    structured fields (title, company, requirements, ...) with regexes."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_markdown(self, url: str) -> str:
        """Fetch *url* rendered as markdown via https://r.jina.ai.

        Raises:
            Exception: wraps any network/HTTP failure with a readable message.
        """
        jina_url = f"https://r.jina.ai/{url}"
        try:
            response = self.session.get(jina_url, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"Failed to fetch job description: {str(e)}") from e

    def extract_text(self, markdown: str, patterns: List[str]) -> str:
        """Return the first match among *patterns*, or "Not Specified".

        Uses group(1) when the pattern has a capture group, otherwise the
        whole match; both are stripped of surrounding whitespace.
        """
        for pattern in patterns:
            match = re.search(pattern, markdown, re.MULTILINE | re.IGNORECASE)
            if match:
                # FIX (consistency): the original stripped only group(1);
                # strip the group(0) fallback too.
                text = match.group(1) if match.lastindex else match.group(0)
                return text.strip()
        return "Not Specified"

    def extract_list_items(self, markdown: str, section_name: str, max_items: int = 10) -> List[str]:
        """Extract up to *max_items* bullet or numbered items from the section
        whose heading matches *section_name* (a regex alternation)."""
        # BUGFIX: inside an f-string `{1,3}` is a replacement field that
        # evaluates the tuple (1, 3), producing the literal regex `#(1, 3)`
        # so the heading lookahead never matched.  Escape the braces so the
        # `{1,3}` quantifier survives into the pattern.
        section_pattern = rf'(?:{section_name})[:\s]*\n([\s\S]*?)(?=\n#{{1,3}}\s|\n\*\*[A-Z]|$)'
        section_match = re.search(section_pattern, markdown, re.IGNORECASE)
        if not section_match:
            return []
        section_text = section_match.group(1)
        items = []
        # Prefer bulleted items; fall back to numbered-list items.
        for pattern in [r'[-•*]\s*([^\n]+)', r'\d+\.\s*([^\n]+)']:
            matches = re.findall(pattern, section_text)
            if matches:
                items.extend([m.strip() for m in matches[:max_items]])
                break
        return items[:max_items]

    def crawl(self, url: str) -> dict:
        """Crawl a job posting URL and extract information into a dict."""
        markdown = self.fetch_markdown(url)
        return {
            'title': self.extract_text(markdown, [r'^#\s+(.+)$', r'\*\*Job Title:\*\*\s*(.+)']),
            'company': self.extract_text(markdown, [r'[Cc]ompany[:\s]+([^\n]+)', r'at\s+([A-Z][^\n,]+)']),
            'location': self.extract_text(markdown, [r'[Ll]ocation[:\s]+([^\n]+)', r'\b(Remote|Hybrid|On-?site)\b']),
            'salary': self.extract_text(markdown, [r'[Ss]alary[:\s]+([^\n]+)', r'\$[\d,]+-\$?[\d,]+']),
            'job_type': self.extract_text(markdown, [r'\b(Full[- ]?time|Part[- ]?time|Contract|Internship)\b']),
            'experience': self.extract_text(markdown, [r'(\d+)\+?\s*years?\s*(?:of\s*)?experience', r'\b(Entry[- ]?level|Mid[- ]?level|Senior|Junior)\b']),
            'requirements': self.extract_list_items(markdown, r'requirements?|qualifications?'),
            'responsibilities': self.extract_list_items(markdown, r'responsibilities?|duties|what you\'?ll do'),
            'skills': self.extract_list_items(markdown, r'skills?|technologies?|tools?', 15),
            'benefits': self.extract_list_items(markdown, r'benefits?|perks?|what we offer'),
            'raw_markdown': markdown,
            'source_url': url
        }

    def to_markdown(self, job_data: dict) -> str:
        """Convert a dict produced by crawl() back to a markdown summary."""
        md = [
            f"# {job_data['title']}\n",
            f"**Company:** {job_data['company']}  ",
            f"**Location:** {job_data['location']}  ",
            f"**Salary:** {job_data['salary']}  ",
            f"**Job Type:** {job_data['job_type']}  ",
            f"**Experience:** {job_data['experience']}\n",
            "\n---\n"
        ]
        if job_data['requirements']:
            md.append("\n## Requirements\n")
            md.extend([f"- {req}\n" for req in job_data['requirements']])
        if job_data['responsibilities']:
            md.append("\n## Responsibilities\n")
            md.extend([f"- {resp}\n" for resp in job_data['responsibilities']])
        if job_data['skills']:
            md.append("\n## Skills\n")
            md.extend([f"- {skill}\n" for skill in job_data['skills']])
        if job_data['benefits']:
            md.append("\n## Benefits\n")
            md.extend([f"- {benefit}\n" for benefit in job_data['benefits']])
        return "".join(md)
# use resume
load_dotenv()
# SECURITY FIX: the original shipped a hardcoded API key as the getenv
# fallback (a leaked credential).  Read the key from the environment only;
# the else-branch below already handles its absence via the local fallback.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    print("⚠️ No GEMINI_API_KEY found. Using local enhancement fallback.")
# === Local fallback ===
def local_enhance(text: str) -> list[str]:
    """Deterministic offline stand-in for Gemini enrichment.

    Wraps *text* (stripped and capitalized) into three generic,
    ATS-friendly resume bullets.
    """
    seed = text.strip().capitalize()
    templates = (
        "Led {} to measurable outcomes and improved efficiency.",
        "Implemented {} using scalable, modern frameworks.",
        "Optimized processes delivering consistent, ATS-friendly results.",
    )
    return [template.format(seed) for template in templates]
# === Gemini enrichment ===
def enrich_bullets(data):
    """Add an "enhanced" key (up to three short bullets) to every experience
    and project entry that has a description.

    Uses Gemini when GEMINI_API_KEY is configured, and local_enhance()
    otherwise or on any API failure.  Mutates and returns *data*.
    """
    items, mapping = [], []
    # Collect one combined description string per entry, remembering where
    # each came from so the model's answers can be written back.
    for section_name in ["experience", "projects"]:
        for i, section in enumerate(data.get(section_name, [])):
            text = " ".join(section.get("description", []))
            if text.strip():
                items.append(text)
                mapping.append((section_name, i))
    if not items:
        return data
    if not GEMINI_API_KEY:
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
        return data
    try:
        model = genai.GenerativeModel("gemini-2.0-flash")
        prompt = (
            "You are a professional resume writer optimizing for ATS systems.\n"
            "For each summary, generate exactly THREE short, single-line bullets (under 20 words).\n"
            "Each bullet should start with an action verb and highlight quantifiable results.\n"
            "Number them 1., 2., 3.\n\n"
        )
        combined = "\n".join([f"Item {i+1}: {text}" for i, text in enumerate(items)])
        response = model.generate_content(prompt + combined)
        enriched_blocks = response.text.strip().split("Item")
        for idx, block in enumerate(enriched_blocks):
            # Block 0 is any preamble before the first "Item" marker.
            if idx == 0 or not block.strip():
                continue
            # BUGFIX: the model may echo more "Item" blocks than we sent;
            # without this guard, mapping[idx - 1] raised IndexError.
            if idx - 1 >= len(mapping):
                break
            lines = [
                ln.split(". ", 1)[1].strip()
                for ln in block.splitlines()
                # BUGFIX: also require ". " so a line like "1.Foo" (no space
                # after the dot) cannot IndexError in the split above.
                if ln.strip().startswith(("1.", "2.", "3.")) and ". " in ln
            ]
            if not lines:
                lines = local_enhance(items[idx - 1])
            section, i = mapping[idx - 1]
            data[section][i]["enhanced"] = lines[:3]
        # BUGFIX: entries the model skipped entirely previously ended up
        # without an "enhanced" key; backfill them with the local fallback.
        for section, i in mapping:
            entry = data[section][i]
            if "enhanced" not in entry:
                entry["enhanced"] = local_enhance(" ".join(entry.get("description", [])))
    except Exception as e:
        # Best-effort by design: never let an API failure block PDF creation.
        print("⚠️ Gemini failed:", e)
        for section, i in mapping:
            data[section][i]["enhanced"] = local_enhance(" ".join(data[section][i].get("description", [])))
    return data
# === PDF Generator ===
def create_pdf(data, output_path):
    """Render the parsed/enriched resume *data* dict to a PDF at *output_path*.

    Expects the dict schema produced by classify_resume_with_gemini, with
    "enhanced" bullet lists added by enrich_bullets on experience/project
    entries.  Sections with no data are skipped entirely.
    """
    styles = getSampleStyleSheet()
    # Fonts & spacing — compact Times styles tuned for a one-page resume.
    normal = ParagraphStyle(
        "NormalCustom",
        parent=styles["Normal"],
        fontName="Times-Roman",
        fontSize=10.5,
        leading=13,
        spaceAfter=2,
    )
    indent = ParagraphStyle(
        "Indented",
        parent=normal,
        leftIndent=25,  # Indent for bullets and sub-lines
        spaceAfter=1.5,
    )
    heading = ParagraphStyle(
        "HeadingCustom",
        parent=styles["Heading2"],
        fontName="Times-Bold",
        fontSize=12.5,
        leading=14,
        spaceBefore=10,
        spaceAfter=4,
        alignment=TA_LEFT,
    )
    name_style = ParagraphStyle(
        "NameStyle",
        parent=styles["Title"],
        fontName="Times-Bold",
        fontSize=16,
        leading=18,
        spaceAfter=6,
    )
    # A4 page with 36pt (0.5 inch) margins on every side.
    doc = SimpleDocTemplate(
        output_path,
        pagesize=A4,
        rightMargin=36,
        leftMargin=36,
        topMargin=36,
        bottomMargin=36,
    )
    story = []
    # === Header ===
    story.append(Paragraph(f"<b>{data.get('name', '')}</b>", name_style))
    profile = data.get("profile", {})
    # filter(None, ...) drops missing (None/empty) contact fields before joining.
    contact = " | ".join(
        filter(
            None,
            [
                profile.get("location"),
                profile.get("phone"),
                profile.get("email"),
                profile.get("linkedin"),
                profile.get("github"),
            ],
        )
    )
    story.append(Paragraph(contact, normal))
    story.append(Spacer(1, 0.12 * inch))

    # Helper: aligned two-column row (Title left, Date right)
    def aligned_row(left_text, right_text):
        left = Paragraph(left_text, normal)
        right = Paragraph(right_text or "", normal)
        # Fixed column widths: wide left column for the title, narrow
        # right-aligned column for the date range.
        table = Table([[left, right]], colWidths=[4.8 * inch, 1.5 * inch])
        table.setStyle(
            TableStyle(
                [
                    ("VALIGN", (0, 0), (-1, -1), "TOP"),
                    ("ALIGN", (1, 0), (1, 0), "RIGHT"),
                    ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
                    ("TOPPADDING", (0, 0), (-1, -1), 0),
                ]
            )
        )
        return table

    # === Education ===
    if data.get("education"):
        story.append(Paragraph("EDUCATION", heading))
        for edu in data["education"]:
            # NOTE(review): assumes 'degree' and 'institution' keys are always
            # present (KeyError otherwise) — confirm against the parser schema.
            story.append(aligned_row(f"<b>{edu['degree']}</b> — {edu['institution']}", edu.get("dates")))
            if edu.get("specialization"):
                story.append(Paragraph(edu["specialization"], indent))
            if edu.get("gpa"):
                story.append(Paragraph(f"GPA: {edu['gpa']}", indent))
        story.append(Spacer(1, 5))
    # === Skills ===
    if data.get("skills"):
        story.append(Paragraph("SKILLS", heading))
        for skill in data["skills"]:
            story.append(Paragraph(f"<b>{skill['category']}:</b> {', '.join(skill['technologies'])}", indent))
        story.append(Spacer(1, 5))
    # === Experience ===
    if data.get("experience"):
        story.append(Paragraph("EXPERIENCE", heading))
        for exp in data["experience"]:
            story.append(aligned_row(f"<b>{exp['title']}</b> — {exp['company']} ({exp.get('location','')})", exp.get("dates")))
            if exp.get("position_details"):
                story.append(Paragraph(exp["position_details"], indent))
            # Bullets come from enrich_bullets ("enhanced"), not the raw description.
            for desc in exp.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    # === Projects ===
    if data.get("projects"):
        story.append(Paragraph("PROJECTS", heading))
        for proj in data["projects"]:
            story.append(aligned_row(f"<b>{proj['name']}</b>", proj.get("dates")))
            if proj.get("tech_stack"):
                story.append(Paragraph(f"Tech Stack: {proj['tech_stack']}", indent))
            for desc in proj.get("enhanced", []):
                story.append(Paragraph(f"• {desc}", indent))
        story.append(Spacer(1, 5))
    doc.build(story)
    print(f"✅ Resume PDF created successfully: {output_path}")
# === Main ===
def generate_pdf_resume(data: dict, output_path: str):
    """Enrich the resume bullets, then render the result to a PDF file."""
    enriched = enrich_bullets(data)
    create_pdf(enriched, output_path)
| # === Resume PDF Parser === | |
| # ------------------------------- | |
| # Pydantic Models | |
| # ------------------------------- | |
class Experience(BaseModel):
    """One work-experience entry parsed from a resume."""
    title: Optional[str] = None        # job title
    company: Optional[str] = None      # employer name
    location: Optional[str] = None     # city / remote marker
    dates: Optional[str] = None        # free-form date range string
    description: Optional[List[str]] = None  # raw bullet lines from the resume
class Education(BaseModel):
    """One education entry parsed from a resume."""
    degree: Optional[str] = None       # e.g. "B.S. Computer Science"
    institution: Optional[str] = None  # school / university name
    dates: Optional[str] = None        # free-form date range string
    gpa: Optional[str] = None          # kept as a string, not parsed to float
    details: Optional[List[str]] = None  # extra lines (honors, coursework, ...)
class SkillCategory(BaseModel):
    """A named group of skills, e.g. "Languages" -> ["Python", "Go"]."""
    category: Optional[str] = None
    technologies: Optional[List[str]] = None
class Project(BaseModel):
    """One project entry parsed from a resume."""
    name: Optional[str] = None
    description: Optional[List[str]] = None  # raw bullet lines from the resume
class Profile(BaseModel):
    """Contact details shown in the resume header."""
    location: Optional[str] = None
    phone: Optional[str] = None
    email: Optional[str] = None
    linkedin: Optional[str] = None
    github: Optional[str] = None
class ResumeData(BaseModel):
    """Top-level parsed-resume document matching the Gemini classifier schema."""
    name: Optional[str] = None
    profile: Optional[Profile] = None
    summary: Optional[str] = ""
    # NOTE: pydantic copies field defaults per instance, so the shared []
    # literals below are safe (unlike mutable defaults on plain classes).
    experience: Optional[List[Experience]] = []
    education: Optional[List[Education]] = []
    skills: Optional[List[SkillCategory]] = []
    projects: Optional[List[Project]] = []
    certifications: Optional[List[str]] = []
    publications: Optional[List[str]] = []
    languages: Optional[List[str]] = []
| # ------------------------------- | |
| # HELPER FUNCTIONS | |
| # ------------------------------- | |
def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *file_path*.

    Tries PyMuPDF (fitz) first; on any failure falls back to pdfplumber,
    appending onto whatever was already collected (same as the original).
    """
    pieces = []
    try:
        with fitz.open(file_path) as document:
            for page in document:
                pieces.append(page.get_text())
    except Exception:
        # Fallback parser; extract_text() may return None for empty pages.
        with pdfplumber.open(file_path) as pdf:
            pieces.extend(page.extract_text() or "" for page in pdf.pages)
    return "".join(pieces).strip()
def clean_json_output(output: str) -> str:
    """Strip markdown code fences (``` or ```json) that the model may wrap
    its JSON answer in, and trim surrounding whitespace.

    Note: the single substitution removes every fence occurrence, so the
    original's extra .replace("```", "") pass was dead code and is dropped.
    """
    return re.sub(r"```(?:json)?", "", output).strip()
def classify_resume_with_gemini(resume_text: str) -> dict:
    """Ask Gemini to parse *resume_text* into the structured resume schema.

    Returns:
        dict matching the JSON schema embedded in the prompt.

    Raises:
        ValueError: Gemini returned text that is not valid JSON.
        RuntimeError: any other Gemini API failure.
    """
    prompt = f"""
You are an expert resume parser.
Given the resume text below, extract and classify the information into this JSON format:
{{
"name": "string",
"profile": {{
"location": "string",
"phone": "string",
"email": "string",
"linkedin": "string",
"github": "string"
}},
"experience": [{{"title": "string","company": "string","location": "string","dates": "string","description": ["string"]}}],
"education": [{{"degree": "string","institution": "string","dates": "string","gpa": "string","details": ["string"]}}],
"skills": [{{"category": "string","technologies": ["string"]}}],
"projects": [{{"name": "string","description": ["string"]}}],
"certifications": ["string"],
"publications": ["string"],
"languages": ["string"]
}}
Resume Text:
\"\"\"{resume_text}\"\"\"
Return only valid JSON (no explanations or markdown).
"""
    try:
        model = genai.GenerativeModel("gemini-2.5-flash")
        response = model.generate_content(prompt)
        raw_output = response.text.strip()
        cleaned_output = clean_json_output(raw_output)
        return json.loads(cleaned_output)
    except json.JSONDecodeError as e:
        # BUGFIX: the original raised HTTPException, which is never imported
        # in this module (no fastapi import anywhere) and so NameError'd at
        # runtime.  Raise standard exceptions; a web layer can translate them.
        raise ValueError("Gemini returned invalid JSON output.") from e
    except Exception as e:
        raise RuntimeError(f"Gemini API Error: {str(e)}") from e