Spaces:
Running
Running
| from __future__ import annotations | |
| import io | |
| from pathlib import Path | |
| from typing import Dict, Any | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| from app.models.resume import ResumeData | |
| from app.llm.factory import LLMFactory | |
class ResumeParser:
    """Extract raw text from an uploaded resume and structure it via an LLM.

    Supported uploads are identified by MIME type (see ``SUPPORTED_TYPES``).
    Every extraction failure is reported as a ``ValueError``.
    """

    # Maps an accepted MIME content type to the internal file-type tag that
    # ``extract_text`` dispatches on.
    SUPPORTED_TYPES: Dict[str, str] = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    }

    def __init__(self):
        """Locate the ``prompts`` directory relative to this source file."""
        self.prompts_dir = Path(__file__).parent.parent.parent / "prompts"

    def is_supported(self, content_type: str) -> bool:
        """Return True when *content_type* is a key of ``SUPPORTED_TYPES``."""
        return content_type in self.SUPPORTED_TYPES

    def extract_text(self, file_bytes: bytes, content_type: str) -> str:
        """Return the plain text contained in an uploaded document.

        Args:
            file_bytes: Raw bytes of the uploaded file.
            content_type: MIME type of the upload.

        Raises:
            ValueError: If *content_type* is not supported, or if no text
                could be extracted from the document.
        """
        file_type = self.SUPPORTED_TYPES.get(content_type)
        if file_type == "pdf":
            return self._extract_pdf(file_bytes)
        if file_type == "docx":
            return self._extract_docx(file_bytes)
        raise ValueError(f"Unsupported content type: {content_type}")

    @staticmethod
    def _finalize_text(parts: list[str], label: str) -> str:
        """Join *parts* with newlines, strip, and reject an empty result.

        Args:
            parts: Text fragments in document order.
            label: Format name used in the error message (e.g. ``"PDF"``).

        Raises:
            ValueError: If the joined text is empty after stripping.
        """
        text = "\n".join(parts).strip()
        if not text:
            raise ValueError(f"Could not extract text from {label}")
        return text

    def _extract_pdf(self, file_bytes: bytes) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Raises:
            ValueError: If the PDF cannot be opened or read, or if it
                contains no extractable text.
        """
        try:
            # The context manager closes the document even when a page
            # fails to render its text.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                pages = [page.get_text() for page in doc]
        except Exception as e:
            raise ValueError(f"Could not extract text from PDF: {e}") from e
        # The emptiness check lives outside the try block so its ValueError
        # is not caught above and re-wrapped with a duplicated message.
        return self._finalize_text(pages, "PDF")

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Return the non-blank paragraphs of a DOCX file, joined by newlines.

        Only ``doc.paragraphs`` is read.

        Raises:
            ValueError: If the file cannot be opened or read, or if it
                contains no non-blank paragraph text.
        """
        try:
            doc = Document(io.BytesIO(file_bytes))
            paragraphs = [
                para.text for para in doc.paragraphs if para.text.strip()
            ]
        except Exception as e:
            raise ValueError(f"Could not extract text from DOCX: {e}") from e
        return self._finalize_text(paragraphs, "DOCX")

    async def parse(self, file_bytes: bytes, content_type: str) -> ResumeData:
        """Extract text from a resume and have the LLM structure it.

        The prompt is read from ``structure_resume.txt`` in ``prompts_dir``
        and its ``{resume_text}`` placeholder is replaced by the extracted
        text.

        Args:
            file_bytes: Raw bytes of the uploaded file.
            content_type: MIME type of the upload.

        Returns:
            A ``ResumeData`` built from the LLM response, with ``raw_text``
            set to the extracted text. Keys missing from the response fall
            back to empty values.

        Raises:
            ValueError: Propagated from ``extract_text``.
        """
        raw_text = self.extract_text(file_bytes, content_type)
        # Explicit encoding: the platform default is locale-dependent.
        prompt_template = (self.prompts_dir / "structure_resume.txt").read_text(
            encoding="utf-8"
        )
        prompt = prompt_template.replace("{resume_text}", raw_text)
        # Example-shaped structure passed to the LLM alongside the prompt.
        schema: Dict[str, Any] = {
            "contact": {"name": "", "email": "", "phone": "", "linkedin": "", "location": ""},
            "summary": "",
            "experience": [{"company": "", "title": "", "dates": "", "bullets": []}],
            "education": [{"school": "", "degree": "", "dates": ""}],
            "skills": [],
        }
        llm = LLMFactory.get_fast()
        data = await llm.complete_json(prompt, schema)
        return ResumeData(
            contact=data.get("contact", {}),
            summary=data.get("summary", ""),
            experience=data.get("experience", []),
            education=data.get("education", []),
            skills=data.get("skills", []),
            raw_text=raw_text,
        )