# cv-buddy-backend/app/services/resume_parser.py
# (provenance: commit 366c43e, "Deploy cv-buddy backend", by Momal)
from __future__ import annotations
import io
from pathlib import Path
from typing import Dict, Any
import fitz # PyMuPDF
from docx import Document
from app.models.resume import ResumeData
from app.llm.factory import LLMFactory
class ResumeParser:
    """Parse PDF/DOCX resume uploads into structured ``ResumeData``.

    Text is extracted locally (PyMuPDF for PDF, python-docx for DOCX) and
    then structured by an LLM using the ``structure_resume.txt`` prompt.
    """

    # MIME content-type -> internal file-type tag for the formats we accept.
    SUPPORTED_TYPES: Dict[str, str] = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    }

    def __init__(self):
        # prompts/ sits at the project root, three levels up from
        # app/services/ — TODO confirm if the package layout changes.
        self.prompts_dir = Path(__file__).parent.parent.parent / "prompts"

    def is_supported(self, content_type: str) -> bool:
        """Return True if *content_type* is a MIME type this parser handles."""
        return content_type in self.SUPPORTED_TYPES

    def extract_text(self, file_bytes: bytes, content_type: str) -> str:
        """Extract plain text from *file_bytes*.

        Args:
            file_bytes: Raw uploaded file content.
            content_type: MIME type reported for the upload.

        Raises:
            ValueError: If the content type is unsupported, or if no text
                could be extracted from the document.
        """
        file_type = self.SUPPORTED_TYPES.get(content_type)
        if file_type == "pdf":
            return self._extract_pdf(file_bytes)
        if file_type == "docx":
            return self._extract_docx(file_bytes)
        raise ValueError(f"Unsupported content type: {content_type}")

    def _extract_pdf(self, file_bytes: bytes) -> str:
        """Concatenate the text of every PDF page; raise ValueError on failure."""
        try:
            # Context manager guarantees the document is closed even when a
            # page raises mid-extraction (the original leaked on that path).
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc).strip()
        except Exception as e:
            raise ValueError(f"Could not extract text from PDF: {e}") from e
        # Checked outside the try: the original raised this inside the try
        # and the blanket except re-wrapped it, doubling the message.
        if not text:
            raise ValueError("Could not extract text from PDF")
        return text

    def _extract_docx(self, file_bytes: bytes) -> str:
        """Concatenate non-empty DOCX paragraphs; raise ValueError on failure."""
        try:
            doc = Document(io.BytesIO(file_bytes))
            text = "\n".join(
                para.text for para in doc.paragraphs if para.text.strip()
            ).strip()
        except Exception as e:
            raise ValueError(f"Could not extract text from DOCX: {e}") from e
        # Raised outside the try to avoid re-wrapping (see _extract_pdf).
        if not text:
            raise ValueError("Could not extract text from DOCX")
        return text

    async def parse(self, file_bytes: bytes, content_type: str) -> ResumeData:
        """Extract text from the upload and structure it via the fast LLM.

        Raises:
            ValueError: Propagated from :meth:`extract_text`.
        """
        raw_text = self.extract_text(file_bytes, content_type)
        prompt_template = (self.prompts_dir / "structure_resume.txt").read_text(
            encoding="utf-8"
        )
        # str.replace (not str.format) so literal braces in the resume text
        # cannot break the template substitution.
        prompt = prompt_template.replace("{resume_text}", raw_text)
        # Example-shaped schema the LLM is asked to fill in.
        schema: Dict[str, Any] = {
            "contact": {"name": "", "email": "", "phone": "", "linkedin": "", "location": ""},
            "summary": "",
            "experience": [{"company": "", "title": "", "dates": "", "bullets": []}],
            "education": [{"school": "", "degree": "", "dates": ""}],
            "skills": [],
        }
        llm = LLMFactory.get_fast()
        data = await llm.complete_json(prompt, schema)
        # .get defaults keep us resilient to partially-filled LLM output.
        return ResumeData(
            contact=data.get("contact", {}),
            summary=data.get("summary", ""),
            experience=data.get("experience", []),
            education=data.get("education", []),
            skills=data.get("skills", []),
            raw_text=raw_text,
        )