Spaces:

Momal
/

cv-buddy-backend

Running

App Files Files Community

cv-buddy-backend / app / services / resume_parser.py

Momal

Deploy cv-buddy backend

366c43e about 2 months ago

raw

history blame contribute delete

3.13 kB

	from __future__ import annotations
	import io
	from pathlib import Path
	from typing import Dict, Any
	import fitz # PyMuPDF
	from docx import Document
	from app.models.resume import ResumeData
	from app.llm.factory import LLMFactory


	class ResumeParser:
	    """Turn an uploaded resume file into a structured ``ResumeData`` object.

	    The work happens in two stages: plain text is pulled out of the PDF or
	    DOCX bytes, then that text is substituted into a prompt template and sent
	    to an LLM that is asked to return the resume as JSON.
	    """

	    # Accepted upload MIME types mapped to the short format name used for
	    # dispatch in ``extract_text``.
	    SUPPORTED_TYPES: Dict[str, str] = {
	        "application/pdf": "pdf",
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
	    }

	    def __init__(self):
	        """Record where the prompt templates live."""
	        # Three directory levels above this module's file, then "prompts".
	        self.prompts_dir = Path(__file__).parent.parent.parent / "prompts"

	    def is_supported(self, content_type: str) -> bool:
	        """Return True when ``content_type`` is a key of ``SUPPORTED_TYPES``."""
	        return content_type in self.SUPPORTED_TYPES

	    def extract_text(self, file_bytes: bytes, content_type: str) -> str:
	        """Extract the plain text of a resume.

	        Args:
	            file_bytes: Raw bytes of the uploaded file.
	            content_type: MIME type of the upload.

	        Returns:
	            The extracted text, stripped of leading/trailing whitespace.

	        Raises:
	            ValueError: If the content type is not supported, the file cannot
	                be read, or it contains no extractable text.
	        """
	        file_type = self.SUPPORTED_TYPES.get(content_type)
	        if file_type == "pdf":
	            return self._extract_pdf(file_bytes)
	        if file_type == "docx":
	            return self._extract_docx(file_bytes)
	        raise ValueError(f"Unsupported content type: {content_type}")

	    def _extract_pdf(self, file_bytes: bytes) -> str:
	        """Return the text of every page of a PDF, joined by newlines.

	        Raises:
	            ValueError: If the PDF cannot be opened or read, or if it yields
	                no text at all.
	        """
	        try:
	            # The context manager closes the document even when reading a
	            # page fails part-way through.
	            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	                text = "\n".join(page.get_text() for page in doc).strip()
	        except Exception as e:
	            # Any reader failure is reported as ValueError so callers have a
	            # single exception type to handle; the cause stays chained.
	            raise ValueError(f"Could not extract text from PDF: {e}") from e

	        # Checked outside the try block so this error is not caught above and
	        # re-wrapped with a duplicated message.
	        if not text:
	            raise ValueError("Could not extract text from PDF")
	        return text

	    def _extract_docx(self, file_bytes: bytes) -> str:
	        """Return the non-blank paragraphs of a DOCX file, joined by newlines.

	        Only ``doc.paragraphs`` is read.

	        Raises:
	            ValueError: If the DOCX cannot be opened or read, or if it yields
	                no text at all.
	        """
	        try:
	            doc = Document(io.BytesIO(file_bytes))
	            text = "\n".join(
	                para.text for para in doc.paragraphs if para.text.strip()
	            ).strip()
	        except Exception as e:
	            # Same conversion as for PDFs: one exception type for callers,
	            # with the original error kept as the cause.
	            raise ValueError(f"Could not extract text from DOCX: {e}") from e

	        # Checked outside the try block so this error is not caught above and
	        # re-wrapped with a duplicated message.
	        if not text:
	            raise ValueError("Could not extract text from DOCX")
	        return text

	    async def parse(self, file_bytes: bytes, content_type: str) -> ResumeData:
	        """Extract a resume's text and have the LLM structure it.

	        Args:
	            file_bytes: Raw bytes of the uploaded file.
	            content_type: MIME type of the upload.

	        Returns:
	            A ``ResumeData`` built from the LLM's JSON reply plus the raw
	            extracted text.

	        Raises:
	            ValueError: Propagated from ``extract_text``.
	        """
	        raw_text = self.extract_text(file_bytes, content_type)

	        # Explicit encoding so the template reads the same on every platform.
	        template_path = self.prompts_dir / "structure_resume.txt"
	        prompt_template = template_path.read_text(encoding="utf-8")
	        # Plain replace rather than str.format, so other braces in the
	        # template or in the resume text are left alone.
	        prompt = prompt_template.replace("{resume_text}", raw_text)

	        # Example-shaped description of the expected reply, handed to the LLM
	        # client together with the prompt.
	        schema: Dict[str, Any] = {
	            "contact": {"name": "", "email": "", "phone": "", "linkedin": "", "location": ""},
	            "summary": "",
	            "experience": [{"company": "", "title": "", "dates": "", "bullets": []}],
	            "education": [{"school": "", "degree": "", "dates": ""}],
	            "skills": [],
	        }

	        llm = LLMFactory.get_fast()
	        data = await llm.complete_json(prompt, schema)

	        # "or" rather than a .get() default: a key present with a null value
	        # is treated the same as a missing key.
	        return ResumeData(
	            contact=data.get("contact") or {},
	            summary=data.get("summary") or "",
	            experience=data.get("experience") or [],
	            education=data.get("education") or [],
	            skills=data.get("skills") or [],
	            raw_text=raw_text,
	        )