| """CV parser — reads PDF/DOCX/TXT and returns a structured research profile.""" |
|
|
| from __future__ import annotations |
|
|
| from pathlib import Path |
| from typing import TypedDict |
|
|
| from agent.base_service import BaseLLMService |
| from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT |
| from agent.utils import parse_json |
|
|
|
|
class CVProfile(TypedDict, total=False):
    """Structured research profile extracted from a CV.

    Every key is optional (``total=False``), so a profile may carry any
    subset of the fields below. The nested key names mentioned in the
    comments are the ones ``CVParser.summarize`` reads; entries may carry
    additional keys.
    """

    # Candidate's name.
    name: str
    # Contact details; ``email`` is the key read by the summary.
    contact: dict
    # Free-text profile summary.
    summary: str
    # Entries read with: degree, field, institution, year, thesis_topic.
    education: list[dict]
    # Entries read with: title, institution, dates.
    experience: list[dict]
    # Research interest phrases.
    research_interests: list[str]
    # Entries read with: title, venue, year.
    publications: list[dict]
    # Skill groups read with: programming, tools, lab_techniques.
    skills: dict
    # Award descriptions.
    awards: list[str]
    # Entries read with: language, level.
    languages: list[dict]
    # Referee entries (not used by the summary).
    references: list[dict]
|
|
|
|
|
|
|
|
class CVParser(BaseLLMService):
    """Parse CV files (.pdf, .docx, .txt) into structured ``CVProfile`` dicts.

    Text is extracted locally (``extract_raw_text``), sent to the LLM through
    the inherited ``_generate`` call, and the reply is decoded with
    ``parse_json``. ``summarize`` renders a profile back into compact text.
    """

    _SYSTEM = CV_PARSER_SYSTEM

    # Maximum number of CV characters placed into the prompt; anything beyond
    # this is dropped before the LLM call.
    _MAX_CV_CHARS = 8000

    def parse(self, cv_path: str | Path) -> CVProfile:
        """Parse a CV file and return a structured CVProfile.

        ``_generate`` is used rather than ``_generate_json`` so that, per the
        original design note, LLM/network errors surface directly to the
        caller rather than silently producing an empty profile.

        Args:
            cv_path: Path to a .pdf, .docx, or .txt CV file.

        Returns:
            The decoded profile. When the LLM reply does not decode to a JSON
            object, a fallback profile is returned with ``name`` set to
            ``"Unknown"`` and ``summary`` holding the first 500 characters of
            the raw reply.

        Raises:
            FileNotFoundError: If ``cv_path`` does not exist.
            ValueError: If the format is unsupported or no text could be
                extracted from the file.
        """
        raw_text = self.extract_raw_text(cv_path)
        if not raw_text.strip():
            raise ValueError("Could not extract any text from the CV file.")

        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[: self._MAX_CV_CHARS])
        raw_json = self._generate(prompt, json_mode=True)
        result = parse_json(raw_json)
        # A reply that decodes to a list/str/number is not a profile either;
        # returning it would break every ``profile.get(...)`` downstream.
        if not isinstance(result, dict):
            return {"name": "Unknown", "summary": (raw_json or "")[:500]}
        return result

    @staticmethod
    def _text_entries(value: object, limit: int | None = None) -> list[str]:
        """Coerce a loosely-typed JSON value into at most ``limit`` strings."""
        if isinstance(value, str):
            # A bare string is one entry, not a sequence of characters.
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return []
        entries = [
            str(item) for item in value
            if item is not None and str(item).strip()
        ]
        return entries[:limit]

    @staticmethod
    def _dict_entries(value: object, limit: int | None = None) -> list[dict]:
        """Return at most ``limit`` dict entries from a loosely-typed value."""
        if isinstance(value, dict):
            # A single object where a list of objects was expected.
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return []
        return [item for item in value if isinstance(item, dict)][:limit]

    def summarize(self, profile: CVProfile) -> str:
        """Build a compact text summary of a CVProfile for use in LLM prompts.

        The profile usually originates from LLM output, so its fields are
        treated as untrusted: entries of the wrong type are skipped and list
        items are stringified instead of raising. Well-formed profiles yield
        the same text as before.

        Args:
            profile: The profile to summarize; any subset of keys may be set.

        Returns:
            Newline-joined summary lines (empty string for an empty profile).
        """
        lines: list[str] = []

        if profile.get("name"):
            lines.append(f"Name: {profile['name']}")

        contact = profile.get("contact")
        if isinstance(contact, dict) and contact.get("email"):
            lines.append(f"Email: {contact['email']}")

        if profile.get("summary"):
            lines.append(f"Summary: {profile['summary']}")

        research = self._text_entries(profile.get("research_interests"), 10)
        if research:
            lines.append(f"Research interests: {', '.join(research)}")

        for e in self._dict_entries(profile.get("education"), 3):
            thesis = f" — Thesis: {e['thesis_topic']}" if e.get("thesis_topic") else ""
            lines.append(
                f"Education: {e.get('degree', '')} in {e.get('field', '')} "
                f"from {e.get('institution', '')} ({e.get('year', '')}){thesis}"
            )

        pubs = self._dict_entries(profile.get("publications"))
        if pubs:
            # The header reports the full count; only the first five are listed.
            lines.append(f"Publications ({len(pubs)}):")
            for p in pubs[:5]:
                lines.append(
                    f" - \"{p.get('title', '')}\" — {p.get('venue', '')} {p.get('year', '')}"
                )

        for e in self._dict_entries(profile.get("experience"), 4):
            lines.append(
                f"Experience: {e.get('title', '')} at {e.get('institution', '')} "
                f"({e.get('dates', '')})"
            )

        skills = profile.get("skills")
        if not isinstance(skills, dict):
            skills = {}
        all_skills = (
            self._text_entries(skills.get("programming"))
            + self._text_entries(skills.get("tools"))
        )
        if all_skills:
            lines.append(f"Technical skills: {', '.join(all_skills[:20])}")
        lab = self._text_entries(skills.get("lab_techniques"), 10)
        if lab:
            lines.append(f"Lab techniques: {', '.join(lab)}")

        awards = self._text_entries(profile.get("awards"), 5)
        if awards:
            lines.append(f"Awards: {'; '.join(awards)}")

        langs = self._dict_entries(profile.get("languages"))
        if langs:
            lines.append(
                "Languages: " + ", ".join(
                    f"{la.get('language', '')} ({la.get('level', '')})" for la in langs
                )
            )

        return "\n".join(lines)

    @staticmethod
    def extract_raw_text(cv_path: str | Path) -> str:
        """Extract raw text from a CV file (.pdf, .docx, .txt).

        Args:
            cv_path: Path to the CV file; the format is chosen from the
                lower-cased file suffix.

        Returns:
            The extracted text (possibly empty).

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is unsupported, or a ``.doc`` file is
                not something python-docx can open.
        """
        path = Path(cv_path)
        if not path.exists():
            raise FileNotFoundError(f"CV file not found: {path}")

        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return CVParser._from_pdf(path)
        if suffix == ".docx":
            return CVParser._from_docx(path)
        if suffix == ".doc":
            # Still attempted, because a .docx saved under a .doc name opens
            # fine. A legacy binary .doc is not a zip package, and python-docx
            # reports that as an opaque "Package not found" error; translate
            # it into an actionable message.
            from docx.opc.exceptions import PackageNotFoundError

            try:
                return CVParser._from_docx(path)
            except PackageNotFoundError as exc:
                raise ValueError(
                    f"Could not read '{path.name}': legacy binary .doc files are "
                    "not supported. Re-save the CV as .docx, .pdf, or .txt."
                ) from exc
        if suffix == ".txt":
            return path.read_text(encoding="utf-8", errors="replace")
        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

    @staticmethod
    def _from_pdf(path: Path) -> str:
        """Return the text of every page that yields any, joined by newlines."""
        import pdfplumber

        pages: list[str] = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # Pages without extractable text yield nothing and are skipped.
                if text:
                    pages.append(text)
        return "\n".join(pages)

    @staticmethod
    def _from_docx(path: Path) -> str:
        """Return non-blank paragraph text followed by table-cell text.

        All body paragraphs come first, then the cells of each table in
        row-major order, one entry per line.
        """
        from docx import Document

        doc = Document(str(path))
        chunks = [p.text for p in doc.paragraphs if p.text.strip()]
        for table in doc.tables:
            # python-docx yields a merged cell once per grid position it
            # spans, so remember the underlying <w:tc> elements to avoid
            # emitting the same text repeatedly.
            seen = set()
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if tc in seen:
                        continue
                    seen.add(tc)
                    text = cell.text.strip()
                    if text:
                        chunks.append(text)
        return "\n".join(chunks)
|
|