# PhDScout / agent / cv / parser.py
# HipFil98's picture
# refactor: reorganize agent/ into functional subfolders + split prompts
# a77e42f
"""CV parser — reads PDF/DOCX/TXT and returns a structured research profile."""
from __future__ import annotations
from pathlib import Path
from typing import TypedDict
from agent.base_service import BaseLLMService
from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT
from agent.utils import parse_json
class CVProfile(TypedDict, total=False):
    """Structured research profile extracted from a CV.

    Declared with ``total=False``, so every key is optional and consumers
    must tolerate missing entries. The per-field comments list the keys each
    nested dict is expected to carry; they are not enforced by the type.
    """

    name: str
    # Expected keys: email, phone, linkedin, github, website.
    contact: dict
    summary: str
    # Each entry: degree, institution, field, year, thesis_topic.
    education: list[dict]
    # Each entry: title, institution, dates, description.
    experience: list[dict]
    research_interests: list[str]
    # Each entry: title, venue, year, authors.
    publications: list[dict]
    # Expected keys: programming, tools, languages, lab_techniques.
    skills: dict
    awards: list[str]
    # Each entry: language, level.
    languages: list[dict]
    references: list[dict]
class CVParser(BaseLLMService):
    """Parses CV files into structured CVProfile dicts using an LLM."""

    _SYSTEM = CV_PARSER_SYSTEM
    # Maximum number of CV characters embedded in the parsing prompt.
    _MAX_CV_CHARS = 8000
    # Length of the raw-reply excerpt kept when the reply is not a JSON object.
    _FALLBACK_SUMMARY_CHARS = 500

    def parse(self, cv_path: str | Path) -> CVProfile:
        """Parse a CV file and return a structured CVProfile.

        Note: uses ``_generate`` (not ``_generate_json``) so that LLM/network
        errors surface directly to the caller rather than silently returning
        an empty profile.

        Args:
            cv_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` CV file.

        Returns:
            The dict decoded from the model reply. If the reply does not
            decode to a dict, a placeholder profile is returned with ``name``
            set to ``"Unknown"`` and ``summary`` set to the start of the raw
            reply.

        Raises:
            FileNotFoundError: If ``cv_path`` does not exist.
            ValueError: If the format is unsupported or no text could be
                extracted from the file.
        """
        raw_text = self.extract_raw_text(cv_path)
        if not raw_text.strip():
            raise ValueError("Could not extract any text from the CV file.")

        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[: self._MAX_CV_CHARS])
        raw_json = self._generate(prompt, json_mode=True)
        result = parse_json(raw_json)
        # Anything other than a JSON object (None, a list, a bare string...)
        # cannot serve as a profile: ``summarize`` relies on ``.get``.
        if not isinstance(result, dict):
            return {
                "name": "Unknown",
                "summary": raw_json[: self._FALLBACK_SUMMARY_CHARS],
            }
        return result

    def summarize(self, profile: CVProfile) -> str:
        """Build a compact text summary of a CVProfile for use in LLM prompts.

        The profile comes from model output, so fields are normalised before
        use: a scalar where a list is expected is wrapped, list items of the
        wrong type are skipped, and JSON nulls render as empty text instead
        of ``"None"``. Well-formed profiles produce the same text as before.

        Args:
            profile: Profile dict, typically the result of ``parse``.

        Returns:
            Newline-joined summary lines; an empty string for an empty
            profile.
        """
        texts, records, val = CVParser._texts, CVParser._records, CVParser._field
        lines: list[str] = []

        if profile.get("name"):
            lines.append(f"Name: {profile['name']}")

        contact = profile.get("contact")
        if isinstance(contact, dict) and contact.get("email"):
            lines.append(f"Email: {contact['email']}")

        if profile.get("summary"):
            lines.append(f"Summary: {profile['summary']}")

        research = texts(profile.get("research_interests"))
        if research:
            lines.append(f"Research interests: {', '.join(research[:10])}")

        for edu in records(profile.get("education"))[:3]:
            thesis = (
                f" — Thesis: {edu['thesis_topic']}" if edu.get("thesis_topic") else ""
            )
            lines.append(
                f"Education: {val(edu, 'degree')} in {val(edu, 'field')} "
                f"from {val(edu, 'institution')} ({val(edu, 'year')}){thesis}"
            )

        pubs = records(profile.get("publications"))
        if pubs:
            lines.append(f"Publications ({len(pubs)}):")
            for pub in pubs[:5]:
                lines.append(
                    f" - \"{val(pub, 'title')}\" — {val(pub, 'venue')} {val(pub, 'year')}"
                )

        for job in records(profile.get("experience"))[:4]:
            lines.append(
                f"Experience: {val(job, 'title')} at {val(job, 'institution')} "
                f"({val(job, 'dates')})"
            )

        skills = profile.get("skills")
        if not isinstance(skills, dict):
            skills = {}
        all_skills = texts(skills.get("programming")) + texts(skills.get("tools"))
        if all_skills:
            lines.append(f"Technical skills: {', '.join(all_skills[:20])}")
        lab = texts(skills.get("lab_techniques"))
        if lab:
            lines.append(f"Lab techniques: {', '.join(lab[:10])}")

        awards = texts(profile.get("awards"))
        if awards:
            lines.append(f"Awards: {'; '.join(awards[:5])}")

        # Models sometimes emit languages as plain strings rather than
        # {language, level} objects; accept both shapes.
        spoken: list[str] = []
        for entry in CVParser._as_list(profile.get("languages")):
            if isinstance(entry, dict):
                spoken.append(f"{val(entry, 'language')} ({val(entry, 'level')})")
            elif isinstance(entry, str) and entry.strip():
                spoken.append(entry)
        if spoken:
            lines.append("Languages: " + ", ".join(spoken))

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Static helpers — normalising LLM-produced fields
    # ------------------------------------------------------------------
    @staticmethod
    def _as_list(value: object) -> list:
        """Return *value* as a list: falsy -> [], list/tuple -> copy, else [value]."""
        if not value:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _texts(value: object) -> list[str]:
        """Return the non-blank items of *value* as strings."""
        return [
            str(item)
            for item in CVParser._as_list(value)
            if item is not None and str(item).strip()
        ]

    @staticmethod
    def _records(value: object) -> list[dict]:
        """Return only the dict items of *value*."""
        return [item for item in CVParser._as_list(value) if isinstance(item, dict)]

    @staticmethod
    def _field(record: dict, key: str) -> object:
        """Return ``record[key]``, or ``""`` when the key is missing or null."""
        value = record.get(key)
        return "" if value is None else value

    # ------------------------------------------------------------------
    # Static helpers — raw text extraction
    # ------------------------------------------------------------------
    @staticmethod
    def extract_raw_text(cv_path: str | Path) -> str:
        """Extract raw text from a CV file (.pdf, .docx, .txt).

        Args:
            cv_path: Location of the CV file.

        Returns:
            The extracted text (possibly empty).

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the extension is unsupported, or the file is a
                legacy binary ``.doc`` document.
        """
        import zipfile  # local import, like the other lazily imported helpers

        path = Path(cv_path)
        if not path.exists():
            raise FileNotFoundError(f"CV file not found: {path}")

        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return CVParser._from_pdf(path)
        if suffix in (".docx", ".doc"):
            # The DOCX reader only understands zip-based packages. A genuine
            # legacy .doc is a binary file and would fail with an obscure
            # package error, so reject it with an actionable message. A
            # .docx that was merely renamed to .doc still goes through.
            if suffix == ".doc" and not zipfile.is_zipfile(path):
                raise ValueError(
                    "Legacy .doc files are not supported. "
                    "Convert the CV to .docx, .pdf, or .txt."
                )
            return CVParser._from_docx(path)
        if suffix == ".txt":
            return path.read_text(encoding="utf-8", errors="replace")
        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

    @staticmethod
    def _from_pdf(path: Path) -> str:
        """Return the text of every page that yields text, joined by newlines."""
        import pdfplumber  # type: ignore

        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # extract_text() may yield nothing for a page; skip those.
        return "\n".join(text for text in page_texts if text)

    @staticmethod
    def _from_docx(path: Path) -> str:
        """Return non-blank body paragraphs followed by table-cell text."""
        from docx import Document  # type: ignore

        doc = Document(str(path))
        chunks = [p.text for p in doc.paragraphs if p.text.strip()]

        # python-docx reports a merged cell once per grid position it spans,
        # which would repeat its text. Track the underlying cell element
        # (kept alive in the dict so ids stay unique) and emit each one once.
        # NOTE(review): relies on the private ``_tc`` attribute; falls back
        # to the cell object itself if it is absent.
        seen: dict[int, object] = {}
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen:
                        continue
                    seen[id(element)] = element
                    cell_text = cell.text.strip()
                    if cell_text:
                        chunks.append(cell_text)
        return "\n".join(chunks)