"""CV parser — reads PDF/DOCX/TXT and returns a structured research profile."""
from __future__ import annotations
from pathlib import Path
from typing import TypedDict
from agent.base_service import BaseLLMService
from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT
from agent.utils import parse_json
class CVProfile(TypedDict, total=False):
    """Structured research profile extracted from a CV.

    Declared with ``total=False``, so every key is optional.  The comment
    above a nested field lists the sub-keys its ``dict`` values are expected
    to carry.
    """

    name: str

    # email, phone, linkedin, github, website
    contact: dict

    summary: str

    # degree, institution, field, year, thesis_topic
    education: list[dict]

    # title, institution, dates, description
    experience: list[dict]

    research_interests: list[str]

    # title, venue, year, authors
    publications: list[dict]

    # programming, tools, languages, lab_techniques
    skills: dict

    awards: list[str]

    # language, level
    languages: list[dict]

    references: list[dict]
class CVParser(BaseLLMService):
    """Parse CV files into structured ``CVProfile`` dicts using an LLM.

    Text is first extracted locally from the file (PDF, DOCX or plain text),
    then sent to the LLM; the JSON reply becomes the profile.
    """

    # System prompt constant; presumably consumed by BaseLLMService (not
    # visible in this file) — confirm against the base class.
    _SYSTEM = CV_PARSER_SYSTEM

    def parse(
        self, cv_path: str | Path, *, max_chars: int | None = 8000
    ) -> CVProfile:
        """Parse a CV file and return a structured CVProfile.

        Note: uses ``_generate`` (not ``_generate_json``) so that LLM/network
        errors surface directly to the caller rather than silently returning
        an empty profile.

        Args:
            cv_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` CV.
            max_chars: Maximum number of CV characters placed in the prompt;
                longer text is truncated.  Must be non-negative.  ``None``
                disables truncation.

        Returns:
            The parsed profile.  If the LLM reply does not parse to a JSON
            object, a placeholder profile named ``"Unknown"`` whose
            ``summary`` holds the first 500 characters of the raw reply.

        Raises:
            FileNotFoundError: If ``cv_path`` does not exist.
            ValueError: If the format is unsupported or no text could be
                extracted from the file.
        """
        raw_text = self.extract_raw_text(cv_path)
        if not raw_text.strip():
            raise ValueError("Could not extract any text from the CV file.")

        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[:max_chars])
        raw_json = self._generate(prompt, json_mode=True)
        result = parse_json(raw_json)
        # A profile must be a JSON object.  A failed parse (None) or a reply
        # of another JSON type (list, string, ...) gets the same placeholder,
        # so callers can always treat the result as a dict.
        if not isinstance(result, dict):
            return {"name": "Unknown", "summary": raw_json[:500]}
        return result

    def summarize(self, profile: CVProfile) -> str:
        """Build a compact text summary of a CVProfile for use in LLM prompts.

        The profile usually comes straight from LLM output, so fields are read
        defensively: a field of the wrong type is skipped, a bare string where
        a list is expected is treated as a one-item list, and JSON ``null``
        values render as empty text instead of the literal ``None``.

        Args:
            profile: The profile to summarise; every key is optional.

        Returns:
            Newline-separated summary lines (empty string for an empty
            profile).
        """
        text = CVParser._text
        lines: list[str] = []

        if profile.get("name"):
            lines.append(f"Name: {profile['name']}")

        contact = profile.get("contact")
        if isinstance(contact, dict) and contact.get("email"):
            lines.append(f"Email: {contact['email']}")

        if profile.get("summary"):
            lines.append(f"Summary: {profile['summary']}")

        research = CVParser._as_str_list(profile.get("research_interests"))
        if research:
            lines.append(f"Research interests: {', '.join(research[:10])}")

        for edu in CVParser._dict_items(profile.get("education"))[:3]:
            thesis = (
                f" — Thesis: {edu['thesis_topic']}" if edu.get("thesis_topic") else ""
            )
            lines.append(
                f"Education: {text(edu.get('degree'))} in {text(edu.get('field'))} "
                f"from {text(edu.get('institution'))} ({text(edu.get('year'))}){thesis}"
            )

        pubs = CVParser._dict_items(profile.get("publications"))
        if pubs:
            lines.append(f"Publications ({len(pubs)}):")
            for pub in pubs[:5]:
                title = text(pub.get("title"))
                venue = text(pub.get("venue"))
                year = text(pub.get("year"))
                lines.append(f' - "{title}" — {venue} {year}')

        for job in CVParser._dict_items(profile.get("experience"))[:4]:
            lines.append(
                f"Experience: {text(job.get('title'))} at {text(job.get('institution'))} "
                f"({text(job.get('dates'))})"
            )

        skills = profile.get("skills")
        if not isinstance(skills, dict):
            skills = {}
        all_skills = CVParser._as_str_list(
            skills.get("programming")
        ) + CVParser._as_str_list(skills.get("tools"))
        if all_skills:
            lines.append(f"Technical skills: {', '.join(all_skills[:20])}")
        lab = CVParser._as_str_list(skills.get("lab_techniques"))
        if lab:
            lines.append(f"Lab techniques: {', '.join(lab[:10])}")

        awards = CVParser._as_str_list(profile.get("awards"))
        if awards:
            lines.append(f"Awards: {'; '.join(awards[:5])}")

        langs = CVParser._dict_items(profile.get("languages"))
        if langs:
            lines.append(
                "Languages: "
                + ", ".join(
                    f"{text(la.get('language'))} ({text(la.get('level'))})"
                    for la in langs
                )
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Static helpers — tolerant access to LLM-produced profile fields
    # ------------------------------------------------------------------
    @staticmethod
    def _text(value: object) -> str:
        """Return ``value`` as text, mapping ``None`` (JSON null) to ``""``."""
        return "" if value is None else str(value)

    @staticmethod
    def _as_str_list(value: object) -> list[str]:
        """Coerce a profile field to a list of strings.

        A bare string becomes a one-item list, list/tuple items are converted
        with ``str`` (``None`` items dropped), anything else yields ``[]``.
        """
        if isinstance(value, str):
            return [value] if value else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        return []

    @staticmethod
    def _dict_items(value: object) -> list[dict]:
        """Return the ``dict`` entries of a list-like field, else ``[]``."""
        if not isinstance(value, (list, tuple)):
            return []
        return [item for item in value if isinstance(item, dict)]

    # ------------------------------------------------------------------
    # Static helpers — raw text extraction
    # ------------------------------------------------------------------
    @staticmethod
    def extract_raw_text(cv_path: str | Path) -> str:
        """Extract raw text from a CV file (.pdf, .docx, .txt).

        Args:
            cv_path: Location of the CV; the format is chosen from the
                (case-insensitive) file suffix.

        Returns:
            The extracted text, possibly empty.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is unsupported, or the file is a legacy
                binary ``.doc`` that python-docx cannot open.
        """
        path = Path(cv_path)
        if not path.exists():
            raise FileNotFoundError(f"CV file not found: {path}")

        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return CVParser._from_pdf(path)
        if suffix == ".docx":
            return CVParser._from_docx(path)
        if suffix == ".doc":
            # python-docx reads only OOXML (zip) packages.  A ".doc" that is
            # really a .docx still loads; a genuine legacy binary .doc fails
            # with "Package not found", which is misleading for a file that
            # exists — translate it into an actionable error.
            from docx.opc.exceptions import PackageNotFoundError  # type: ignore

            try:
                return CVParser._from_docx(path)
            except PackageNotFoundError as err:
                raise ValueError(
                    "Legacy binary .doc files are not supported; "
                    "convert the CV to .docx, .pdf, or .txt."
                ) from err
        if suffix == ".txt":
            # Undecodable bytes are replaced rather than aborting the parse.
            return path.read_text(encoding="utf-8", errors="replace")
        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

    @staticmethod
    def _from_pdf(path: Path) -> str:
        """Return the text of every PDF page that yields text, newline-joined."""
        # Imported lazily so pdfplumber is only needed when a PDF is parsed.
        import pdfplumber  # type: ignore

        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text give a falsy result and are skipped.
        return "\n".join(page_text for page_text in page_texts if page_text)

    @staticmethod
    def _from_docx(path: Path) -> str:
        """Return body paragraphs followed by table cell text, newline-joined.

        Blank paragraphs and blank cells are skipped.  Table text is appended
        after all body paragraphs, so document order between the two is not
        preserved.
        """
        # Imported lazily so python-docx is only needed for DOCX input.
        from docx import Document  # type: ignore

        doc = Document(str(path))
        chunks = [p.text for p in doc.paragraphs if p.text.strip()]
        for table in doc.tables:
            # ``row.cells`` yields a merged cell once per grid position it
            # spans, which would repeat its text.  Track the underlying XML
            # element; storing the element itself keeps it alive so its id()
            # cannot be reused while this table is being walked.
            seen: dict[int, object] = {}
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if id(element) in seen:
                        continue
                    seen[id(element)] = element
                    cell_text = cell.text.strip()
                    if cell_text:
                        chunks.append(cell_text)
        return "\n".join(chunks)
|