Spaces:

HipFil98
/

PhDScout

Sleeping

App Files Files Community

PhDScout / agent /cv /parser.py

HipFil98

refactor: reorganize agent/ into functional subfolders + split prompts

a77e42f 2 months ago

raw

history blame contribute delete

5.75 kB

	"""CV parser — reads PDF/DOCX/TXT and returns a structured research profile."""

	from __future__ import annotations

	from pathlib import Path
	from typing import TypedDict

	from agent.base_service import BaseLLMService
	from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT
	from agent.utils import parse_json


	class CVProfile(TypedDict, total=False):
	    """Structured research profile extracted from a CV.

	    Declared with ``total=False``, so every key is optional and consumers
	    should read fields with ``.get()`` rather than indexing.
	    """

	    name: str
	    contact: dict  # email, phone, linkedin, github, website
	    summary: str
	    education: list[dict]  # degree, institution, field, year, thesis_topic
	    experience: list[dict]  # title, institution, dates, description
	    research_interests: list[str]
	    publications: list[dict]  # title, venue, year, authors
	    skills: dict  # programming, tools, languages, lab_techniques
	    awards: list[str]
	    languages: list[dict]  # language, level
	    references: list[dict]




	class CVParser(BaseLLMService):
	    """Parses CV files into structured CVProfile dicts using an LLM."""

	    _SYSTEM = CV_PARSER_SYSTEM

	    # Maximum number of CV characters inserted into the prompt; anything past
	    # this point is not sent to the model. Subclasses may override it.
	    _MAX_CV_CHARS = 8000

	    def parse(self, cv_path: str | Path) -> CVProfile:
	        """Parse a CV file and return a structured CVProfile.

	        The extracted text is truncated to ``_MAX_CV_CHARS`` characters before
	        being formatted into ``CV_PARSER_PROMPT``.

	        Note: uses ``_generate`` (not ``_generate_json``) so that LLM/network
	        errors surface directly to the caller rather than silently returning
	        an empty profile.

	        Args:
	            cv_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` CV file.

	        Returns:
	            The parsed profile, or a minimal fallback profile holding the
	            first 500 characters of the raw model output when that output
	            does not parse to a JSON object.

	        Raises:
	            FileNotFoundError: If ``cv_path`` does not exist.
	            ValueError: If the format is unsupported or no text was extracted.
	        """
	        raw_text = self.extract_raw_text(cv_path)
	        if not raw_text.strip():
	            raise ValueError("Could not extract any text from the CV file.")

	        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[: self._MAX_CV_CHARS])
	        raw_json = self._generate(prompt, json_mode=True)
	        result = parse_json(raw_json)
	        # Anything other than a JSON object (None, list, scalar) cannot act as
	        # a profile: downstream code relies on dict access such as ``.get``.
	        if not isinstance(result, dict):
	            return {"name": "Unknown", "summary": raw_json[:500]}
	        return result

	    def summarize(self, profile: CVProfile) -> str:
	        """Build a compact text summary of a CVProfile for use in LLM prompts.

	        Profiles come from model output, so field shapes are not trusted:
	        entries that are not of the expected type are skipped, and list items
	        are converted with ``str`` instead of raising while formatting.

	        Args:
	            profile: The profile to summarize; any key may be missing.

	        Returns:
	            One line per populated section, joined with newlines. An empty
	            string when nothing is populated.
	        """
	        lines: list[str] = []

	        if profile.get("name"):
	            lines.append(f"Name: {profile['name']}")

	        contact = profile.get("contact")
	        if isinstance(contact, dict) and contact.get("email"):
	            lines.append(f"Email: {contact['email']}")

	        if profile.get("summary"):
	            lines.append(f"Summary: {profile['summary']}")

	        research = CVParser._text_items(profile.get("research_interests"), 10)
	        if research:
	            lines.append(f"Research interests: {', '.join(research)}")

	        for edu in CVParser._dict_items(profile.get("education"), 3):
	            thesis = (
	                f" — Thesis: {edu['thesis_topic']}" if edu.get("thesis_topic") else ""
	            )
	            lines.append(
	                f"Education: {edu.get('degree', '')} in {edu.get('field', '')} "
	                f"from {edu.get('institution', '')} ({edu.get('year', '')}){thesis}"
	            )

	        pubs = CVParser._dict_items(profile.get("publications"))
	        if pubs:
	            # The header reports the total count; only the first five are listed.
	            lines.append(f"Publications ({len(pubs)}):")
	            for pub in pubs[:5]:
	                title = pub.get("title", "")
	                venue = pub.get("venue", "")
	                year = pub.get("year", "")
	                lines.append(f' - "{title}" — {venue} {year}')

	        for job in CVParser._dict_items(profile.get("experience"), 4):
	            lines.append(
	                f"Experience: {job.get('title', '')} at {job.get('institution', '')} "
	                f"({job.get('dates', '')})"
	            )

	        skills = profile.get("skills")
	        if not isinstance(skills, dict):
	            skills = {}
	        technical = (
	            CVParser._text_items(skills.get("programming"))
	            + CVParser._text_items(skills.get("tools"))
	        )[:20]
	        if technical:
	            lines.append(f"Technical skills: {', '.join(technical)}")
	        lab = CVParser._text_items(skills.get("lab_techniques"), 10)
	        if lab:
	            lines.append(f"Lab techniques: {', '.join(lab)}")

	        awards = CVParser._text_items(profile.get("awards"), 5)
	        if awards:
	            lines.append(f"Awards: {'; '.join(awards)}")

	        langs = CVParser._dict_items(profile.get("languages"))
	        if langs:
	            lines.append(
	                "Languages: "
	                + ", ".join(
	                    f"{la.get('language', '')} ({la.get('level', '')})" for la in langs
	                )
	            )

	        return "\n".join(lines)

	    # ------------------------------------------------------------------
	    # Static helpers — tolerant access to loosely-typed profile fields
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _text_items(value: object, limit: int | None = None) -> list[str]:
	        """Return the first *limit* items of *value* as strings, dropping ``None``.

	        A bare string is treated as a single item so it is neither sliced
	        character by character nor concatenated onto a list.
	        """
	        if isinstance(value, str):
	            value = [value]
	        elif not isinstance(value, (list, tuple)):
	            return []
	        return [str(item) for item in value[:limit] if item is not None]

	    @staticmethod
	    def _dict_items(value: object, limit: int | None = None) -> list[dict]:
	        """Return the dict entries among the first *limit* items of a list *value*."""
	        if not isinstance(value, list):
	            return []
	        return [item for item in value[:limit] if isinstance(item, dict)]

	    # ------------------------------------------------------------------
	    # Static helpers — raw text extraction
	    # ------------------------------------------------------------------

	    @staticmethod
	    def extract_raw_text(cv_path: str | Path) -> str:
	        """Extract raw text from a CV file (.pdf, .docx, .txt).

	        Args:
	            cv_path: Location of the CV; the reader is chosen from the
	                lower-cased file suffix.

	        Returns:
	            The extracted text (possibly empty).

	        Raises:
	            FileNotFoundError: If the path does not exist.
	            ValueError: If the suffix is not one of the supported formats.
	        """
	        path = Path(cv_path)
	        if not path.exists():
	            raise FileNotFoundError(f"CV file not found: {path}")

	        suffix = path.suffix.lower()
	        if suffix == ".pdf":
	            return CVParser._from_pdf(path)
	        # NOTE(review): ".doc" is routed to the DOCX reader although the error
	        # message below advertises only .docx; legacy binary .doc files will
	        # presumably fail inside ``Document()`` — confirm this is intended.
	        if suffix in (".docx", ".doc"):
	            return CVParser._from_docx(path)
	        if suffix == ".txt":
	            # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM so
	            # it does not end up as a stray character in the prompt.
	            return path.read_text(encoding="utf-8-sig", errors="replace")
	        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

	    @staticmethod
	    def _from_pdf(path: Path) -> str:
	        """Return the text of every PDF page that yields text, newline-joined."""
	        import pdfplumber  # type: ignore

	        pages: list[str] = []
	        with pdfplumber.open(path) as pdf:
	            for page in pdf.pages:
	                text = page.extract_text()
	                # Pages for which extract_text() gives nothing are skipped.
	                if text:
	                    pages.append(text)
	        return "\n".join(pages)

	    @staticmethod
	    def _from_docx(path: Path) -> str:
	        """Return non-blank paragraph text followed by non-blank table-cell text."""
	        from docx import Document  # type: ignore

	        doc = Document(str(path))
	        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
	        # Table text is appended after all body paragraphs, so the original
	        # interleaving of paragraphs and tables is not preserved.
	        for table in doc.tables:
	            for row in table.rows:
	                for cell in row.cells:
	                    if cell.text.strip():
	                        paragraphs.append(cell.text.strip())
	        return "\n".join(paragraphs)