Spaces:

HipFil98
/

PhDScout

Sleeping

File size: 5,745 Bytes

"""CV parser — reads PDF/DOCX/TXT and returns a structured research profile."""

from __future__ import annotations

from pathlib import Path
from typing import TypedDict

from agent.base_service import BaseLLMService
from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT
from agent.utils import parse_json


class CVProfile(TypedDict, total=False):
    """Structured research profile produced from a CV.

    Declared with ``total=False``, so every key is optional and a profile may
    carry any subset of the fields below.  The per-field notes list the keys
    each nested dict is expected to hold.
    """

    name: str

    # Expected keys: email, phone, linkedin, github, website.
    contact: dict

    summary: str

    # Each entry: degree, institution, field, year, thesis_topic.
    education: list[dict]

    # Each entry: title, institution, dates, description.
    experience: list[dict]

    research_interests: list[str]

    # Each entry: title, venue, year, authors.
    publications: list[dict]

    # Expected keys: programming, tools, languages, lab_techniques.
    skills: dict

    awards: list[str]

    # Each entry: language, level.
    languages: list[dict]

    references: list[dict]




class CVParser(BaseLLMService):
    """Parses CV files into structured CVProfile dicts using an LLM.

    Text is extracted locally from ``.pdf``, ``.docx`` or ``.txt`` files and
    sent to the LLM; the JSON reply becomes the profile.  Because the profile
    is LLM output, ``summarize`` treats every field as optional and possibly
    mistyped (``null`` values, a string where a list is expected, ...).
    """

    _SYSTEM = CV_PARSER_SYSTEM

    # Default number of leading CV characters included in the prompt.
    _MAX_CV_CHARS = 8000
    # Number of raw reply characters kept when the reply is not a JSON object.
    _FALLBACK_SUMMARY_CHARS = 500

    def parse(self, cv_path: str | Path, *, max_chars: int | None = None) -> CVProfile:
        """Parse a CV file and return a structured CVProfile.

        Note: uses ``_generate`` (not ``_generate_json``) so that LLM/network
        errors surface directly to the caller rather than silently returning
        an empty profile.

        Args:
            cv_path: Location of the CV file (.pdf, .docx or .txt).
            max_chars: Number of leading characters of the extracted text to
                include in the prompt.  ``None`` (the default) uses
                ``_MAX_CV_CHARS``; text beyond the limit is not sent.

        Returns:
            The dict decoded from the LLM reply.  When the reply does not
            decode to a dict, a minimal profile whose ``summary`` holds the
            start of the raw reply.

        Raises:
            FileNotFoundError: If ``cv_path`` does not exist.
            ValueError: If the format is unsupported or no text could be
                extracted.
        """
        raw_text = self.extract_raw_text(cv_path)
        if not raw_text.strip():
            raise ValueError("Could not extract any text from the CV file.")

        limit = self._MAX_CV_CHARS if max_chars is None else max_chars
        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[:limit])
        raw_json = self._generate(prompt, json_mode=True)
        result = parse_json(raw_json)
        # A profile must be a JSON object.  Anything else (presumably None on a
        # decode failure, or a top-level array/scalar) would break the
        # ``.get`` calls made on the profile later, so fall back instead.
        if not isinstance(result, dict):
            fallback = (raw_json or "")[: self._FALLBACK_SUMMARY_CHARS]
            return {"name": "Unknown", "summary": fallback}
        return result

    def summarize(self, profile: CVProfile) -> str:
        """Build a compact text summary of a CVProfile for use in LLM prompts.

        Missing, ``null`` or mistyped fields are skipped or rendered as empty
        text rather than raising, since the profile originates from an LLM.

        Args:
            profile: The profile to summarise.

        Returns:
            One line per populated section, joined with newlines; an empty
            string when nothing is populated.
        """
        text = CVParser._text
        lines: list[str] = []

        if profile.get("name"):
            lines.append(f"Name: {profile['name']}")

        contact = profile.get("contact")
        if isinstance(contact, dict) and contact.get("email"):
            lines.append(f"Email: {contact['email']}")

        if profile.get("summary"):
            lines.append(f"Summary: {profile['summary']}")

        research = CVParser._strings(profile.get("research_interests"))
        if research:
            lines.append(f"Research interests: {', '.join(research[:10])}")

        for e in CVParser._dicts(profile.get("education"))[:3]:
            thesis = f" — Thesis: {e['thesis_topic']}" if e.get("thesis_topic") else ""
            lines.append(
                f"Education: {text(e.get('degree'))} in {text(e.get('field'))} "
                f"from {text(e.get('institution'))} ({text(e.get('year'))}){thesis}"
            )

        pubs = CVParser._dicts(profile.get("publications"))
        if pubs:
            lines.append(f"Publications ({len(pubs)}):")
            for p in pubs[:5]:
                lines.append(
                    f"  - \"{text(p.get('title'))}\" — "
                    f"{text(p.get('venue'))} {text(p.get('year'))}"
                )

        for e in CVParser._dicts(profile.get("experience"))[:4]:
            lines.append(
                f"Experience: {text(e.get('title'))} at {text(e.get('institution'))} "
                f"({text(e.get('dates'))})"
            )

        skills = profile.get("skills")
        if not isinstance(skills, dict):
            skills = {}
        all_skills = (
            CVParser._strings(skills.get("programming"))
            + CVParser._strings(skills.get("tools"))
        )
        if all_skills:
            lines.append(f"Technical skills: {', '.join(all_skills[:20])}")
        lab = CVParser._strings(skills.get("lab_techniques"))
        if lab:
            lines.append(f"Lab techniques: {', '.join(lab[:10])}")

        awards = CVParser._strings(profile.get("awards"))
        if awards:
            lines.append(f"Awards: {'; '.join(awards[:5])}")

        langs = profile.get("languages")
        lang_parts: list[str] = []
        for la in langs if isinstance(langs, (list, tuple)) else []:
            if isinstance(la, dict):
                lang_parts.append(f"{text(la.get('language'))} ({text(la.get('level'))})")
            elif la:
                # Tolerate a bare language name instead of a {language, level} dict.
                lang_parts.append(str(la))
        if lang_parts:
            lines.append("Languages: " + ", ".join(lang_parts))

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Static helpers — tolerant access to LLM-produced fields
    # ------------------------------------------------------------------

    @staticmethod
    def _text(value: object) -> str:
        """Return ``value`` as text, mapping ``None`` (JSON null) to ``""``."""
        return "" if value is None else str(value)

    @staticmethod
    def _strings(value: object) -> list[str]:
        """Coerce a list-of-strings field into a list of non-empty strings.

        A lone string is treated as a one-item list; any other non-sequence
        value yields an empty list.  Items are passed through ``str`` so that
        ``str.join`` cannot fail on numbers or nested objects.
        """
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return []
        return [str(item) for item in value if item is not None and item != ""]

    @staticmethod
    def _dicts(value: object) -> list[dict]:
        """Return only the dict entries of a list-of-objects field.

        A lone dict is treated as a one-item list; any other non-sequence
        value yields an empty list.
        """
        if isinstance(value, dict):
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return []
        return [item for item in value if isinstance(item, dict)]

    # ------------------------------------------------------------------
    # Static helpers — raw text extraction
    # ------------------------------------------------------------------

    @staticmethod
    def extract_raw_text(cv_path: str | Path) -> str:
        """Extract raw text from a CV file (.pdf, .docx, .txt).

        Args:
            cv_path: Location of the CV on disk.

        Returns:
            The extracted text, which may be empty when nothing could be
            extracted.

        Raises:
            FileNotFoundError: If ``cv_path`` does not exist.
            ValueError: If the extension is unsupported, or a ``.doc`` file
                could not be opened as a Word (.docx) package.
        """
        path = Path(cv_path)
        if not path.exists():
            raise FileNotFoundError(f"CV file not found: {path}")
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return CVParser._from_pdf(path)
        if suffix == ".docx":
            return CVParser._from_docx(path)
        if suffix == ".doc":
            # python-docx reads only .docx packages.  Still try, so a .docx
            # saved with a .doc extension keeps working, but turn the opaque
            # library failure for a real legacy .doc into a clear message.
            try:
                return CVParser._from_docx(path)
            except ImportError:
                # A missing python-docx install is not a file-format problem.
                raise
            except Exception as exc:
                raise ValueError(
                    f"Could not read '{path.name}' as a Word document. Legacy "
                    "binary .doc files are not supported; convert the file to "
                    ".docx, .pdf, or .txt."
                ) from exc
        if suffix == ".txt":
            return path.read_text(encoding="utf-8", errors="replace")
        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

    @staticmethod
    def _from_pdf(path: Path) -> str:
        """Return the text of every page of the PDF, joined with newlines.

        Pages for which ``extract_text`` returns nothing are skipped.
        """
        import pdfplumber  # type: ignore

        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        return "\n".join(text for text in page_texts if text)

    @staticmethod
    def _from_docx(path: Path) -> str:
        """Return the text of a .docx file, one paragraph or table cell per line.

        Body paragraphs come first, followed by the text of each table cell;
        blank paragraphs and blank cells are skipped.
        """
        from docx import Document  # type: ignore

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        # python-docx yields a merged cell once per grid position it spans,
        # all backed by the same XML element; remember the elements already
        # emitted so merged-cell text is not repeated.
        seen_cells: set = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    element = getattr(cell, "_tc", cell)
                    if element in seen_cells:
                        continue
                    seen_cells.add(element)
                    cell_text = cell.text.strip()
                    if cell_text:
                        paragraphs.append(cell_text)
        return "\n".join(paragraphs)