File size: 5,745 Bytes
dbf3fc6
98a9c16
 
 
 
dbf3fc6
98a9c16
dbf3fc6
1309142
7419404
98a9c16
 
 
 
 
 
 
7419404
98a9c16
 
 
 
 
7419404
98a9c16
 
 
 
dbf3fc6
7419404
 
1309142
7419404
 
dbf3fc6
 
 
 
 
 
 
7419404
 
98a9c16
1309142
dbf3fc6
7419404
 
 
 
98a9c16
7419404
 
 
98a9c16
7419404
 
98a9c16
7419404
 
 
 
 
 
 
 
 
 
 
 
98a9c16
 
 
 
 
 
7419404
 
 
 
dbf3fc6
 
 
7419404
 
dbf3fc6
 
 
 
7419404
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98a9c16
7419404
 
 
98a9c16
 
7419404
dbf3fc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""CV parser — reads PDF/DOCX/TXT and returns a structured research profile."""

from __future__ import annotations

from pathlib import Path
from typing import TypedDict

from agent.base_service import BaseLLMService
from agent.prompts import CV_PARSER_SYSTEM, CV_PARSER_PROMPT
from agent.utils import parse_json


class CVProfile(TypedDict, total=False):
    """Structured representation of a CV.

    Declared with ``total=False``, so every key is optional and consumers
    must tolerate missing fields. The trailing comments list the keys each
    nested dict is expected to carry; those inner shapes are not enforced by
    the type itself.
    """

    name: str
    contact: dict                  # email, phone, linkedin, github, website
    summary: str
    education: list[dict]          # degree, institution, field, year, thesis_topic
    experience: list[dict]         # title, institution, dates, description
    research_interests: list[str]
    publications: list[dict]       # title, venue, year, authors
    skills: dict                   # programming, tools, languages, lab_techniques
    awards: list[str]
    languages: list[dict]          # language, level
    references: list[dict]




class CVParser(BaseLLMService):
    """Parses CV files into structured CVProfile dicts using an LLM."""

    _SYSTEM = CV_PARSER_SYSTEM

    def parse(self, cv_path: str | Path) -> CVProfile:
        """Parse a CV file and return a structured CVProfile.

        Note: uses ``_generate`` (not ``_generate_json``) so that LLM/network
        errors surface directly to the caller rather than silently returning
        an empty profile.
        """
        raw_text = self.extract_raw_text(cv_path)
        if not raw_text.strip():
            raise ValueError("Could not extract any text from the CV file.")

        prompt = CV_PARSER_PROMPT.format(cv_text=raw_text[:8000])
        raw_json = self._generate(prompt, json_mode=True)
        result = parse_json(raw_json)
        if result is None:
            return {"name": "Unknown", "summary": raw_json[:500]}
        return result

    def summarize(self, profile: CVProfile) -> str:
        """Build a compact text summary of a CVProfile for use in LLM prompts.

        Sections are emitted in a fixed order and capped: 10 research
        interests, 3 education entries, 5 publications, 4 experience entries,
        20 technical skills, 10 lab techniques and 5 awards. Empty or missing
        sections are skipped.

        Profiles are typically LLM output, so field shapes are not trusted:
        a scalar where a list is expected is treated as a one-item list,
        ``None``/blank items are dropped, non-string items are stringified,
        and plain-string entries in record lists (education, publications,
        experience, languages) are emitted verbatim instead of raising.

        Args:
            profile: The profile to summarise; ``None`` is treated as empty.

        Returns:
            The summary, one line per item, joined with newlines. Empty
            string when the profile has no usable content.
        """

        def as_strings(value: object) -> list[str]:
            """Coerce a field into a list of non-blank strings."""
            if value is None:
                return []
            items = value if isinstance(value, (list, tuple)) else [value]
            return [
                str(item)
                for item in items
                if item is not None and str(item).strip()
            ]

        def as_records(value: object) -> list:
            """Coerce a field into a list of dict records and/or plain strings."""
            if not value:
                return []
            items = value if isinstance(value, (list, tuple)) else [value]
            records: list = []
            for item in items:
                if isinstance(item, dict):
                    records.append(item)
                elif item is not None and str(item).strip():
                    records.append(str(item))
            return records

        profile = profile or {}
        lines: list[str] = []

        if profile.get("name"):
            lines.append(f"Name: {profile['name']}")

        contact = profile.get("contact")
        if isinstance(contact, dict) and contact.get("email"):
            lines.append(f"Email: {contact['email']}")

        if profile.get("summary"):
            lines.append(f"Summary: {profile['summary']}")

        research = as_strings(profile.get("research_interests"))
        if research:
            lines.append(f"Research interests: {', '.join(research[:10])}")

        for entry in as_records(profile.get("education"))[:3]:
            if isinstance(entry, dict):
                thesis = (
                    f" — Thesis: {entry['thesis_topic']}"
                    if entry.get("thesis_topic")
                    else ""
                )
                lines.append(
                    f"Education: {entry.get('degree', '')} in {entry.get('field', '')} "
                    f"from {entry.get('institution', '')} ({entry.get('year', '')}){thesis}"
                )
            else:
                lines.append(f"Education: {entry}")

        pubs = as_records(profile.get("publications"))
        if pubs:
            lines.append(f"Publications ({len(pubs)}):")
            for pub in pubs[:5]:
                if isinstance(pub, dict):
                    lines.append(
                        f'  - "{pub.get("title", "")}" — '
                        f'{pub.get("venue", "")} {pub.get("year", "")}'
                    )
                else:
                    lines.append(f"  - {pub}")

        for entry in as_records(profile.get("experience"))[:4]:
            if isinstance(entry, dict):
                lines.append(
                    f"Experience: {entry.get('title', '')} at {entry.get('institution', '')} "
                    f"({entry.get('dates', '')})"
                )
            else:
                lines.append(f"Experience: {entry}")

        skills = profile.get("skills")
        if isinstance(skills, dict):
            technical = as_strings(skills.get("programming")) + as_strings(skills.get("tools"))
            lab = as_strings(skills.get("lab_techniques"))
        else:
            # A flat list/string of skills is reported as technical skills.
            technical, lab = as_strings(skills), []
        if technical:
            lines.append(f"Technical skills: {', '.join(technical[:20])}")
        if lab:
            lines.append(f"Lab techniques: {', '.join(lab[:10])}")

        awards = as_strings(profile.get("awards"))
        if awards:
            lines.append(f"Awards: {'; '.join(awards[:5])}")

        langs = as_records(profile.get("languages"))
        if langs:
            lines.append(
                "Languages: " + ", ".join(
                    f"{la.get('language', '')} ({la.get('level', '')})"
                    if isinstance(la, dict)
                    else la
                    for la in langs
                )
            )

        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Static helpers — raw text extraction
    # ------------------------------------------------------------------

    @staticmethod
    def extract_raw_text(cv_path: str | Path) -> str:
        """Extract raw text from a CV file (.pdf, .docx, .txt)."""
        path = Path(cv_path)
        if not path.exists():
            raise FileNotFoundError(f"CV file not found: {path}")
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return CVParser._from_pdf(path)
        elif suffix in (".docx", ".doc"):
            return CVParser._from_docx(path)
        elif suffix == ".txt":
            return path.read_text(encoding="utf-8", errors="replace")
        raise ValueError(f"Unsupported CV format '{suffix}'. Use .pdf, .docx, or .txt.")

    @staticmethod
    def _from_pdf(path: Path) -> str:
        """Return the text of every page of a PDF, joined with newlines.

        Pages for which no text is extracted are skipped.
        """
        import pdfplumber  # type: ignore

        with pdfplumber.open(path) as document:
            # Consume the generator inside the ``with`` so pages are read
            # while the PDF is still open.
            extracted = (page.extract_text() for page in document.pages)
            return "\n".join(chunk for chunk in extracted if chunk)

    @staticmethod
    def _from_docx(path: Path) -> str:
        """Return the text of a DOCX file, joined with newlines.

        Non-blank body paragraphs come first, followed by the stripped text
        of every non-blank cell of the document's top-level tables.
        """
        from docx import Document  # type: ignore

        doc = Document(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        # python-docx reports a merged cell once for every grid position it
        # covers, so track the underlying XML element to emit each cell's
        # text only once. Holding the elements in the set also keeps them
        # alive, so membership stays reliable for the whole loop.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Fall back to the cell object if ``_tc`` is unavailable.
                    cell_key = getattr(cell, "_tc", cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)
                    text = cell.text.strip()
                    if text:
                        paragraphs.append(text)
        return "\n".join(paragraphs)