File size: 4,594 Bytes
60730db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
cv_parser.py – Text extraction from CVs (PDF/DOCX)
AI Matching Assistant for Open Positions (CSOB)

Supported inputs:
  - PDF (pymupdf/fitz)
  - DOCX (python-docx)
  - Plaintext fallback

Optional: removal of PII (e-mails, phone numbers, URLs, birth numbers;
personal names are not detected)
"""

import re
import sys
import unicodedata


def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF document held in memory.

    Args:
        file_bytes: Raw content of the PDF file.

    Returns:
        The text of all pages joined by newlines. An empty string is
        returned (and a message printed) when PyMuPDF is not installed or
        the document cannot be opened or read.
    """
    try:
        import fitz  # pymupdf
    except ImportError:
        print("CHYBA: pip install pymupdf")
        return ""

    try:
        # The context manager closes the document even when reading a page
        # raises, so a failing page does not leave the document open.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            return "\n".join(page.get_text("text") for page in doc)
    except Exception as e:
        # Best-effort parser: any read failure degrades to "no text".
        print(f"Chyba pri cteni PDF: {e}")
        return ""


def extract_from_docx(file_bytes: bytes) -> str:
    """Extract plain text from a DOCX document held in memory.

    Only body paragraphs are read; paragraphs that are empty or contain
    nothing but whitespace are skipped.

    Args:
        file_bytes: Raw content of the DOCX file.

    Returns:
        The non-empty paragraphs joined by newlines, or an empty string
        (after printing a message) when python-docx is missing or the
        document cannot be parsed.
    """
    try:
        import io
        from docx import Document
    except ImportError:
        print("CHYBA: pip install python-docx")
        return ""

    try:
        document = Document(io.BytesIO(file_bytes))
        kept = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            if content.strip():
                kept.append(content)
    except Exception as err:
        # Best-effort parser: any read failure degrades to "no text".
        print(f"Chyba pri cteni DOCX: {err}")
        return ""

    return "\n".join(kept)


def remove_pii(text: str) -> str:
    """
    Replace personal identifiers in CV text with placeholder tags.

    Handled, in this order: e-mail addresses, phone numbers, http(s) URLs,
    scheme-less linkedin.com links and Czech/Slovak birth numbers. All other
    text (skills, experience, education, ...) is left untouched.

    Args:
        text: CV text to redact.

    Returns:
        The text with each recognised identifier replaced by its tag.
    """
    # Order matters: every substitution runs on the output of the previous
    # one (e.g. digit runs are tagged as phones before the birth-number rule).
    substitutions = (
        # E-mail
        (r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]'),
        # Phone (various CZ/SK/international formats)
        (r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}', '[TELEFON]'),
        # URL / LinkedIn
        (r'https?://\S+', '[URL]'),
        (r'linkedin\.com/\S+', '[LINKEDIN]'),
        # Birth number (CZ/SK format)
        (r'\b\d{6}/?\d{3,4}\b', '[RC]'),
    )

    redacted = text
    for pattern, placeholder in substitutions:
        redacted = re.sub(pattern, placeholder, redacted)
    return redacted


def extract_cv_text(file_bytes: bytes, filename: str, remove_personal: bool = True) -> str:
    """
    Extract cleaned-up text from a CV file.

    The format is chosen from the file extension (case-insensitive):
    ``pdf`` is handled by ``extract_from_pdf``, ``docx``/``doc`` by
    ``extract_from_docx``; anything else, including names without an
    extension, is decoded as UTF-8 text.

    Args:
        file_bytes: File content as bytes.
        filename: File name, used only to detect the format.
        remove_personal: Whether to run ``remove_pii`` on the result.

    Returns:
        The text with unified line endings, runs of three or more newlines
        collapsed to one blank line, runs of spaces/tabs collapsed to a
        single space and outer whitespace stripped. An empty string when
        nothing but whitespace could be extracted.
    """
    _, dot, suffix = filename.rpartition(".")
    ext = suffix.lower() if dot else ""

    if ext == "pdf":
        text = extract_from_pdf(file_bytes)
    elif ext in ("docx", "doc"):
        # NOTE(review): python-docx presumably cannot parse legacy binary
        # .doc files; those would end up as "" via the error path - confirm.
        text = extract_from_docx(file_bytes)
    else:
        # Plain-text path for txt/md and unknown extensions. "utf-8-sig"
        # drops a leading byte-order mark that would otherwise survive
        # strip(); errors="ignore" means decoding never raises.
        text = file_bytes.decode("utf-8-sig", errors="ignore")

    if not text.strip():
        return ""

    # Unify line endings first so CRLF/CR files collapse like LF ones.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Whitespace normalisation
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = text.strip()

    # PII
    if remove_personal:
        text = remove_pii(text)

    return text


# Keywords that mark the start of a CV section worth keeping. They are
# written without diacritics because every line is folded to unaccented
# lower case before matching.
_SECTION_HEADER_RE = re.compile(
    r"dovednosti|skills|znalosti|kompetence"
    r"|zkusenosti|experience|praxe|pracovni"
    r"|vzdelani|education|skola|univerzita"
    r"|certifikace|certifikaty|certificates"
    r"|projekty|projects"
)


def _fold_diacritics(value: str) -> str:
    """Return *value* with combining accents removed (e.g. "š" -> "s")."""
    decomposed = unicodedata.normalize("NFKD", value)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def summarize_cv(text: str, max_chars: int = 1500) -> str:
    """
    Shorten CV text to a length suitable for an LLM context.

    Text that already fits is returned unchanged. Otherwise lines belonging
    to key sections (skills, experience, education, certificates, projects)
    are collected; a section starts at any line containing one of the
    section keywords, matched case- and diacritics-insensitively. If the
    collected text is longer than 100 characters it replaces the full
    text. The result is then cut at a space and suffixed with "..." so
    that it never exceeds ``max_chars``.

    Args:
        text: Full CV text.
        max_chars: Maximum length of the returned string.

    Returns:
        The original text, the extracted key sections, or a truncated
        version of either.
    """
    if len(text) <= max_chars:
        return text

    important_parts = []
    in_important = False
    for line in text.split("\n"):
        stripped = line.strip()

        # Blank line = possible end of section (simple heuristic): the
        # section is closed only once more than five entries were kept,
        # so a blank line right after a header does not end it.
        if not stripped:
            if in_important and important_parts:
                important_parts.append("")
            if len(important_parts) > 5:
                in_important = False
            continue

        # Is this the header of an important section?
        if _SECTION_HEADER_RE.search(_fold_diacritics(stripped.lower())):
            in_important = True

        if in_important:
            important_parts.append(line)

    # Too little extracted text is not a useful summary; keep the full text.
    joined = "\n".join(important_parts)
    result = joined if len(joined) > 100 else text

    # Trim to max_chars, reserving room for the ellipsis so the final
    # string stays within the limit.
    ellipsis = "..."
    if len(result) > max_chars:
        if max_chars <= len(ellipsis):
            return result[:max(max_chars, 0)]
        head = result[:max_chars - len(ellipsis)]
        # Cut at the last space to avoid ending in the middle of a word.
        result = head.rsplit(" ", 1)[0].rstrip() + ellipsis

    return result