Spaces:

fihus
/

csob-matching

Running

File size: 4,594 Bytes

60730db

"""
cv_parser.py – Extrakce textu z CV (PDF/DOCX)
AI Matching Assistant for Open Positions (CSOB)

Podporuje:
  - PDF (pymupdf/fitz)
  - DOCX (python-docx)
  - Plaintext fallback

Volitelne: odstraneni PII (emaily, telefony, URL, rodna cisla)
"""

import re
import sys


def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF supplied as raw bytes.

    Uses PyMuPDF (``fitz``), imported lazily so the module still loads
    when the package is not installed.

    Args:
        file_bytes: Content of the PDF file.

    Returns:
        The text of all pages joined with newlines, or an empty string when
        PyMuPDF is missing or the document cannot be read (a message is
        printed in both cases).
    """
    try:
        import fitz  # pymupdf
    except ImportError:
        print("CHYBA: pip install pymupdf")
        return ""

    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in doc)
        finally:
            # Always release the document, even if a page fails to render
            # its text; otherwise the handle leaks on the error path.
            doc.close()
    except Exception as e:
        # Best-effort extraction: any parser failure yields an empty result.
        print(f"Chyba pri cteni PDF: {e}")
        return ""


def extract_from_docx(file_bytes: bytes) -> str:
    """Extract plain text from a DOCX document supplied as raw bytes.

    Only body paragraphs are read; paragraphs that are empty or contain
    just whitespace are skipped.

    Args:
        file_bytes: Content of the DOCX file.

    Returns:
        The non-empty paragraphs joined with newlines, or an empty string
        when python-docx is missing or the document cannot be parsed
        (a message is printed in both cases).
    """
    try:
        from docx import Document
        import io
    except ImportError:
        print("CHYBA: pip install python-docx")
        return ""

    try:
        document = Document(io.BytesIO(file_bytes))
        non_empty = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            if content.strip():
                non_empty.append(content)
        return "\n".join(non_empty)
    except Exception as e:
        # Best-effort extraction: any parser failure yields an empty result.
        print(f"Chyba pri cteni DOCX: {e}")
        return ""


def remove_pii(text: str) -> str:
    """Replace common personal identifiers in CV text with placeholders.

    The substitutions run in a fixed order, each on the output of the
    previous one:

      1. e-mail addresses            -> ``[EMAIL]``
      2. phone-like digit groups     -> ``[TELEFON]``
      3. ``http://`` / ``https://``  -> ``[URL]``
      4. scheme-less linkedin.com/…  -> ``[LINKEDIN]``
      5. birth-number-like digits    -> ``[RC]``

    These are simple regex heuristics; names and postal addresses are not
    detected.

    Args:
        text: CV text to redact.

    Returns:
        The text with every match replaced by its placeholder.
    """
    substitutions = (
        # E-mail address
        (r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]'),
        # Phone number (optional international prefix, 3-3-3/4 digit groups)
        (r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}', '[TELEFON]'),
        # Absolute URL
        (r'https?://\S+', '[URL]'),
        # LinkedIn profile written without a scheme
        (r'linkedin\.com/\S+', '[LINKEDIN]'),
        # Birth number: six digits, optional slash, three or four digits
        (r'\b\d{6}/?\d{3,4}\b', '[RC]'),
    )

    redacted = text
    for pattern, placeholder in substitutions:
        redacted = re.sub(pattern, placeholder, redacted)
    return redacted


def extract_cv_text(file_bytes: bytes, filename: str, remove_personal: bool = True) -> str:
    """Extract cleaned text from a CV file.

    The format is chosen from the file extension: ``pdf`` goes to
    ``extract_from_pdf``, ``docx``/``doc`` go to ``extract_from_docx``, and
    everything else is decoded as UTF-8 text. Whitespace is then normalised
    and, optionally, personal data is redacted with ``remove_pii``.

    Args:
        file_bytes: Content of the file.
        filename: File name, used only to detect the format.
        remove_personal: Whether to redact personal data from the result.

    Returns:
        The cleaned text, or an empty string when nothing could be
        extracted.
    """
    ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else ""

    if ext == "pdf":
        text = extract_from_pdf(file_bytes)
    elif ext in ("docx", "doc"):
        text = extract_from_docx(file_bytes)
    elif ext in ("txt", "md"):
        # "utf-8-sig" drops a leading BOM, which str.strip() would not remove.
        text = file_bytes.decode("utf-8-sig", errors="ignore")
    else:
        # Unknown extension: try to read it as text, give up quietly if
        # the input cannot be decoded at all.
        try:
            text = file_bytes.decode("utf-8-sig", errors="ignore")
        except Exception:
            return ""

    if not text.strip():
        return ""

    # Whitespace normalisation.
    # Unify line endings first so CRLF/CR files collapse like LF files.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r'[ \t]+', ' ', text)
    # Drop trailing spaces so whitespace-only lines count as blank lines.
    text = re.sub(r' +\n', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = text.strip()

    # PII
    if remove_personal:
        text = remove_pii(text)

    return text


def summarize_cv(text: str, max_chars: int = 1500) -> str:
    """Shorten CV text so it fits into an LLM context.

    Text that already fits is returned unchanged. Otherwise lines are
    collected starting from the first line that mentions a key-section
    keyword (skills, experience, education, certificates, projects, in
    Czech or English). A blank line ends collection only once more than
    five lines have been gathered; a later keyword line starts it again.
    If the collected text is longer than 100 characters it replaces the
    full text. The result is finally cut at ``max_chars``, trimmed back to
    the last space, and suffixed with ``...``.

    Args:
        text: Full CV text.
        max_chars: Length limit applied before the ``...`` suffix is added,
            so a truncated result may exceed it by up to three characters.

    Returns:
        The original or shortened text.
    """
    if len(text) <= max_chars:
        return text

    # One alternation over all key-section keyword groups.
    section_header = re.compile(
        "|".join(
            (
                r"(?:dovednosti|skills|znalosti|kompetence)",
                r"(?:zkušenosti|experience|praxe|pracovní)",
                r"(?:vzdělání|education|škola|univerzita)",
                r"(?:certifikace|certifikáty|certificates)",
                r"(?:projekty|projects)",
            )
        ),
        re.IGNORECASE,
    )

    kept = []
    collecting = False
    for raw_line in text.split("\n"):
        # A line mentioning a section keyword switches collection on.
        if section_header.search(raw_line.lower().strip()):
            collecting = True

        if not raw_line.strip():
            # Blank line: keep it as a separator inside collected content,
            # and stop collecting only after more than five lines are kept.
            if collecting and kept:
                kept.append("")
            if len(kept) > 5:
                collecting = False
            continue

        if collecting:
            kept.append(raw_line)

    condensed = "\n".join(kept)
    # Fall back to the full text when too little was collected.
    result = condensed if kept and len(condensed) > 100 else text

    # Trim to max_chars, cutting at the last space
    if len(result) > max_chars:
        result = result[:max_chars].rsplit(" ", 1)[0] + "..."

    return result