""" cv_parser.py – Extrakce textu z CV (PDF/DOCX) AI Matching Assistant for Open Positions (CSOB) Podporuje: - PDF (pymupdf/fitz) - DOCX (python-docx) - Plaintext fallback Volitelne: odstraneni PII (jmena, telefony, emaily) """ import re import sys def extract_from_pdf(file_bytes: bytes) -> str: """Extrahuj text z PDF souboru.""" try: import fitz # pymupdf except ImportError: print("CHYBA: pip install pymupdf") return "" try: doc = fitz.open(stream=file_bytes, filetype="pdf") text_parts = [] for page in doc: text_parts.append(page.get_text("text")) doc.close() return "\n".join(text_parts) except Exception as e: print(f"Chyba pri cteni PDF: {e}") return "" def extract_from_docx(file_bytes: bytes) -> str: """Extrahuj text z DOCX souboru.""" try: from docx import Document import io except ImportError: print("CHYBA: pip install python-docx") return "" try: doc = Document(io.BytesIO(file_bytes)) text_parts = [para.text for para in doc.paragraphs if para.text.strip()] return "\n".join(text_parts) except Exception as e: print(f"Chyba pri cteni DOCX: {e}") return "" def remove_pii(text: str) -> str: """ Odstran osobni udaje z textu CV. Odebere: emaily, telefony, adresy (zakladni heuristika). Ponecha: dovednosti, zkusenosti, vzdelani. """ # Email text = re.sub(r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]', text) # Telefon (ruzne formaty CZ/SK/mezinarodni) text = re.sub(r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}', '[TELEFON]', text) # URL / LinkedIn text = re.sub(r'https?://\S+', '[URL]', text) text = re.sub(r'linkedin\.com/\S+', '[LINKEDIN]', text) # Rodne cislo (CZ/SK format) text = re.sub(r'\b\d{6}/?\d{3,4}\b', '[RC]', text) return text def extract_cv_text(file_bytes: bytes, filename: str, remove_personal: bool = True) -> str: """ Hlavni funkce – extrahuje text z CV souboru. Args: file_bytes: obsah souboru jako bytes filename: nazev souboru (pro detekci formatu) remove_personal: zda odstranit PII Returns: Vycisteny text z CV """ ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else "" if ext == "pdf": text = extract_from_pdf(file_bytes) elif ext in ("docx", "doc"): text = extract_from_docx(file_bytes) elif ext in ("txt", "md"): text = file_bytes.decode("utf-8", errors="ignore") else: # Zkus jako text try: text = file_bytes.decode("utf-8", errors="ignore") except Exception: return "" if not text.strip(): return "" # Normalizace whitespace text = re.sub(r'\n{3,}', '\n\n', text) text = re.sub(r'[ \t]+', ' ', text) text = text.strip() # PII if remove_personal: text = remove_pii(text) return text def summarize_cv(text: str, max_chars: int = 1500) -> str: """ Zkrat CV text na rozumnou delku pro LLM kontext. Zachova klicove sekce (dovednosti, zkusenosti, vzdelani). """ if len(text) <= max_chars: return text # Zkus najit klicove sekce sections_priority = [ r"(?:dovednosti|skills|znalosti|kompetence)", r"(?:zkušenosti|experience|praxe|pracovní)", r"(?:vzdělání|education|škola|univerzita)", r"(?:certifikace|certifikáty|certificates)", r"(?:projekty|projects)", ] important_parts = [] lines = text.split("\n") in_important = False for line in lines: line_lower = line.lower().strip() # Je to hlavicka dulezite sekce? for pattern in sections_priority: if re.search(pattern, line_lower, re.IGNORECASE): in_important = True break # Prazdny radek = konec sekce (jednoducha heuristika) if not line.strip(): if in_important and important_parts: important_parts.append("") in_important = False if len(important_parts) > 5 else in_important continue if in_important: important_parts.append(line) if important_parts and len("\n".join(important_parts)) > 100: result = "\n".join(important_parts) else: result = text # Oriznout na max_chars if len(result) > max_chars: result = result[:max_chars].rsplit(" ", 1)[0] + "..." return result