Spaces:
Running
Running
| """ | |
cv_parser.py – Text extraction from CVs (PDF/DOCX)
AI Matching Assistant for Open Positions (CSOB)

Supported inputs:
- PDF (pymupdf/fitz)
- DOCX (python-docx)
- Plain-text fallback

Optional: masking of PII (e-mail addresses, phone numbers, URLs,
LinkedIn links, birth numbers). Personal names are not detected.
| """ | |
| import re | |
| import sys | |
def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    PyMuPDF (``fitz``) is imported lazily so that the module can still be
    imported when the dependency is not installed.

    Args:
        file_bytes: raw content of the PDF file.

    Returns:
        The text of all pages joined with newlines, or an empty string when
        PyMuPDF is missing or the document cannot be read. Failures are
        reported on stdout, never raised.
    """
    try:
        import fitz  # pymupdf
    except ImportError:
        print("CHYBA: pip install pymupdf")
        return ""

    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in doc)
        finally:
            # Close the document even when a page fails to render, so a
            # broken PDF does not leave the handle open.
            doc.close()
    except Exception as e:
        # Best-effort extraction: any parsing error degrades to "no text".
        print(f"Chyba pri cteni PDF: {e}")
        return ""
def extract_from_docx(file_bytes: bytes) -> str:
    """Extract the text of the non-blank paragraphs of an in-memory DOCX.

    python-docx is imported lazily so that the module can still be imported
    when the dependency is not installed. Text is taken from
    ``Document.paragraphs`` only.

    Args:
        file_bytes: raw content of the DOCX file.

    Returns:
        The paragraph texts joined with newlines, or an empty string when
        python-docx is missing or the file cannot be parsed. Failures are
        reported on stdout, never raised.
    """
    try:
        from docx import Document
        import io
    except ImportError:
        print("CHYBA: pip install python-docx")
        return ""

    try:
        document = Document(io.BytesIO(file_bytes))
        kept = []
        for paragraph in document.paragraphs:
            # Skip paragraphs that are empty or whitespace-only.
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n".join(kept)
    except Exception as err:
        # Best-effort extraction: any parsing error degrades to "no text".
        print(f"Chyba pri cteni DOCX: {err}")
        return ""
def remove_pii(text: str) -> str:
    """Mask common personal identifiers in CV text with bracketed tags.

    The rules are applied in a fixed order, each to the output of the
    previous one: e-mail addresses, phone numbers, http(s) URLs, scheme-less
    ``linkedin.com/...`` links and Czech/Slovak birth numbers. No rule
    targets personal names or postal addresses, so those are left in place,
    as is everything else (skills, experience, education).

    Args:
        text: CV text to sanitise.

    Returns:
        The text with every match replaced by its placeholder.
    """
    # Order matters: the phone rule runs before the birth-number rule, so an
    # unseparated 9-10 digit run is consumed as [TELEFON] first.
    substitutions = (
        # E-mail address
        (r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]'),
        # Phone number (various CZ/SK/international formats)
        (r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}', '[TELEFON]'),
        # URL with an explicit scheme
        (r'https?://\S+', '[URL]'),
        # LinkedIn link written without a scheme
        (r'linkedin\.com/\S+', '[LINKEDIN]'),
        # Birth number (CZ/SK format)
        (r'\b\d{6}/?\d{3,4}\b', '[RC]'),
    )
    masked = text
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    return masked
def extract_cv_text(file_bytes: bytes, filename: str, remove_personal: bool = True) -> str:
    """Extract cleaned text from a CV file; main entry point of the module.

    The format is chosen from the file extension (case-insensitive):
    ``pdf`` goes to ``extract_from_pdf``, ``docx``/``doc`` go to
    ``extract_from_docx``, ``txt``/``md`` are decoded as UTF-8, and anything
    else is tentatively decoded as UTF-8 as well. Undecodable bytes are
    dropped.

    Args:
        file_bytes: content of the file as bytes.
        filename: file name, used only to detect the format.
        remove_personal: whether to pass the result through ``remove_pii``.

    Returns:
        The whitespace-normalised (and optionally PII-masked) text, or an
        empty string when nothing could be extracted.
    """
    ext = ""
    if "." in filename:
        ext = filename.lower().rsplit(".", 1)[-1]

    if ext == "pdf":
        raw = extract_from_pdf(file_bytes)
    elif ext == "docx" or ext == "doc":
        raw = extract_from_docx(file_bytes)
    elif ext == "txt" or ext == "md":
        raw = file_bytes.decode("utf-8", errors="ignore")
    else:
        # Unknown or missing extension: try to treat the payload as text.
        try:
            raw = file_bytes.decode("utf-8", errors="ignore")
        except Exception:
            return ""

    if not raw.strip():
        return ""

    # Whitespace normalisation: at most one blank line in a row, and runs of
    # spaces/tabs collapsed to a single space.
    squeezed = re.sub(r'\n{3,}', '\n\n', raw)
    squeezed = re.sub(r'[ \t]+', ' ', squeezed)
    cleaned = squeezed.strip()

    return remove_pii(cleaned) if remove_personal else cleaned
def summarize_cv(text: str, max_chars: int = 1500) -> str:
    """Shorten CV text to a length suitable for an LLM context window.

    Text that already fits is returned unchanged. Otherwise the lines that
    follow a recognised section heading (skills, experience, education,
    certificates, projects; Czech or English keywords) are collected. If
    that yields more than 100 characters it becomes the summary, else the
    full text is used. The result is then cut at a space and suffixed with
    "..." when it is still too long.

    Args:
        text: CV text, typically the output of the extraction step.
        max_chars: upper bound on the length of the returned string.

    Returns:
        A string no longer than ``max_chars`` (for non-negative
        ``max_chars``), including the trailing ellipsis when truncated.
    """
    if len(text) <= max_chars:
        return text

    # Keywords that mark the start of a section worth keeping.
    sections_priority = [
        r"(?:dovednosti|skills|znalosti|kompetence)",
        r"(?:zkušenosti|experience|praxe|pracovní)",
        r"(?:vzdělání|education|škola|univerzita)",
        r"(?:certifikace|certifikáty|certificates)",
        r"(?:projekty|projects)",
    ]
    # One combined pattern, compiled once instead of once per line.
    heading_re = re.compile("|".join(sections_priority), re.IGNORECASE)

    important_parts = []
    in_important = False
    for line in text.split("\n"):
        # Any line mentioning a section keyword switches collection on.
        if heading_re.search(line.lower().strip()):
            in_important = True

        if not line.strip():
            # A blank line ends the section, but only once more than five
            # entries have been collected (simple heuristic).
            if in_important and important_parts:
                important_parts.append("")
                if len(important_parts) > 5:
                    in_important = False
            continue

        if in_important:
            important_parts.append(line)

    candidate = "\n".join(important_parts)
    # Fall back to the whole text when the sections yield too little.
    result = candidate if len(candidate) > 100 else text

    if len(result) > max_chars:
        ellipsis = "..."
        budget = max_chars - len(ellipsis)
        if budget <= 0:
            # No room for an ellipsis: hard cut to the limit.
            return result[:max(max_chars, 0)]
        # Reserve room for the ellipsis so the limit is honoured, and drop
        # the last (possibly partial) word by cutting at the final space.
        result = result[:budget].rsplit(" ", 1)[0] + ellipsis
    return result