# csob-matching / cv_parser.py
# fihus444's picture
# Initial deploy – CSOB AI Matching PoC
# 60730db
"""
cv_parser.py – Text extraction from CVs (PDF/DOCX)
AI Matching Assistant for Open Positions (CSOB)
Supports:
- PDF (pymupdf/fitz)
- DOCX (python-docx)
- Plaintext fallback
Optional: PII redaction (e-mails, phone numbers, URLs, birth numbers)
"""
import re
import sys
def extract_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of every page of an in-memory PDF.

    Args:
        file_bytes: raw content of the PDF file.

    Returns:
        The page texts joined by newlines, or an empty string when
        pymupdf is not installed or the document cannot be read.
    """
    try:
        import fitz  # pymupdf
    except ImportError:
        print("CHYBA: pip install pymupdf")
        return ""

    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in doc)
        finally:
            # Close the document even when a page fails to render, so a
            # broken PDF does not leave the handle open.
            doc.close()
    except Exception as e:
        # Best-effort extraction: report the problem and return no text.
        print(f"Chyba pri cteni PDF: {e}")
        return ""
def extract_from_docx(file_bytes: bytes) -> str:
    """Extract the paragraph text of an in-memory DOCX document.

    Args:
        file_bytes: raw content of the DOCX file.

    Returns:
        All paragraphs that contain non-whitespace text, joined by
        newlines. An empty string is returned when python-docx is not
        installed or the document cannot be read.
    """
    try:
        from docx import Document
        import io
    except ImportError:
        print("CHYBA: pip install python-docx")
        return ""

    try:
        document = Document(io.BytesIO(file_bytes))
        kept = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            # Skip paragraphs that are empty or whitespace only.
            if content.strip():
                kept.append(content)
        return "\n".join(kept)
    except Exception as err:
        # Best-effort extraction: report the problem and return no text.
        print(f"Chyba pri cteni DOCX: {err}")
        return ""
def remove_pii(text: str) -> str:
    """
    Redact common personal identifiers from CV text.

    Replaces e-mail addresses, phone numbers, URLs / LinkedIn links and
    Czech/Slovak birth numbers with bracketed placeholders. Names and
    postal addresses are not detected by these patterns. All other text
    (skills, experience, education) is left as it is.

    Args:
        text: CV text to sanitise.

    Returns:
        The text with every match replaced by [EMAIL], [TELEFON], [URL],
        [LINKEDIN] or [RC].
    """
    # E-mail addresses.
    text = re.sub(r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]', text)
    # Phone numbers: optional international prefix written either as
    # "+420" or "00420", then 3-3-3/4 digit groups with optional
    # separators. Without the "00" alternative a number such as
    # "00420 777 123 456" was only partly matched and its last group
    # stayed in the output.
    text = re.sub(
        r'(?:(?:\+|00)\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}',
        '[TELEFON]',
        text,
    )
    # URLs with a scheme, then scheme-less LinkedIn links.
    text = re.sub(r'https?://\S+', '[URL]', text)
    text = re.sub(r'linkedin\.com/\S+', '[LINKEDIN]', text)
    # Birth number (CZ/SK format). This runs after the phone pattern, so
    # a 9-10 digit birth number written without a slash has already been
    # replaced by [TELEFON] at this point.
    text = re.sub(r'\b\d{6}/?\d{3,4}\b', '[RC]', text)
    return text
def extract_cv_text(file_bytes: bytes, filename: str, remove_personal: bool = True) -> str:
    """
    Main entry point: turn an uploaded CV file into cleaned text.

    The format is chosen from the file extension. Files with an unknown
    or missing extension are decoded as UTF-8 text.

    Args:
        file_bytes: content of the file as bytes
        filename: name of the file (used only to detect the format)
        remove_personal: whether to run PII redaction on the result

    Returns:
        The whitespace-normalised (and optionally redacted) CV text, or
        an empty string when no text could be extracted.
    """
    suffix = ""
    if "." in filename:
        suffix = filename.lower().rsplit(".", 1)[-1]

    if suffix == "pdf":
        raw = extract_from_pdf(file_bytes)
    elif suffix == "docx" or suffix == "doc":
        # NOTE(review): legacy .doc files are sent to the DOCX reader as
        # well; confirm that reader can actually open them.
        raw = extract_from_docx(file_bytes)
    elif suffix == "txt" or suffix == "md":
        raw = file_bytes.decode("utf-8", errors="ignore")
    else:
        # Unknown extension: try to treat the content as text.
        try:
            raw = file_bytes.decode("utf-8", errors="ignore")
        except Exception:
            return ""

    if not raw.strip():
        return ""

    # Collapse runs of 3+ newlines to one blank line, then squeeze runs
    # of spaces/tabs to a single space.
    collapsed = re.sub(r'\n{3,}', '\n\n', raw)
    cleaned = re.sub(r'[ \t]+', ' ', collapsed).strip()

    return remove_pii(cleaned) if remove_personal else cleaned
def summarize_cv(text: str, max_chars: int = 1500) -> str:
    """
    Shorten CV text to at most ``max_chars`` characters for LLM context.

    Text that already fits is returned unchanged. Otherwise the lines
    belonging to key sections (skills, experience, education,
    certificates, projects) are collected. If they add up to more than
    100 characters they replace the full text. The result is then cut at
    a space and terminated with "..." when it is still too long.

    Args:
        text: CV text to shorten.
        max_chars: upper bound on the length of the returned string.

    Returns:
        The shortened text, never longer than ``max_chars`` (for
        ``max_chars`` >= 0).
    """
    if len(text) <= max_chars:
        return text

    # Keywords that mark the header of a section worth keeping.
    sections_priority = [
        r"(?:dovednosti|skills|znalosti|kompetence)",
        r"(?:zkušenosti|experience|praxe|pracovní)",
        r"(?:vzdělání|education|škola|univerzita)",
        r"(?:certifikace|certifikáty|certificates)",
        r"(?:projekty|projects)",
    ]
    # One compiled alternation instead of five searches per line.
    header_re = re.compile("|".join(sections_priority), re.IGNORECASE)

    important_parts = []
    in_important = False
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            # A blank line ends the current section, but only once more
            # than 5 entries have been collected (simple heuristic).
            if in_important and important_parts:
                important_parts.append("")
                if len(important_parts) > 5:
                    in_important = False
            continue
        # A line mentioning a section keyword (re)opens an important section.
        if header_re.search(stripped.lower()):
            in_important = True
        if in_important:
            important_parts.append(line)

    summary = "\n".join(important_parts)
    # Fall back to the full text when the sections found are too short.
    result = summary if len(summary) > 100 else text

    if len(result) <= max_chars:
        return result

    # Reserve room for the ellipsis so the output respects max_chars.
    ellipsis = "..."
    budget = max_chars - len(ellipsis)
    if budget <= 0:
        # No room for an ellipsis: hard cut.
        return result[:max(max_chars, 0)]
    # Cut at the last space inside the budget to avoid splitting a word.
    return result[:budget].rsplit(" ", 1)[0] + ellipsis