Spaces:

fihus
/

csob-matching

Running

App Files Files Community

csob-matching / cv_parser.py

fihus444

Initial deploy – CSOB AI Matching PoC

60730db 20 days ago

raw

history blame contribute delete

4.59 kB

	"""
	cv_parser.py – Text extraction from CVs (PDF/DOCX)
	AI Matching Assistant for Open Positions (CSOB)

	Supports:
	- PDF (pymupdf/fitz)
	- DOCX (python-docx)
	- Plaintext fallback

	Optional: removal of PII (e-mails, phone numbers, URLs, birth numbers;
	personal names are not detected)
	"""

	import re
	import sys


	def extract_from_pdf(file_bytes: bytes) -> str:
	    """Extract the plain text of every page of a PDF document.

	    Args:
	        file_bytes: Raw content of the PDF file.

	    Returns:
	        The text of all pages joined with newlines. An empty string is
	        returned (and a message printed) when PyMuPDF is not installed or
	        when the document cannot be opened or read.
	    """
	    try:
	        import fitz  # pymupdf
	    except ImportError:
	        print("CHYBA: pip install pymupdf")
	        return ""

	    try:
	        doc = fitz.open(stream=file_bytes, filetype="pdf")
	        try:
	            return "\n".join(page.get_text("text") for page in doc)
	        finally:
	            # Always release the document, including when reading a page
	            # raises part-way through.
	            doc.close()
	    except Exception as e:
	        # Best-effort extraction: report the problem and yield no text.
	        print(f"Chyba pri cteni PDF: {e}")
	        return ""


	def extract_from_docx(file_bytes: bytes) -> str:
	    """Extract paragraph text from a DOCX document.

	    Only the text of ``document.paragraphs`` is collected; paragraphs that
	    are empty or contain only whitespace are skipped.

	    Args:
	        file_bytes: Raw content of the DOCX file.

	    Returns:
	        The non-empty paragraphs joined with newlines. An empty string is
	        returned (and a message printed) when python-docx is not installed
	        or when the document cannot be read.
	    """
	    try:
	        import io
	        from docx import Document
	    except ImportError:
	        print("CHYBA: pip install python-docx")
	        return ""

	    try:
	        document = Document(io.BytesIO(file_bytes))
	        non_empty = [p.text for p in document.paragraphs if p.text.strip()]
	    except Exception as err:
	        # Best-effort extraction: report the problem and yield no text.
	        print(f"Chyba pri cteni DOCX: {err}")
	        return ""
	    return "\n".join(non_empty)


	def remove_pii(text: str) -> str:
	    """Mask common personal identifiers in CV text with placeholder tags.

	    The substitutions run in a fixed order: e-mail addresses, phone
	    numbers, http(s) URLs, scheme-less LinkedIn links and finally CZ/SK
	    birth numbers. Because the phone pattern runs before the birth-number
	    pattern, a 9-10 digit run without a slash is tagged ``[TELEFON]``
	    rather than ``[RC]``. Everything else (skills, experience, education)
	    is left untouched; personal names are not detected.

	    Args:
	        text: CV text to sanitise.

	    Returns:
	        The text with matches replaced by ``[EMAIL]``, ``[TELEFON]``,
	        ``[URL]``, ``[LINKEDIN]`` and ``[RC]``.
	    """
	    substitutions = (
	        # E-mail addresses
	        (r'\b[\w.+-]+@[\w.-]+\.\w{2,}\b', '[EMAIL]'),
	        # Phone numbers (various CZ/SK/international formats)
	        (r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{3,4}', '[TELEFON]'),
	        # URLs with an explicit scheme, then bare LinkedIn links
	        (r'https?://\S+', '[URL]'),
	        (r'linkedin\.com/\S+', '[LINKEDIN]'),
	        # Birth number (CZ/SK format, slash optional)
	        (r'\b\d{6}/?\d{3,4}\b', '[RC]'),
	    )

	    masked = text
	    for pattern, placeholder in substitutions:
	        masked = re.sub(pattern, placeholder, masked)
	    return masked


	def extract_cv_text(
	    file_bytes: bytes, filename: str, remove_personal: bool = True
	) -> str:
	    """Extract and clean the text of a CV file.

	    The format is chosen from the filename extension (case-insensitive):
	    ``pdf`` goes to ``extract_from_pdf``, ``docx``/``doc`` to
	    ``extract_from_docx``, and anything else is decoded as UTF-8 text.

	    Args:
	        file_bytes: Content of the file as bytes.
	        filename: Name of the file, used only to detect the format.
	        remove_personal: Whether to mask PII via ``remove_pii``.

	    Returns:
	        The cleaned CV text, or an empty string when nothing could be
	        extracted.
	    """
	    ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else ""

	    if ext == "pdf":
	        text = extract_from_pdf(file_bytes)
	    elif ext in ("docx", "doc"):
	        # NOTE(review): legacy ".doc" files are sent to the DOCX reader;
	        # confirm python-docx can actually open them.
	        text = extract_from_docx(file_bytes)
	    elif ext in ("txt", "md"):
	        # "utf-8-sig" decodes like UTF-8 but drops a leading byte-order mark.
	        text = file_bytes.decode("utf-8-sig", errors="ignore")
	    else:
	        # Unknown extension: try it as text. Undecodable bytes are dropped by
	        # errors="ignore", so the guard only matters for non-bytes input.
	        try:
	            text = file_bytes.decode("utf-8-sig", errors="ignore")
	        except Exception:
	            return ""

	    if not text.strip():
	        return ""

	    # Whitespace normalisation. Line endings and spaces are cleaned first so
	    # that CRLF files and whitespace-only lines collapse as blank lines too.
	    text = text.replace("\r\n", "\n").replace("\r", "\n")
	    text = re.sub(r'[ \t]+', ' ', text)
	    text = re.sub(r' \n', '\n', text)
	    text = re.sub(r'\n{3,}', '\n\n', text)
	    text = text.strip()

	    if remove_personal:
	        text = remove_pii(text)

	    return text


	# Czech and English keywords that mark a line as the heading of a section
	# worth keeping (skills, experience, education, certificates, projects).
	# Plain "|" is regex alternation; an escaped "\|" would match a literal pipe.
	_SECTION_KEYWORDS_RE = re.compile(
	    r"dovednosti|skills|znalosti|kompetence"
	    r"|zkušenosti|experience|praxe|pracovní"
	    r"|vzdělání|education|škola|univerzita"
	    r"|certifikace|certifikáty|certificates"
	    r"|projekty|projects",
	    re.IGNORECASE,
	)


	def summarize_cv(text: str, max_chars: int = 1500) -> str:
	    """Shorten CV text to a length suitable for an LLM context.

	    Text that already fits is returned unchanged. Otherwise lines are
	    collected starting at any line containing a section keyword. A blank
	    line ends the collection only once more than five entries have been
	    gathered, so short sections do not cut the summary off early. If the
	    collected text is 100 characters or shorter, the full text is used
	    instead.

	    Args:
	        text: Full CV text.
	        max_chars: Length at which the result is cut.

	    Returns:
	        The shortened text. When a cut is needed it is made at the last
	        space before ``max_chars`` and ``"..."`` is appended, so the result
	        may be up to three characters longer than ``max_chars``.
	    """
	    if len(text) <= max_chars:
	        return text

	    important_parts = []
	    in_important = False
	    for line in text.split("\n"):
	        stripped = line.strip()

	        # A keyword anywhere on the line starts (or continues) a section.
	        if _SECTION_KEYWORDS_RE.search(stripped):
	            in_important = True

	        # Blank line: keep it as a separator, and treat it as the end of
	        # the section only after enough content has been gathered.
	        if not stripped:
	            if in_important and important_parts:
	                important_parts.append("")
	            if len(important_parts) > 5:
	                in_important = False
	            continue

	        if in_important:
	            important_parts.append(line)

	    collected = "\n".join(important_parts)
	    result = collected if len(collected) > 100 else text

	    # Cut at a word boundary and mark the truncation.
	    if len(result) > max_chars:
	        result = result[:max_chars].rsplit(" ", 1)[0] + "..."

	    return result