"""Extract plain text from uploaded CV files (PDF / DOCX / TXT). Extraction libs are imported lazily so the app still loads if one is missing; the caller gets a clear error string instead of a crash. """ from __future__ import annotations import os import re def _clean(text: str) -> str: """Light normalisation mirroring the project's preprocessing.""" text = text.replace("\x00", " ") text = re.sub(r"\(cid:\d+\)", " ", text) # unmapped PDF glyphs (icons/ligatures) text = re.sub(r"[ \t]+", " ", text) text = re.sub(r"\n{3,}", "\n\n", text) return "\n".join(line.strip() for line in text.splitlines()).strip() def _space_ratio(text: str) -> float: """Fraction of characters that are spaces. Normal prose ~0.12-0.18; PDFs with glued words ('UniversityofMalaya') drop near ~0.0.""" t = text.strip() return (t.count(" ") / len(t)) if t else 0.0 def _from_pdf(file) -> str: import pdfplumber def extract(pages, **kw): return "\n".join((p.extract_text(**kw) or "") for p in pages) with pdfplumber.open(file) as pdf: pages = pdf.pages text = extract(pages) # Some PDFs encode inter-word spaces as gaps smaller than pdfplumber's # default x_tolerance (3), so words come out glued together. Detect that # via a very low space ratio and re-extract with a tighter tolerance, # keeping it only if it genuinely adds spaces. if _space_ratio(text) < 0.08: tight = extract(pages, x_tolerance=1) if _space_ratio(tight) > _space_ratio(text): text = tight return text def _from_docx(file) -> str: import docx document = docx.Document(file) return "\n".join(p.text for p in document.paragraphs) def _from_txt(file) -> str: raw = file.read() if isinstance(raw, bytes): return raw.decode("utf-8", errors="ignore") return raw def extract_text(file, filename: str | None = None): """Return (text, error). Exactly one is non-empty. `file` is a file-like object (e.g. a Streamlit UploadedFile). """ name = filename or getattr(file, "name", "") or "" ext = os.path.splitext(name)[1].lower() try: if ext == ".pdf": text = _from_pdf(file) elif ext == ".docx": text = _from_docx(file) elif ext == ".txt": text = _from_txt(file) else: return "", f"Unsupported file type: {ext or '(none)'}" except ModuleNotFoundError as e: return "", (f"Missing library for {ext} files ({e.name}). " f"Install dashboard/requirements.txt.") except Exception as e: # noqa: BLE001 - surface any parse error to the UI return "", f"Could not read {name}: {e}" text = _clean(text) if not text: return "", f"No extractable text in {name} (scanned/image PDF?)." return text, ""