Spaces:
Running
Running
| """Extract plain text from uploaded CV files (PDF / DOCX / TXT). | |
| Extraction libs are imported lazily so the app still loads if one is missing; | |
| the caller gets a clear error string instead of a crash. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import re | |
| def _clean(text: str) -> str: | |
| """Light normalisation mirroring the project's preprocessing.""" | |
| text = text.replace("\x00", " ") | |
| text = re.sub(r"\(cid:\d+\)", " ", text) # unmapped PDF glyphs (icons/ligatures) | |
| text = re.sub(r"[ \t]+", " ", text) | |
| text = re.sub(r"\n{3,}", "\n\n", text) | |
| return "\n".join(line.strip() for line in text.splitlines()).strip() | |
| def _space_ratio(text: str) -> float: | |
| """Fraction of characters that are spaces. Normal prose ~0.12-0.18; | |
| PDFs with glued words ('UniversityofMalaya') drop near ~0.0.""" | |
| t = text.strip() | |
| return (t.count(" ") / len(t)) if t else 0.0 | |
def _from_pdf(file) -> str:
    """Return the text of all pages of a PDF, joined with newlines.

    pdfplumber is imported here rather than at module level so the module
    still loads when the library is not installed.
    """
    import pdfplumber

    def page_text(page, **options) -> str:
        # extract_text() may return None; treat that as an empty page.
        return page.extract_text(**options) or ""

    with pdfplumber.open(file) as pdf:
        pages = pdf.pages
        default_text = "\n".join(page_text(page) for page in pages)
        default_ratio = _space_ratio(default_text)
        if default_ratio >= 0.08:
            return default_text

        # Some PDFs encode inter-word spaces as gaps smaller than
        # pdfplumber's default x_tolerance (3), so words come out glued
        # together. A very low space ratio signals that; retry with a
        # tighter tolerance and keep the retry only if it genuinely adds
        # spaces.
        tight_text = "\n".join(page_text(page, x_tolerance=1) for page in pages)
        if _space_ratio(tight_text) > default_ratio:
            return tight_text
        return default_text
def _from_docx(file) -> str:
    """Return the text of a DOCX file, one output line per paragraph.

    The `docx` module is imported lazily so the module loads without it.
    """
    import docx

    # NOTE(review): only `document.paragraphs` is read; text placed inside
    # tables is presumably not included - confirm whether table-based CV
    # layouts need separate handling.
    paragraphs = docx.Document(file).paragraphs
    return "\n".join([paragraph.text for paragraph in paragraphs])
| def _from_txt(file) -> str: | |
| raw = file.read() | |
| if isinstance(raw, bytes): | |
| return raw.decode("utf-8", errors="ignore") | |
| return raw | |
def extract_text(file, filename: str | None = None):
    """Extract cleaned text from an uploaded CV; return ``(text, error)``.

    Exactly one element of the pair is non-empty: the text on success, or a
    human-readable error message on failure.

    `file` is a file-like object (e.g. a Streamlit UploadedFile).
    `filename` overrides ``file.name`` when choosing the parser; the choice
    is made from the lower-cased extension (.pdf, .docx or .txt).
    """
    label = filename or getattr(file, "name", "") or ""
    suffix = os.path.splitext(label)[1].lower()

    # Reject unknown extensions up front; no parser below can handle them.
    if suffix not in (".pdf", ".docx", ".txt"):
        return "", f"Unsupported file type: {suffix or '(none)'}"

    try:
        if suffix == ".pdf":
            raw_text = _from_pdf(file)
        elif suffix == ".docx":
            raw_text = _from_docx(file)
        else:
            raw_text = _from_txt(file)
    except ModuleNotFoundError as missing:
        # The parsers import their libraries lazily, so a missing package
        # shows up here instead of at module import time.
        message = (f"Missing library for {suffix} files ({missing.name}). "
                   f"Install dashboard/requirements.txt.")
        return "", message
    except Exception as exc:  # noqa: BLE001 - surface any parse error to the UI
        return "", f"Could not read {label}: {exc}"

    cleaned = _clean(raw_text)
    if cleaned:
        return cleaned, ""
    return "", f"No extractable text in {label} (scanned/image PDF?)."