Spaces:
Running
Running
File size: 2,906 Bytes
"""Extract plain text from uploaded CV files (PDF / DOCX / TXT).
Extraction libs are imported lazily so the app still loads if one is missing;
the caller gets a clear error string instead of a crash.
"""
from __future__ import annotations
import os
import re
def _clean(text: str) -> str:
    """Lightly normalise raw extracted text into tidy plain text.

    Steps, in order: NUL characters and ``(cid:N)`` placeholders become
    spaces, runs of spaces/tabs collapse to a single space, every line is
    stripped, runs of blank lines collapse to one blank line, and finally the
    whole result is stripped.

    Args:
        text: Raw text produced by one of the extractors.

    Returns:
        The cleaned text; an empty string if only whitespace remains.
    """
    text = text.replace("\x00", " ")
    text = re.sub(r"\(cid:\d+\)", " ", text)  # unmapped PDF glyphs (icons/ligatures)
    text = re.sub(r"[ \t]+", " ", text)
    # Strip each line *before* collapsing blank runs. Otherwise lines that
    # hold only spaces (or "\r\n" line endings) hide from the "\n{3,}" pattern
    # and re-emerge as three or more consecutive newlines after stripping.
    text = "\n".join(line.strip() for line in text.splitlines())
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def _space_ratio(text: str) -> float:
    """Return the share of space characters in *text*.

    Leading and trailing whitespace is removed before measuring, and only the
    plain space character is counted. Ordinary prose sits around 0.12-0.18,
    while PDFs whose words come out glued together ('UniversityofMalaya')
    fall close to 0.0. Empty or whitespace-only input yields 0.0.
    """
    stripped = text.strip()
    if not stripped:
        return 0.0
    spaces = stripped.count(" ")
    return spaces / len(stripped)
def _from_pdf(file) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    `file` is whatever ``pdfplumber.open`` accepts. The extraction is retried
    with a tighter ``x_tolerance`` when the first pass looks like it lost its
    inter-word spaces.
    """
    import pdfplumber  # imported lazily so the module loads without it

    def page_texts(doc_pages, **options):
        # A page with no extractable text contributes an empty string.
        chunks = []
        for page in doc_pages:
            chunks.append(page.extract_text(**options) or "")
        return "\n".join(chunks)

    with pdfplumber.open(file) as pdf:
        all_pages = pdf.pages
        result = page_texts(all_pages)
        baseline = _space_ratio(result)
        # Some PDFs encode inter-word spaces as gaps narrower than
        # pdfplumber's default x_tolerance (3), which glues words together.
        # A very low space ratio signals that; retry with a tighter tolerance
        # and keep the retry only when it really yields more spaces.
        if baseline < 0.08:
            retry = page_texts(all_pages, x_tolerance=1)
            if _space_ratio(retry) > baseline:
                result = retry
    return result
def _from_docx(file) -> str:
    """Return the text of a DOCX document, one paragraph per line.

    `file` is passed straight to ``docx.Document``.
    """
    import docx  # imported lazily so the module loads without it

    # NOTE(review): only `document.paragraphs` is read; presumably text held
    # in tables is not included - confirm whether table-based CVs matter.
    paragraphs = docx.Document(file).paragraphs
    lines = [para.text for para in paragraphs]
    return "\n".join(lines)
def _from_txt(file) -> str:
    """Read a plain-text upload and return its contents as ``str``.

    Args:
        file: File-like object whose ``read()`` returns ``bytes`` (or
            ``bytearray``) or ``str``.

    Returns:
        The text with any leading byte-order mark removed. Bytes are decoded
        as UTF-8; undecodable bytes are dropped rather than raising.
    """
    raw = file.read()
    if isinstance(raw, (bytes, bytearray)):
        # "utf-8-sig" decodes like UTF-8 but also consumes a leading BOM.
        # Left in place, the BOM becomes "\ufeff", which str.strip() does not
        # treat as whitespace, so a BOM-only file would look non-empty.
        return raw.decode("utf-8-sig", errors="ignore")
    if isinstance(raw, str) and raw.startswith("\ufeff"):
        # Text-mode streams opened as plain UTF-8 keep the BOM as a character.
        return raw[1:]
    return raw
def extract_text(file, filename: str | None = None) -> tuple[str, str]:
    """Extract cleaned plain text from an uploaded CV file.

    The extractor is chosen from the file extension (``.pdf``, ``.docx`` or
    ``.txt``, case-insensitive). Problems are reported through the return
    value instead of being raised.

    Args:
        file: A file-like object (e.g. a Streamlit UploadedFile).
        filename: Name used to determine the extension. When omitted, the
            object's ``name`` attribute is used if it has one.

    Returns:
        ``(text, error)``. Exactly one of the two is non-empty.
    """
    raw_name = filename or getattr(file, "name", "") or ""
    try:
        # Accepts str, bytes and os.PathLike names alike.
        name = os.fsdecode(raw_name)
    except TypeError:
        # e.g. an integer file descriptor exposed as `.name`; treat it as
        # "no usable name" rather than crashing before the error handling.
        name = ""
    ext = os.path.splitext(name)[1].lower()

    try:
        if ext == ".pdf":
            text = _from_pdf(file)
        elif ext == ".docx":
            text = _from_docx(file)
        elif ext == ".txt":
            text = _from_txt(file)
        else:
            return "", f"Unsupported file type: {ext or '(none)'}"
    except ModuleNotFoundError as e:
        # The extraction libraries are imported lazily inside the helpers.
        return "", (f"Missing library for {ext} files ({e.name}). "
                    f"Install dashboard/requirements.txt.")
    except Exception as e:  # noqa: BLE001 - surface any parse error to the UI
        return "", f"Could not read {name}: {e}"

    text = _clean(text)
    if not text:
        # The scanned-document hint only makes sense for PDFs.
        hint = " (scanned/image PDF?)" if ext == ".pdf" else ""
        return "", f"No extractable text in {name}{hint}."
    return text, ""
|