File size: 2,906 Bytes
c59578d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""Extract plain text from uploaded CV files (PDF / DOCX / TXT).

Extraction libs are imported lazily so the app still loads if one is missing;
the caller gets a clear error string instead of a crash.
"""
from __future__ import annotations

import os
import re


def _clean(text: str) -> str:
    """Normalise raw extracted text into tidy, line-oriented plain text.

    Described by the original author as mirroring the project's
    preprocessing (that code is not part of this module).

    Steps, in order:
      1. NUL characters become spaces.
      2. ``(cid:N)`` placeholders (unmapped PDF glyphs such as icons or
         ligatures) become spaces.
      3. Runs of spaces/tabs collapse to a single space.
      4. Every line is stripped of surrounding whitespace and all line
         terminators are normalised to ``\\n``.
      5. Runs of three or more newlines collapse to exactly one blank line.
      6. Leading/trailing whitespace of the whole text is removed.

    Args:
        text: Raw text as returned by one of the extractors.

    Returns:
        The cleaned text; may be the empty string.
    """
    text = text.replace("\x00", " ")
    text = re.sub(r"\(cid:\d+\)", " ", text)   # unmapped PDF glyphs (icons/ligatures)
    text = re.sub(r"[ \t]+", " ", text)
    # Strip each line *before* collapsing blank runs: a "blank" line that
    # still holds a space, or "\r\n" line endings, would otherwise hide
    # consecutive newlines from the \n{3,} pattern and survive as 3+ blank
    # lines in the output.
    text = "\n".join(line.strip() for line in text.splitlines())
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def _space_ratio(text: str) -> float:
    """Return the share of space characters in *text*.

    The text is stripped of surrounding whitespace first; only the plain
    space character (``" "``) is counted. Empty or whitespace-only input
    yields ``0.0``.

    Per the original author's note, normal prose sits around 0.12-0.18,
    while PDFs whose words come out glued together
    ('UniversityofMalaya') fall to roughly 0.0.
    """
    stripped = text.strip()
    if not stripped:
        return 0.0
    spaces = stripped.count(" ")
    return spaces / len(stripped)


def _from_pdf(file) -> str:
    """Extract text from every page of a PDF, joined by newlines.

    pdfplumber is imported here rather than at module level so the module
    can still be imported when the library is not installed.

    Some PDFs encode inter-word spaces as gaps smaller than pdfplumber's
    default x_tolerance (3), which glues words together. When the first
    pass has a very low space ratio (< 0.08) the pages are extracted again
    with ``x_tolerance=1``, and that second result is used only if its
    space ratio is strictly higher than the first pass.

    Args:
        file: Whatever ``pdfplumber.open`` accepts (the caller passes a
            file-like object).

    Returns:
        The extracted text, not yet cleaned.
    """
    import pdfplumber

    def page_text(all_pages, **options):
        # A page may yield None from extract_text; treat that as empty.
        chunks = [(page.extract_text(**options) or "") for page in all_pages]
        return "\n".join(chunks)

    with pdfplumber.open(file) as pdf:
        all_pages = pdf.pages
        default_text = page_text(all_pages)

        # Healthy amount of spaces: the default extraction is good enough.
        if not _space_ratio(default_text) < 0.08:
            return default_text

        # Words look glued together; retry with a tighter tolerance and
        # keep the retry only if it genuinely adds spaces.
        tight_text = page_text(all_pages, x_tolerance=1)
        if _space_ratio(tight_text) > _space_ratio(default_text):
            return tight_text
        return default_text


def _from_docx(file) -> str:
    """Return the text of a DOCX file's paragraphs, one per line.

    The ``docx`` package is imported here rather than at module level so
    the module can still be imported when the library is not installed.
    Only ``document.paragraphs`` is read.

    Args:
        file: Whatever ``docx.Document`` accepts (the caller passes a
            file-like object).

    Returns:
        The paragraph texts joined with newlines, not yet cleaned.
    """
    import docx

    paragraphs = docx.Document(file).paragraphs
    lines = [para.text for para in paragraphs]
    return "\n".join(lines)


def _from_txt(file) -> str:
    """Read a plain-text upload and return its contents as ``str``.

    Args:
        file: File-like object whose ``read()`` returns ``bytes`` or ``str``.

    Returns:
        The decoded text. Anything other than ``bytes`` returned by
        ``read()`` is passed through unchanged. Bytes starting with a
        UTF-16 byte-order mark are decoded as UTF-16; all other bytes are
        decoded as UTF-8 with a leading BOM (if any) removed. Undecodable
        byte sequences are dropped rather than raising.
    """
    import codecs

    raw = file.read()
    if not isinstance(raw, bytes):
        return raw
    if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        # Such bytes are never valid UTF-8; decoding them as UTF-8 with
        # errors ignored would leave a NUL after every ASCII character.
        # The "utf-16" codec uses the BOM to pick the byte order and
        # drops it from the result.
        return raw.decode("utf-16", errors="ignore")
    # "utf-8-sig" is UTF-8 that also swallows a leading BOM; plain "utf-8"
    # would keep it as U+FEFF, which str.strip() does not remove.
    return raw.decode("utf-8-sig", errors="ignore")


def extract_text(file, filename: str | None = None):
    """Return (text, error). Exactly one is non-empty.

    `file` is a file-like object (e.g. a Streamlit UploadedFile).
    """
    name = filename or getattr(file, "name", "") or ""
    ext = os.path.splitext(name)[1].lower()
    try:
        if ext == ".pdf":
            text = _from_pdf(file)
        elif ext == ".docx":
            text = _from_docx(file)
        elif ext == ".txt":
            text = _from_txt(file)
        else:
            return "", f"Unsupported file type: {ext or '(none)'}"
    except ModuleNotFoundError as e:
        return "", (f"Missing library for {ext} files ({e.name}). "
                    f"Install dashboard/requirements.txt.")
    except Exception as e:  # noqa: BLE001 - surface any parse error to the UI
        return "", f"Could not read {name}: {e}"

    text = _clean(text)
    if not text:
        return "", f"No extractable text in {name} (scanned/image PDF?)."
    return text, ""