File size: 2,265 Bytes
b09b8a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from __future__ import annotations

from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Dict, Tuple

try:
    from docling.document_converter import DocumentConverter
except ImportError:  # pragma: no cover - optional dependency
    DocumentConverter = None  # type: ignore[assignment]


def convert_uploaded_file_to_text(uploaded_file) -> Tuple[str, Dict[str, Any]]:
    """Convert an uploaded Streamlit file to text/markdown.

    - For .txt and .md, returns the content decoded as UTF-8 (a leading BOM is
      stripped and undecodable bytes are dropped).
    - For every other extension, uses Docling when installed.
    - Raises a RuntimeError with a user-friendly message when Docling is required
      but not installed.

    Args:
        uploaded_file: File-like upload object. Must expose ``name``; ``size``
            and ``type`` are read when present. The text branch uses
            ``getvalue()`` when available (falling back to ``read()``); the
            Docling branch uses ``getbuffer()``.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` contains ``filename``,
        ``ext`` (lower-cased, without the dot), ``size_bytes``,
        ``content_type`` and ``converted_by`` (``"raw"`` or ``"docling"``).

    Raises:
        RuntimeError: If the file is not .txt/.md and Docling is not installed.
    """
    filename = uploaded_file.name
    ext = Path(filename).suffix.lower().lstrip(".")
    size_bytes = getattr(uploaded_file, "size", None)
    content_type = getattr(uploaded_file, "type", None)

    metadata: Dict[str, Any] = {
        "filename": filename,
        "ext": ext,
        "size_bytes": size_bytes,
        "content_type": content_type,
    }

    # Plain text / markdown: read directly.
    if ext in {"txt", "md"}:
        # Prefer getvalue(): unlike read(), it does not depend on the current
        # stream position, so an upload that was already read elsewhere still
        # yields its full content (matching getbuffer() in the Docling branch).
        getvalue = getattr(uploaded_file, "getvalue", None)
        raw_bytes = getvalue() if callable(getvalue) else uploaded_file.read()
        # "utf-8-sig" drops a leading byte-order mark so it does not end up as
        # a stray U+FEFF at the start of the returned text.
        text = raw_bytes.decode("utf-8-sig", errors="ignore")
        metadata["converted_by"] = "raw"
        return text, metadata

    # Rich formats: require Docling.
    if DocumentConverter is None:
        raise RuntimeError(
            "Docling is not installed; conversion for this file type is unavailable. "
            "Install docling (e.g. `pip install docling`) or upload a .md/.txt file."
        )

    # Persist to a temporary file so Docling can read it from disk.
    # The file is created with delete=False and closed *before* conversion:
    # a NamedTemporaryFile that is still open cannot be reopened by name on
    # Windows. Cleanup is done manually in the ``finally`` below.
    suffix = f".{ext}" if ext else ""
    tmp = NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = Path(tmp.name)
    try:
        with tmp:
            # Streamlit's UploadedFile exposes getbuffer() for zero-copy writes.
            tmp.write(uploaded_file.getbuffer())

        converter = DocumentConverter()
        result = converter.convert(str(tmp_path))

        try:
            text = result.document.export_to_markdown()
        except Exception:  # noqa: BLE001
            # Fallback to plain text if markdown export is not available.
            text = result.document.export_to_text()
    finally:
        # Best-effort removal; a failed cleanup must not mask the real outcome.
        try:
            tmp_path.unlink()
        except OSError:
            pass

    metadata["converted_by"] = "docling"
    return text, metadata