Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from pathlib import Path | |
| from tempfile import NamedTemporaryFile | |
| from typing import Any, Dict, Tuple | |
| try: | |
| from docling.document_converter import DocumentConverter | |
| except ImportError: # pragma: no cover - optional dependency | |
| DocumentConverter = None # type: ignore[assignment] | |
def _read_upload_bytes(uploaded_file) -> bytes:
    """Return the whole upload payload, independent of the stream position."""
    # BytesIO-style objects expose getvalue(), which always returns the full
    # buffer even when a previous read() already consumed the stream.
    getvalue = getattr(uploaded_file, "getvalue", None)
    if callable(getvalue):
        return getvalue()
    # Generic file-like fallback: rewind when possible, then read everything.
    seek = getattr(uploaded_file, "seek", None)
    if callable(seek):
        seek(0)
    return uploaded_file.read()


def convert_uploaded_file_to_text(uploaded_file) -> Tuple[str, Dict[str, Any]]:
    """Convert an uploaded Streamlit file to text/markdown.

    - For .txt and .md, returns the content decoded as UTF-8 (undecodable
      bytes are dropped).
    - For every other extension, uses Docling when installed.

    Args:
        uploaded_file: File-like upload object. It must expose ``name``;
            ``size`` and ``type`` are optional and copied into the metadata
            when present. The Docling path additionally needs ``getbuffer()``.

    Returns:
        A ``(text, metadata)`` tuple. ``metadata`` holds ``filename``, ``ext``
        (lower-case, without the leading dot), ``size_bytes``,
        ``content_type`` and ``converted_by`` (``"raw"`` or ``"docling"``).

    Raises:
        RuntimeError: With a user-friendly message when Docling is required
            but not installed.
    """
    filename = uploaded_file.name
    ext = Path(filename).suffix.lower().lstrip(".")

    metadata: Dict[str, Any] = {
        "filename": filename,
        "ext": ext,
        "size_bytes": getattr(uploaded_file, "size", None),
        "content_type": getattr(uploaded_file, "type", None),
    }

    # Plain text / markdown: decode directly, no converter needed.
    if ext in {"txt", "md"}:
        # Read the full payload even if the upload was already consumed
        # earlier (a bare read() would return only the unread remainder).
        raw_bytes = _read_upload_bytes(uploaded_file)
        text = raw_bytes.decode("utf-8", errors="ignore")
        metadata["converted_by"] = "raw"
        return text, metadata

    # Rich formats: require Docling.
    if DocumentConverter is None:
        raise RuntimeError(
            "Docling is not installed; conversion for this file type is unavailable. "
            "Install docling (e.g. `pip install docling`) or upload a .md/.txt file."
        )

    # Persist to a temporary file so Docling can read it from disk.
    # delete=False plus explicit cleanup is required for portability: on
    # Windows a NamedTemporaryFile that is still open cannot be reopened by
    # name, so the handle must be closed before Docling opens the path.
    # An extension-less upload gets no suffix rather than a bare ".".
    tmp = NamedTemporaryFile(delete=False, suffix=f".{ext}" if ext else "")
    tmp_path = Path(tmp.name)
    try:
        with tmp:
            # Streamlit's UploadedFile exposes getbuffer() for zero-copy writes.
            tmp.write(uploaded_file.getbuffer())

        converter = DocumentConverter()
        result = converter.convert(str(tmp_path))
        try:
            text = result.document.export_to_markdown()
        except Exception:  # noqa: BLE001
            # Deliberate best-effort: fall back to plain text if markdown
            # export is not available.
            text = result.document.export_to_text()
    finally:
        # Cleanup is best-effort; a failure to delete the temp file must not
        # mask the conversion result or the original exception.
        try:
            tmp_path.unlink()
        except OSError:
            pass

    metadata["converted_by"] = "docling"
    return text, metadata