BrejBala's picture
final changes with API key
b09b8a3
from __future__ import annotations
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Dict, Tuple
try:
from docling.document_converter import DocumentConverter
except ImportError: # pragma: no cover - optional dependency
DocumentConverter = None # type: ignore[assignment]
def convert_uploaded_file_to_text(uploaded_file) -> Tuple[str, Dict[str, Any]]:
"""Convert an uploaded Streamlit file to text/markdown.
- For .txt and .md, returns raw UTF-8 text.
- For other supported formats (PDF/Office/HTML), uses Docling when installed.
- Raises a RuntimeError with a user-friendly message when Docling is required
but not installed.
"""
filename = uploaded_file.name
ext = Path(filename).suffix.lower().lstrip(".")
size_bytes = getattr(uploaded_file, "size", None)
content_type = getattr(uploaded_file, "type", None)
metadata: Dict[str, Any] = {
"filename": filename,
"ext": ext,
"size_bytes": size_bytes,
"content_type": content_type,
}
# Plain text / markdown: read directly.
if ext in {"txt", "md"}:
raw_bytes = uploaded_file.read()
text = raw_bytes.decode("utf-8", errors="ignore")
metadata["converted_by"] = "raw"
return text, metadata
# Rich formats: require Docling.
if DocumentConverter is None:
raise RuntimeError(
"Docling is not installed; conversion for this file type is unavailable. "
"Install docling (e.g. `pip install docling`) or upload a .md/.txt file."
)
# Persist to a temporary file so Docling can read it from disk.
with NamedTemporaryFile(delete=True, suffix=f".{ext}") as tmp:
# Streamlit's UploadedFile exposes getbuffer() for zero-copy writes.
tmp.write(uploaded_file.getbuffer())
tmp.flush()
converter = DocumentConverter()
result = converter.convert(tmp.name)
try:
text = result.document.export_to_markdown()
except Exception: # noqa: BLE001
# Fallback to plain text if markdown export is not available.
text = result.document.export_to_text()
metadata["converted_by"] = "docling"
return text, metadata