rag-agent-workbench-api / scripts /dev_test_docling_temp.py
BrejBala's picture
feat: deploy Tiers 2 & 3 — CRAG, faithfulness, streaming, Prometheus, eval-driven retrieval
6686f13
Raw
History Blame Contribute Delete
2.94 kB
# Development helper to validate Docling temp file handling outside Streamlit.
#
# Usage:
# python scripts/dev_test_docling_temp.py --file path/to/document.pdf
#
# This script uses the same temp-directory pattern as the frontend's
# `convert_uploaded_file_to_text` to exercise Docling on Windows and Linux.
from __future__ import annotations
import argparse
import os
import shutil
import tempfile
import time
from pathlib import Path
try:
from docling.document_converter import DocumentConverter
except ImportError:
raise SystemExit(
"Docling is not installed. Install it with:\n"
" pip install docling"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Dev test for Docling conversion using a temp directory."
)
parser.add_argument(
"--file",
required=True,
type=str,
help="Path to a document (PDF/Office/HTML) to convert.",
)
return parser.parse_args()
# Total conversion attempts: the initial try plus one retry on PermissionError.
_CONVERT_ATTEMPTS = 2
# Pause between attempts (conversion and cleanup), in seconds.
_RETRY_DELAY_SECONDS = 0.2
# Number of characters of converted text echoed to the console.
_PREVIEW_CHARS = 500


def _export_document_text(doc) -> str:
    """Return *doc* as markdown, falling back to its plain-text export.

    The plain-text export is used when the markdown export raises or
    yields an empty string. ``doc`` is the ``document`` attribute of a
    Docling conversion result.
    """
    try:
        text = doc.export_to_markdown()
    except Exception as exc:  # noqa: BLE001
        # Best-effort: a markdown failure must not abort the run, but it
        # should be visible to whoever is running this diagnostic.
        print(f"Markdown export failed ({exc!r}); falling back to plain text.")
        text = ""
    if not text:
        text = doc.export_to_text()
    return text


def _convert_to_text(converter: "DocumentConverter", path: str) -> str:
    """Run a single Docling conversion of *path* and export the text."""
    result = converter.convert(path)
    return _export_document_text(result.document)


def _convert_with_retry(converter: "DocumentConverter", path: str) -> str:
    """Convert *path*, retrying after a short sleep on ``PermissionError``.

    Raises:
        PermissionError: If the final attempt still fails with it. Any
            other exception propagates immediately without a retry.
    """
    for _ in range(_CONVERT_ATTEMPTS - 1):
        try:
            return _convert_to_text(converter, path)
        except PermissionError:
            print("PermissionError detected; retrying after brief sleep...")
            time.sleep(_RETRY_DELAY_SECONDS)
    try:
        return _convert_to_text(converter, path)
    except PermissionError:
        print("PermissionError persists after retry:")
        raise


def _remove_temp_copy(tmp_file: str, tmp_dir: str) -> None:
    """Delete the temporary copy and its directory, tolerating locked files."""
    for _ in range(2):
        try:
            os.remove(tmp_file)
            break
        except FileNotFoundError:
            # Nothing to delete (e.g. the copy step itself failed).
            break
        except PermissionError:
            # The file is still held open by something; wait and retry.
            # The pause also runs after the last failed attempt, i.e.
            # right before the rmtree below.
            time.sleep(_RETRY_DELAY_SECONDS)
    # Last resort: drop the whole directory and ignore whatever is left.
    shutil.rmtree(tmp_dir, ignore_errors=True)


def main() -> int:
    """Convert the file given via ``--file`` with Docling and print a preview.

    The source file is copied into a fresh temporary directory as
    ``upload<suffix>`` (``.bin`` when the source has no suffix) and the copy
    is handed to Docling. The temporary copy and directory are removed
    whether or not the conversion succeeds.

    Returns:
        ``0`` after a successful conversion, ``1`` when the given path is
        not an existing file.

    Raises:
        PermissionError: If conversion still fails with it after one retry.
        Exception: Any other conversion error propagates unchanged.
    """
    args = parse_args()
    src_path = Path(args.file).expanduser().resolve()
    if not src_path.is_file():
        print(f"File not found: {src_path}")
        return 1

    tmp_dir = tempfile.mkdtemp(prefix="rag_dev_docling_")
    suffix = src_path.suffix or ".bin"
    tmp_file = os.path.join(tmp_dir, f"upload{suffix}")
    try:
        # Stream the copy instead of loading the whole document into memory.
        shutil.copyfile(src_path, tmp_file)
        text = _convert_with_retry(DocumentConverter(), tmp_file)
    finally:
        _remove_temp_copy(tmp_file, tmp_dir)

    print("Conversion succeeded.")
    print(f"First {_PREVIEW_CHARS} characters:")
    print("-" * 80)
    print(text[:_PREVIEW_CHARS])
    print("-" * 80)
    return 0
# Script entry point: run main() only when executed directly and hand its
# integer result to the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)