File size: 2,937 Bytes
6686f13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Development helper to validate Docling temp file handling outside Streamlit.
#
# Usage:
#   python scripts/dev_test_docling_temp.py --file path/to/document.pdf
#
# This script uses the same temp-directory pattern as the frontend's
# `convert_uploaded_file_to_text` to exercise Docling on Windows and Linux.

from __future__ import annotations

import argparse
import os
import shutil
import tempfile
import time
from pathlib import Path

# Docling is the only third-party dependency of this script. Fail fast with an
# install hint (SystemExit prints the message and exits non-zero) instead of
# surfacing a raw ImportError traceback.
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    raise SystemExit(
        "Docling is not installed. Install it with:\n"
        "  pip install docling"
    )


def parse_args() -> argparse.Namespace:
    """Parse the command line of this dev script.

    Returns:
        The parsed namespace; its ``file`` attribute holds the value given
        to the mandatory ``--file`` option, as a string.
    """
    cli = argparse.ArgumentParser(
        description="Dev test for Docling conversion using a temp directory.",
    )
    # The single, mandatory option: the document to run through Docling.
    file_option = {
        "required": True,
        "type": str,
        "help": "Path to a document (PDF/Office/HTML) to convert.",
    }
    cli.add_argument("--file", **file_option)
    namespace = cli.parse_args()
    return namespace


def _convert_to_text(converter, path: str) -> str:
    """Convert the document at *path* and return its text.

    *converter* is expected to be a Docling ``DocumentConverter``. Markdown
    export is preferred; if it raises or yields an empty result, the
    document's plain-text export is used instead.
    """
    doc = converter.convert(path).document
    try:
        text = doc.export_to_markdown()
    except Exception as exc:  # noqa: BLE001
        # Best-effort: report the failure instead of hiding it, then fall
        # back to the plain-text export below.
        print(f"Markdown export failed ({exc!r}); falling back to plain text.")
        text = ""
    return text or doc.export_to_text()


def _remove_temp_artifacts(tmp_file: str, tmp_dir: str) -> None:
    """Delete the temp copy and its directory, retrying once on PermissionError."""
    for attempts_left in (1, 0):
        try:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
            break
        except PermissionError:
            # Only wait when another attempt will follow.
            if attempts_left:
                time.sleep(0.2)
    # Final sweep; never let cleanup problems mask the real outcome.
    shutil.rmtree(tmp_dir, ignore_errors=True)


def main() -> int:
    """Run a Docling conversion on a temp copy of the file given via ``--file``.

    The source document is copied into a fresh temp directory as
    ``upload<suffix>`` (``.bin`` when the source has no suffix), converted,
    and the first 500 characters of the extracted text are printed. The temp
    file and directory are removed afterwards, whether or not the conversion
    succeeded.

    Returns:
        0 when the conversion succeeded, 1 when the input path is not an
        existing file.

    Raises:
        PermissionError: if the conversion hits a PermissionError twice in a
            row (one retry is made after a short sleep).
        Exception: any other error from copying or converting propagates
            unchanged after cleanup.
    """
    args = parse_args()
    src_path = Path(args.file).expanduser().resolve()
    if not src_path.is_file():
        print(f"File not found: {src_path}")
        return 1

    tmp_dir = tempfile.mkdtemp(prefix="rag_dev_docling_")
    tmp_file = os.path.join(tmp_dir, f"upload{src_path.suffix or '.bin'}")

    try:
        # Stream the copy rather than reading the whole document into memory.
        shutil.copyfile(src_path, tmp_file)

        converter = DocumentConverter()
        try:
            text = _convert_to_text(converter, tmp_file)
        except PermissionError:
            # NOTE(review): presumably guards against transient file locks
            # (e.g. on Windows) -- one retry after a brief pause.
            print("PermissionError detected; retrying after brief sleep...")
            time.sleep(0.2)
            try:
                text = _convert_to_text(converter, tmp_file)
            except PermissionError:
                print("PermissionError persists after retry:")
                raise

        separator = "-" * 80
        print("Conversion succeeded.")
        print("First 500 characters:")
        print(separator)
        print(text[:500])
        print(separator)
        return 0
    finally:
        _remove_temp_artifacts(tmp_file, tmp_dir)


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())