File size: 4,732 Bytes
b09b8a3
 
 
 
 
 
 
e63c592
 
 
b09b8a3
 
e63c592
 
b09b8a3
 
 
 
 
e63c592
 
 
 
 
b09b8a3
 
e63c592
 
 
b09b8a3
e63c592
b09b8a3
 
e63c592
 
b09b8a3
e63c592
 
 
b09b8a3
 
e63c592
 
 
 
 
 
 
 
 
 
 
 
 
 
b09b8a3
e63c592
 
 
 
b09b8a3
 
 
 
 
 
 
 
e63c592
 
 
 
b09b8a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e63c592
 
 
 
 
 
 
 
b09b8a3
 
e63c592
 
 
 
 
 
 
 
 
b09b8a3
 
 
 
e63c592
b09b8a3
e63c592
 
 
 
 
 
b09b8a3
 
 
 
 
e63c592
b09b8a3
 
 
 
 
 
e63c592
 
 
 
 
 
 
 
 
b09b8a3
e63c592
b09b8a3
 
e63c592
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# Optional dependency:
#   pip install docling
#
# This script converts local documents (PDF, Markdown, and other formats
# supported by Docling) to text/markdown and uploads them to the backend via
# /documents/upload-text. Docling is used when available; for .txt/.md files,
# the script can fall back to raw text if Docling is not installed.

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Sequence

import httpx

try:
    from docling.document_converter import DocumentConverter
except ImportError:  # pragma: no cover - optional dependency
    DocumentConverter = None  # type: ignore[assignment]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the convert-and-upload script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default), argparse
            reads ``sys.argv[1:]``, so existing zero-argument callers behave
            exactly as before. Passing an explicit list allows programmatic
            use and testing.

    Returns:
        Namespace with the attributes ``file_path``, ``backend_url``,
        ``namespace``, ``title``, ``source`` and ``api_key``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Convert a local document using Docling (when available) and "
            "upload the extracted text to the RAG backend via /documents/upload-text."
        )
    )
    # Three spellings are accepted for the input path; all land in ``file_path``.
    parser.add_argument(
        "--file",
        "--pdf-path",
        "--path",
        dest="file_path",
        type=str,
        required=True,
        help="Path to the local file (PDF, Markdown, DOCX, HTML, TXT, etc.).",
    )
    parser.add_argument(
        "--backend-url",
        "--backend",
        dest="backend_url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the running backend (default: http://localhost:8000).",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev).",
    )
    parser.add_argument(
        "--title",
        type=str,
        default=None,
        help="Optional title for the document; defaults to the filename.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="local-file",
        help="Source label stored in metadata (default: local-file).",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Optional API key for the backend (sent as X-API-Key).",
    )
    return parser.parse_args(argv)


def _docling_available() -> bool:
    """Tell whether the optional Docling import at module load succeeded."""
    # The guarded import binds DocumentConverter to None when Docling is absent.
    if DocumentConverter is None:
        return False
    return True


def convert_file_to_text(file_path: Path) -> str:
    """Return the content of ``file_path`` as markdown or plain text.

    When Docling is installed, every file is passed through its converter and
    exported as markdown. Without Docling, only ``.txt`` and ``.md`` files are
    handled: they are read as UTF-8, with undecodable bytes dropped.

    Raises:
        RuntimeError: Docling is not installed and the file is neither
            ``.txt`` nor ``.md``; the message includes the install command.
    """
    extension = file_path.suffix.lower()

    if not _docling_available():
        # Only plain-text formats can be handled without a converter.
        if extension not in {".txt", ".md"}:
            raise RuntimeError(
                f"Docling is required to convert '{file_path}'. Install it with:\n"
                "  pip install docling"
            )
        return file_path.read_text(encoding="utf-8", errors="ignore")

    conversion = DocumentConverter().convert(str(file_path))
    markdown = conversion.document.export_to_markdown()
    return markdown


def upload_text(
    backend_url: str,
    title: str,
    source: str,
    text: str,
    namespace: str,
    metadata: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """POST a text document to the backend's ``/documents/upload-text`` route.

    Args:
        backend_url: Base URL of the backend; trailing slashes are stripped.
        title: Value sent as the ``title`` field.
        source: Value sent as the ``source`` field.
        text: Value sent as the ``text`` field.
        namespace: Value sent as the ``namespace`` field.
        metadata: Extra metadata; a falsy value is sent as an empty object.
        api_key: When truthy, sent in the ``X-API-Key`` header.

    Returns:
        The decoded JSON body of the response.

    Raises:
        httpx.HTTPStatusError: ``raise_for_status()`` rejected the response.
    """
    request_headers: Dict[str, str] = {"Content-Type": "application/json"}
    if api_key:
        request_headers["X-API-Key"] = api_key

    endpoint = backend_url.rstrip("/") + "/documents/upload-text"
    body = {
        "title": title,
        "source": source,
        "text": text,
        "namespace": namespace,
        "metadata": metadata or {},
    }

    # A short-lived client with a 60 second timeout is used for the single request.
    with httpx.Client(timeout=60.0) as http:
        reply = http.post(endpoint, json=body, headers=request_headers)
        reply.raise_for_status()
        return reply.json()


def main() -> int:
    """Convert one local file to text and upload it to the backend.

    Parses the CLI arguments, converts the file, posts the text via
    ``upload_text`` and prints the backend's JSON response.

    Returns:
        0 on success; 1 when the conversion or the upload fails. Failure
        messages are written to stderr.

    Raises:
        SystemExit: The given path does not refer to an existing file.
    """
    args = parse_args()
    file_path = Path(args.file_path).expanduser().resolve()
    if not file_path.is_file():
        raise SystemExit(f"File not found: {file_path}")

    title = args.title or file_path.name

    print(f"Converting file at {file_path}...")
    try:
        text = convert_file_to_text(file_path)
    except Exception as exc:  # noqa: BLE001 - CLI boundary: report and exit non-zero
        print(f"Error converting file: {exc}", file=sys.stderr)
        return 1

    print(
        f"Uploading converted text to backend at {args.backend_url} "
        f"namespace='{args.namespace}'...",
    )
    try:
        response = upload_text(
            backend_url=args.backend_url,
            title=title,
            source=args.source,
            text=text,
            namespace=args.namespace,
            metadata={"original_path": str(file_path), "extension": file_path.suffix.lower()},
            api_key=args.api_key,
        )
    except httpx.HTTPStatusError as exc:
        # The backend answered with an error status; show its body to aid diagnosis.
        print(
            f"Upload failed with HTTP {exc.response.status_code}: {exc.response.text}",
            file=sys.stderr,
        )
        return 1
    except (httpx.HTTPError, ValueError) as exc:
        # Transport-level failures, or a response body that is not valid JSON.
        print(f"Error uploading to backend: {exc}", file=sys.stderr)
        return 1

    print("Upload response:")
    print(json.dumps(response, indent=2))
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)