rag-agent-workbench-api / scripts /docling_convert_and_upload.py
BrejBala's picture
final changes with API key
b09b8a3
# Optional dependency:
# pip install docling
#
# This script converts local documents (PDF, Markdown, and other formats
# supported by Docling) to text/markdown and uploads them to the backend via
# /documents/upload-text. Docling is used when available; for .txt/.md files,
# the script can fall back to raw text if Docling is not installed.
import argparse
import json
from pathlib import Path
from typing import Any, Dict, Optional
import httpx
try:
from docling.document_converter import DocumentConverter
except ImportError: # pragma: no cover - optional dependency
DocumentConverter = None # type: ignore[assignment]
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command-line options of the convert-and-upload script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, which is the normal
            command-line behaviour. Passing an explicit list lets callers
            and tests drive the parser without touching ``sys.argv``.

    Returns:
        A namespace with the attributes ``file_path``, ``backend_url``,
        ``namespace``, ``title``, ``source`` and ``api_key``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Convert a local document using Docling (when available) and "
            "upload the extracted text to the RAG backend via /documents/upload-text."
        )
    )
    # --file, --pdf-path and --path are aliases that all store into file_path.
    parser.add_argument(
        "--file",
        "--pdf-path",
        "--path",
        dest="file_path",
        type=str,
        required=True,
        help="Path to the local file (PDF, Markdown, DOCX, HTML, TXT, etc.).",
    )
    parser.add_argument(
        "--backend-url",
        "--backend",
        dest="backend_url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the running backend (default: http://localhost:8000).",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev).",
    )
    parser.add_argument(
        "--title",
        type=str,
        default=None,
        help="Optional title for the document; defaults to the filename.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="local-file",
        help="Source label stored in metadata (default: local-file).",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Optional API key for the backend (sent as X-API-Key).",
    )
    return parser.parse_args(argv)
def _docling_available() -> bool:
    """Report whether the optional Docling converter could be imported."""
    # The guarded import at the top of the file binds DocumentConverter to
    # None when Docling is not installed.
    docling_missing = DocumentConverter is None
    return not docling_missing
def convert_file_to_text(file_path: Path) -> str:
    """Return the content of ``file_path`` as markdown or plain text.

    When Docling is installed, every file is passed to Docling and the
    result is exported as markdown. Without Docling, only ``.txt`` and
    ``.md`` files are handled: they are read as UTF-8 text, with
    undecodable bytes dropped.

    Args:
        file_path: Path of the document to convert.

    Returns:
        The extracted markdown or raw text.

    Raises:
        RuntimeError: If Docling is not installed and the file is neither
            ``.txt`` nor ``.md``.
    """
    extension = file_path.suffix.lower()

    if not _docling_available():
        # Fallback path: plain-text formats need no converter.
        readable_without_docling = {".txt", ".md"}
        if extension not in readable_without_docling:
            raise RuntimeError(
                f"Docling is required to convert '{file_path}'. Install it with:\n"
                " pip install docling"
            )
        return file_path.read_text(encoding="utf-8", errors="ignore")

    conversion = DocumentConverter().convert(str(file_path))
    return conversion.document.export_to_markdown()
def upload_text(
    backend_url: str,
    title: str,
    source: str,
    text: str,
    namespace: str,
    metadata: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
    timeout: float = 60.0,
) -> Dict[str, Any]:
    """POST a document's text to the backend's ``/documents/upload-text`` endpoint.

    Args:
        backend_url: Base URL of the backend; a trailing slash is tolerated.
        title: Document title sent in the payload.
        source: Source label sent in the payload.
        text: The document text to upload.
        namespace: Target namespace sent in the payload.
        metadata: Extra metadata for the document; an empty dict is sent
            when omitted.
        api_key: When given (and non-empty), sent as the ``X-API-Key`` header.
        timeout: Timeout in seconds for the HTTP client. Defaults to 60.0,
            the value that was previously hard-coded; raise it for large
            documents that the backend takes longer to process.

    Returns:
        The backend's JSON response body, decoded.

    Raises:
        httpx.HTTPStatusError: If the backend answers with a 4xx/5xx status.
        httpx.RequestError: If the request could not be completed
            (connection failure, timeout, ...).
    """
    url = f"{backend_url.rstrip('/')}/documents/upload-text"
    payload = {
        "title": title,
        "source": source,
        "text": text,
        "namespace": namespace,
        "metadata": metadata or {},
    }
    headers: Dict[str, str] = {"Content-Type": "application/json"}
    if api_key:
        headers["X-API-Key"] = api_key

    with httpx.Client(timeout=timeout) as client:
        response = client.post(url, json=payload, headers=headers)
        # Surface 4xx/5xx answers as exceptions instead of decoding an error body.
        response.raise_for_status()
        return response.json()
def main() -> int:
    """Run the CLI: convert the requested file and upload its text.

    Returns:
        Process exit code: 0 on success, 1 when conversion or upload fails.

    Raises:
        SystemExit: If the given path does not point to an existing file.
    """
    args = parse_args()

    file_path = Path(args.file_path).expanduser().resolve()
    if not file_path.is_file():
        raise SystemExit(f"File not found: {file_path}")

    title = args.title or file_path.name

    print(f"Converting file at {file_path}...")
    try:
        text = convert_file_to_text(file_path)
    except Exception as exc:  # noqa: BLE001
        print(f"Error converting file: {exc}")
        return 1

    print(
        f"Uploading converted text to backend at {args.backend_url} "
        f"namespace='{args.namespace}'...",
    )
    # Report upload failures the same way as conversion failures (message and
    # exit code 1) instead of letting them escape as a traceback.
    try:
        response = upload_text(
            backend_url=args.backend_url,
            title=title,
            source=args.source,
            text=text,
            namespace=args.namespace,
            metadata={"original_path": str(file_path), "extension": file_path.suffix.lower()},
            api_key=args.api_key,
        )
    except httpx.HTTPStatusError as exc:
        # The backend answered with 4xx/5xx; show its body to aid diagnosis.
        print(f"Error uploading text: {exc}")
        print(f"Backend response: {exc.response.text}")
        return 1
    except (httpx.HTTPError, ValueError) as exc:
        # httpx.HTTPError covers connection errors and timeouts; ValueError
        # covers a response body that is not valid JSON.
        print(f"Error uploading text: {exc}")
        return 1

    print("Upload response:")
    print(json.dumps(response, indent=2))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)