Spaces:
Sleeping
Sleeping
# Optional dependency:
#     pip install docling
#
# This script converts local documents (PDF, Markdown, and other formats
# supported by Docling) to text/markdown and uploads them to the backend via
# /documents/upload-text. Docling is used when available; for .txt/.md files,
# the script can fall back to raw text if Docling is not installed.
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import Any, Dict, Optional | |
| import httpx | |
| try: | |
| from docling.document_converter import DocumentConverter | |
| except ImportError: # pragma: no cover - optional dependency | |
| DocumentConverter = None # type: ignore[assignment] | |
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the convert-and-upload script.

    Returns:
        Namespace carrying ``file_path``, ``backend_url``, ``namespace``,
        ``title``, ``source`` and ``api_key``.
    """
    summary = (
        "Convert a local document using Docling (when available) and "
        "upload the extracted text to the RAG backend via /documents/upload-text."
    )
    cli = argparse.ArgumentParser(description=summary)

    # The input file is the only required option; its three spellings all
    # store into ``file_path``.
    cli.add_argument(
        "--file",
        "--pdf-path",
        "--path",
        dest="file_path",
        type=str,
        required=True,
        help="Path to the local file (PDF, Markdown, DOCX, HTML, TXT, etc.).",
    )

    # The remaining options are optional strings, listed in the order they
    # should appear in the --help output.
    optional_text_options = (
        (
            ("--backend-url", "--backend"),
            {
                "dest": "backend_url",
                "default": "http://localhost:8000",
                "help": "Base URL of the running backend (default: http://localhost:8000).",
            },
        ),
        (
            ("--namespace",),
            {
                "default": "dev",
                "help": "Target Pinecone namespace (default: dev).",
            },
        ),
        (
            ("--title",),
            {
                "default": None,
                "help": "Optional title for the document; defaults to the filename.",
            },
        ),
        (
            ("--source",),
            {
                "default": "local-file",
                "help": "Source label stored in metadata (default: local-file).",
            },
        ),
        (
            ("--api-key",),
            {
                "default": None,
                "help": "Optional API key for the backend (sent as X-API-Key).",
            },
        ),
    )
    for flags, settings in optional_text_options:
        cli.add_argument(*flags, type=str, **settings)

    return cli.parse_args()
def _docling_available() -> bool:
    """Report whether the optional Docling import at module load succeeded."""
    # The guarded import binds DocumentConverter to None on ImportError.
    if DocumentConverter is None:
        return False
    return True
def convert_file_to_text(file_path: Path) -> str:
    """Convert a file to markdown/text.

    - ``.txt`` files are always read as raw text. Plain text needs no
      conversion, and plain text is not among Docling's documented input
      formats, so routing it through Docling can fail even though the
      content is directly usable.
    - Otherwise, if Docling is installed, it performs the conversion and the
      result is exported as Markdown.
    - If Docling is not installed, ``.md`` files are read as raw text and any
      other format raises a RuntimeError with installation instructions.

    Args:
        file_path: Path to the local file to convert.

    Returns:
        The file's contents as text/Markdown.

    Raises:
        RuntimeError: If the format requires Docling and Docling is not
            installed.
    """
    suffix = file_path.suffix.lower()

    def read_raw() -> str:
        # Best effort: undecodable bytes are dropped instead of aborting.
        return file_path.read_text(encoding="utf-8", errors="ignore")

    # Plain text bypasses Docling entirely (checked before the availability
    # probe so the result does not depend on whether Docling is installed).
    if suffix == ".txt":
        return read_raw()

    if _docling_available():
        converter = DocumentConverter()
        result = converter.convert(str(file_path))
        return result.document.export_to_markdown()

    # Docling is not available: Markdown is already text, so use it as-is.
    if suffix == ".md":
        return read_raw()

    raise RuntimeError(
        f"Docling is required to convert '{file_path}'. Install it with:\n"
        "  pip install docling"
    )
def upload_text(
    backend_url: str,
    title: str,
    source: str,
    text: str,
    namespace: str,
    metadata: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
    timeout: float = 60.0,
) -> Dict[str, Any]:
    """POST extracted text to the backend's ``/documents/upload-text`` endpoint.

    Args:
        backend_url: Base URL of the backend; a trailing slash is tolerated.
        title: Value sent as the payload's ``title``.
        source: Value sent as the payload's ``source``.
        text: Extracted document text to upload.
        namespace: Value sent as the payload's ``namespace``.
        metadata: Extra metadata sent with the document; an empty dict is
            sent when omitted.
        api_key: When given, sent in the ``X-API-Key`` header.
        timeout: Request timeout in seconds passed to the HTTP client
            (default: 60.0, the previously hard-coded value).

    Returns:
        The backend's response body decoded from JSON.

    Raises:
        httpx.HTTPStatusError: If the backend responds with an error status.
    """
    url = f"{backend_url.rstrip('/')}/documents/upload-text"
    payload = {
        "title": title,
        "source": source,
        "text": text,
        "namespace": namespace,
        "metadata": metadata or {},
    }
    headers: Dict[str, str] = {"Content-Type": "application/json"}
    if api_key:
        headers["X-API-Key"] = api_key

    with httpx.Client(timeout=timeout) as client:
        response = client.post(url, json=payload, headers=headers)
        # Fail loudly on 4xx/5xx instead of returning an error body as data.
        response.raise_for_status()
        return response.json()
def main() -> int:
    """Convert the file named on the command line and upload its text.

    Returns:
        0 on success, 1 if conversion or upload fails.

    Raises:
        SystemExit: If the given path is not an existing file.
    """
    args = parse_args()
    file_path = Path(args.file_path).expanduser().resolve()
    if not file_path.is_file():
        raise SystemExit(f"File not found: {file_path}")
    title = args.title or file_path.name

    print(f"Converting file at {file_path}...")
    try:
        text = convert_file_to_text(file_path)
    except Exception as exc:  # noqa: BLE001
        print(f"Error converting file: {exc}")
        return 1

    print(
        f"Uploading converted text to backend at {args.backend_url} "
        f"namespace='{args.namespace}'...",
    )
    # Upload failures are reported the same way as conversion failures
    # (message + exit code 1) instead of surfacing as a raw traceback.
    try:
        response = upload_text(
            backend_url=args.backend_url,
            title=title,
            source=args.source,
            text=text,
            namespace=args.namespace,
            metadata={"original_path": str(file_path), "extension": file_path.suffix.lower()},
            api_key=args.api_key,
        )
    except httpx.HTTPStatusError as exc:
        # Include the response body: it usually carries the backend's reason.
        print(
            f"Error uploading text: backend returned HTTP "
            f"{exc.response.status_code}: {exc.response.text}"
        )
        return 1
    except (httpx.HTTPError, ValueError) as exc:
        # HTTPError covers transport failures; ValueError covers a response
        # body that is not valid JSON.
        print(f"Error uploading text: {exc}")
        return 1

    print("Upload response:")
    print(json.dumps(response, indent=2))
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)