Spaces:
Sleeping
Sleeping
File size: 4,732 Bytes
b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 b09b8a3 e63c592 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# Optional dependency:
# pip install docling
#
# This script converts local documents (PDF, Markdown, and other formats
# supported by Docling) to text/markdown and uploads them to the backend via
# /documents/upload-text. Docling is used when available; for .txt/.md files,
# the script can fall back to raw text if Docling is not installed.
import argparse
import json
from pathlib import Path
from typing import Any, Dict, Optional, Sequence

import httpx
try:
from docling.document_converter import DocumentConverter
except ImportError: # pragma: no cover - optional dependency
DocumentConverter = None # type: ignore[assignment]
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options for the upload script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, so existing ``parse_args()``
            callers behave exactly as before. Passing an explicit list makes
            the parser usable programmatically.

    Returns:
        A namespace with ``file_path``, ``backend_url``, ``namespace``,
        ``title``, ``source`` and ``api_key`` attributes.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Convert a local document using Docling (when available) and "
            "upload the extracted text to the RAG backend via /documents/upload-text."
        )
    )
    # Three spellings are accepted for the input path; all store to file_path.
    parser.add_argument(
        "--file",
        "--pdf-path",
        "--path",
        dest="file_path",
        type=str,
        required=True,
        help="Path to the local file (PDF, Markdown, DOCX, HTML, TXT, etc.).",
    )
    parser.add_argument(
        "--backend-url",
        "--backend",
        dest="backend_url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the running backend (default: http://localhost:8000).",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev).",
    )
    parser.add_argument(
        "--title",
        type=str,
        default=None,
        help="Optional title for the document; defaults to the filename.",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="local-file",
        help="Source label stored in metadata (default: local-file).",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Optional API key for the backend (sent as X-API-Key).",
    )
    return parser.parse_args(argv)
def _docling_available() -> bool:
    """Tell whether the optional Docling import succeeded at module load."""
    if DocumentConverter is None:
        return False
    return True
def convert_file_to_text(file_path: Path) -> str:
    """Return the content of *file_path* as markdown or plain text.

    Docling performs the conversion whenever it is importable. Without it,
    only ``.txt`` and ``.md`` files are accepted: they are read as UTF-8 with
    undecodable bytes dropped. Any other extension raises ``RuntimeError``
    carrying installation instructions.
    """
    extension = file_path.suffix.lower()

    if _docling_available():
        document = DocumentConverter().convert(str(file_path)).document
        return document.export_to_markdown()

    # No Docling: only formats that are already plain text can be handled.
    plain_text_suffixes = (".txt", ".md")
    if extension not in plain_text_suffixes:
        raise RuntimeError(
            f"Docling is required to convert '{file_path}'. Install it with:\n"
            " pip install docling"
        )
    return file_path.read_text(encoding="utf-8", errors="ignore")
def upload_text(
    backend_url: str,
    title: str,
    source: str,
    text: str,
    namespace: str,
    metadata: Optional[Dict[str, Any]] = None,
    api_key: Optional[str] = None,
    timeout: float = 60.0,
) -> Dict[str, Any]:
    """POST extracted text to the backend's ``/documents/upload-text`` endpoint.

    Args:
        backend_url: Base URL of the backend; a trailing slash is tolerated.
        title: Value sent as the ``title`` field.
        source: Value sent as the ``source`` field.
        text: Value sent as the ``text`` field.
        namespace: Value sent as the ``namespace`` field.
        metadata: Extra metadata sent with the document; an empty dict is
            sent when omitted.
        api_key: When given (and non-empty), sent in the ``X-API-Key`` header.
        timeout: Timeout in seconds handed to the HTTP client. Defaults to
            60.0, the value that was previously hard-coded, so large uploads
            can opt into a longer limit.

    Returns:
        The backend's JSON response body, decoded.

    Raises:
        httpx.HTTPStatusError: If the backend answers with a 4xx/5xx status.
        httpx.RequestError: If the request could not be completed.
        ValueError: If the response body is not valid JSON.
    """
    url = f"{backend_url.rstrip('/')}/documents/upload-text"
    payload = {
        "title": title,
        "source": source,
        "text": text,
        "namespace": namespace,
        "metadata": metadata or {},
    }
    headers: Dict[str, str] = {"Content-Type": "application/json"}
    if api_key:
        headers["X-API-Key"] = api_key
    with httpx.Client(timeout=timeout) as client:
        response = client.post(url, json=payload, headers=headers)
        # Surface 4xx/5xx replies as exceptions instead of decoding them.
        response.raise_for_status()
        return response.json()
def main() -> int:
    """Convert the requested file and upload its text to the backend.

    Returns:
        0 on success; 1 when conversion fails, the upload request fails, or
        the backend's reply cannot be decoded as JSON.

    Raises:
        SystemExit: If the given path does not point to an existing file.
    """
    args = parse_args()
    file_path = Path(args.file_path).expanduser().resolve()
    if not file_path.is_file():
        raise SystemExit(f"File not found: {file_path}")
    title = args.title or file_path.name

    print(f"Converting file at {file_path}...")
    try:
        text = convert_file_to_text(file_path)
    except Exception as exc:  # noqa: BLE001
        print(f"Error converting file: {exc}")
        return 1

    print(
        f"Uploading converted text to backend at {args.backend_url} "
        f"namespace='{args.namespace}'...",
    )
    # Report upload failures the same way as conversion failures (message
    # plus exit code 1) rather than letting a traceback escape.
    try:
        response = upload_text(
            backend_url=args.backend_url,
            title=title,
            source=args.source,
            text=text,
            namespace=args.namespace,
            metadata={"original_path": str(file_path), "extension": file_path.suffix.lower()},
            api_key=args.api_key,
        )
    except httpx.HTTPStatusError as exc:
        # The backend answered, but with a 4xx/5xx status; show its body.
        print(
            f"Error uploading document: backend returned HTTP "
            f"{exc.response.status_code}: {exc.response.text}"
        )
        return 1
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # Connection problems, timeouts, unsupported or malformed URLs.
        print(f"Error uploading document: {exc}")
        return 1
    except ValueError as exc:
        # Raised when the response body is not valid JSON.
        print(f"Error decoding backend response as JSON: {exc}")
        return 1

    print("Upload response:")
    print(json.dumps(response, indent=2))
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())