Spaces:
Sleeping
Sleeping
# Optional dependency:
#   pip install docling
#
# Batch-ingest a local folder of documents into the backend by converting each
# supported file to markdown/text (using Docling when available) and uploading
# it via /documents/upload-text.
| import argparse | |
| import json | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| from docling_convert_and_upload import convert_file_to_text, upload_text # type: ignore[import] | |
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options for the folder-ingestion script.

    Args:
        argv: Argument list to parse. When ``None`` (the default), argparse
            falls back to ``sys.argv[1:]``, which is the script's normal
            behaviour. Passing an explicit list lets other code (or tests)
            drive the parser without touching ``sys.argv``.

    Returns:
        A namespace with ``folder``, ``backend_url``, ``namespace``,
        ``source``, ``api_key`` and ``max_files`` attributes.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Recursively ingest a folder of local documents using Docling (when available) "
            "and upload them to the backend via /documents/upload-text."
        )
    )
    parser.add_argument(
        "--folder",
        type=str,
        required=True,
        help="Root folder containing documents to ingest.",
    )
    # "--backend" is accepted as a shorter alias; both store into backend_url.
    parser.add_argument(
        "--backend-url",
        "--backend",
        dest="backend_url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the running backend (default: http://localhost:8000).",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev).",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="local-folder",
        help="Source label stored in metadata (default: local-folder).",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Optional API key for the backend (sent as X-API-Key).",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=200,
        help="Maximum number of files to ingest (default: 200).",
    )
    return parser.parse_args(argv)
# File suffixes (lower-case, leading dot included) that are eligible for
# ingestion; anything else found in the folder is skipped.
SUPPORTED_EXTENSIONS = {
    # PDF and office documents
    ".pdf", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
    # Web pages
    ".html", ".htm",
    # Plain-text and lightweight markup
    ".md", ".markdown", ".adoc", ".txt",
}
def find_files(root: Path, max_files: int) -> List[Path]:
    """Collect up to ``max_files`` supported documents found under ``root``.

    The tree is walked recursively with ``Path.rglob``. Only regular files
    whose lower-cased suffix is in ``SUPPORTED_EXTENSIONS`` are kept. Results
    come back in whatever order ``rglob`` yields them; no sorting is applied.

    Args:
        root: Directory to search.
        max_files: Upper bound on the number of paths returned. A value of
            zero or less yields an empty list.

    Returns:
        The matching paths, at most ``max_files`` of them.
    """
    # The limit is only checked after an append, so without this guard a
    # non-positive limit would still let one file through.
    if max_files <= 0:
        return []

    files: List[Path] = []
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue
        files.append(path)
        # Stop walking as soon as the cap is reached.
        if len(files) >= max_files:
            break
    return files
def main() -> int:
    """Convert every supported file under ``--folder`` and upload it.

    Each file is converted with ``convert_file_to_text`` and then sent with
    ``upload_text``. A failure in either step is recorded and the run moves on
    to the next file; a summary of successes and failures is printed at the
    end.

    Returns:
        0 when no supported files were found or every file was uploaded,
        1 when at least one conversion or upload failed.

    Raises:
        SystemExit: If ``--folder`` does not resolve to an existing directory.
    """
    args = parse_args()
    root = Path(args.folder).expanduser().resolve()
    if not root.is_dir():
        raise SystemExit(f"Folder not found: {root}")

    files = find_files(root, args.max_files)
    if not files:
        print(f"No supported files found in {root}")
        return 0

    print(f"Found {len(files)} file(s) to ingest in {root} (max {args.max_files}).")

    successes = 0
    failures: List[Dict[str, Any]] = []
    for idx, file_path in enumerate(files, start=1):
        print(f"[{idx}/{len(files)}] Converting {file_path}...")
        try:
            text = convert_file_to_text(file_path)
        except Exception as exc:  # noqa: BLE001
            print(f" Conversion failed: {exc}")
            failures.append({"path": str(file_path), "error": str(exc)})
            continue

        # Keep only the upload call inside the try so that a problem while
        # reporting the result cannot be mistaken for an upload failure.
        try:
            response = upload_text(
                backend_url=args.backend_url,
                title=file_path.name,
                source=args.source,
                text=text,
                namespace=args.namespace,
                metadata={
                    "original_path": str(file_path),
                    "extension": file_path.suffix.lower(),
                },
                api_key=args.api_key,
            )
        except Exception as exc:  # noqa: BLE001
            print(f" Upload failed: {exc}")
            failures.append({"path": str(file_path), "error": str(exc)})
            continue

        successes += 1
        # default=str: this dump is display-only, so fall back to str() for
        # any value in the response that json cannot serialise natively.
        print(f" Uploaded successfully: {json.dumps(response, indent=2, default=str)}")

    print()
    print(f"Ingestion complete. Successes: {successes}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for item in failures:
            print(f"- {item['path']}: {item['error']}")
        # Signal partial failure to the calling shell / automation.
        return 1
    return 0
if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)