rag-agent-workbench-api / scripts /batch_ingest_local_folder.py
BrejBala's picture
final changes with API key
b09b8a3
# Optional dependency:
# pip install docling
#
# Batch-ingest a local folder of documents into the backend by converting each
# supported file to markdown/text (using Docling when available) and uploading
# it via /documents/upload-text.
import argparse
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from docling_convert_and_upload import convert_file_to_text, upload_text # type: ignore[import]
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=(
"Recursively ingest a folder of local documents using Docling (when available) "
"and upload them to the backend via /documents/upload-text."
)
)
parser.add_argument(
"--folder",
type=str,
required=True,
help="Root folder containing documents to ingest.",
)
parser.add_argument(
"--backend-url",
"--backend",
dest="backend_url",
type=str,
default="http://localhost:8000",
help="Base URL of the running backend (default: http://localhost:8000).",
)
parser.add_argument(
"--namespace",
type=str,
default="dev",
help="Target Pinecone namespace (default: dev).",
)
parser.add_argument(
"--source",
type=str,
default="local-folder",
help="Source label stored in metadata (default: local-folder).",
)
parser.add_argument(
"--api-key",
type=str,
default=None,
help="Optional API key for the backend (sent as X-API-Key).",
)
parser.add_argument(
"--max-files",
type=int,
default=200,
help="Maximum number of files to ingest (default: 200).",
)
return parser.parse_args()
SUPPORTED_EXTENSIONS = {
".pdf",
".docx",
".ppt",
".pptx",
".xls",
".xlsx",
".html",
".htm",
".md",
".markdown",
".adoc",
".txt",
}
def find_files(root: Path, max_files: int) -> List[Path]:
files: List[Path] = []
for path in root.rglob("*"):
if not path.is_file():
continue
if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
continue
files.append(path)
if len(files) >= max_files:
break
return files
def main() -> int:
args = parse_args()
root = Path(args.folder).expanduser().resolve()
if not root.is_dir():
raise SystemExit(f"Folder not found: {root}")
files = find_files(root, args.max_files)
if not files:
print(f"No supported files found in {root}")
return 0
print(f"Found {len(files)} file(s) to ingest in {root} (max {args.max_files}).")
successes = 0
failures: List[Dict[str, Any]] = []
for idx, file_path in enumerate(files, start=1):
print(f"[{idx}/{len(files)}] Converting {file_path}...")
try:
text = convert_file_to_text(file_path)
except Exception as exc: # noqa: BLE001
print(f" Conversion failed: {exc}")
failures.append({"path": str(file_path), "error": str(exc)})
continue
try:
response = upload_text(
backend_url=args.backend_url,
title=file_path.name,
source=args.source,
text=text,
namespace=args.namespace,
metadata={
"original_path": str(file_path),
"extension": file_path.suffix.lower(),
},
api_key=args.api_key,
)
successes += 1
print(f" Uploaded successfully: {json.dumps(response, indent=2)}")
except Exception as exc: # noqa: BLE001
print(f" Upload failed: {exc}")
failures.append({"path": str(file_path), "error": str(exc)})
print()
print(f"Ingestion complete. Successes: {successes}, Failures: {len(failures)}")
if failures:
print("Failures:")
for item in failures:
print(f"- {item['path']}: {item['error']}")
return 0
if __name__ == "__main__":
raise SystemExit(main())