# NOTE: extracted from a hosted file viewer (revision b09b8a3, file size 4,183 bytes);
# the viewer's status banner and line-number gutter were removed.
# Optional dependency:
# pip install docling
#
# Batch-ingest a local folder of documents into the backend by converting each
# supported file to markdown/text (using Docling when available) and uploading
# it via /documents/upload-text.
import argparse
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
from docling_convert_and_upload import convert_file_to_text, upload_text # type: ignore[import]
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the folder-ingestion script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads them from ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        A namespace with the attributes ``folder``, ``backend_url``,
        ``namespace``, ``source``, ``api_key`` and ``max_files``.

    Raises:
        SystemExit: Raised by argparse when ``--folder`` is missing or an
            option value cannot be parsed.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Recursively ingest a folder of local documents using Docling (when available) "
            "and upload them to the backend via /documents/upload-text."
        )
    )
    parser.add_argument(
        "--folder",
        type=str,
        required=True,
        help="Root folder containing documents to ingest.",
    )
    # "--backend" is accepted as a shorter alias; both spellings store into
    # ``backend_url``.
    parser.add_argument(
        "--backend-url",
        "--backend",
        dest="backend_url",
        type=str,
        default="http://localhost:8000",
        help="Base URL of the running backend (default: http://localhost:8000).",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default="dev",
        help="Target Pinecone namespace (default: dev).",
    )
    parser.add_argument(
        "--source",
        type=str,
        default="local-folder",
        help="Source label stored in metadata (default: local-folder).",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=None,
        help="Optional API key for the backend (sent as X-API-Key).",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=200,
        help="Maximum number of files to ingest (default: 200).",
    )
    return parser.parse_args(argv)
# File suffixes (lowercase, including the leading dot) that are eligible for
# ingestion; files with any other suffix are skipped during discovery.
SUPPORTED_EXTENSIONS = {
    # PDF and office-style documents
    ".pdf", ".docx",
    ".ppt", ".pptx",
    ".xls", ".xlsx",
    # Web pages
    ".html", ".htm",
    # Lightweight markup and plain text
    ".md", ".markdown", ".adoc", ".txt",
}
def find_files(root: Path, max_files: int) -> List[Path]:
    """Collect up to ``max_files`` supported files found beneath ``root``.

    The tree is walked recursively with ``Path.rglob``. Only regular files
    whose lowercased suffix is in ``SUPPORTED_EXTENSIONS`` are kept, in the
    order ``rglob`` yields them (no sorting is applied).

    Args:
        root: Directory to search.
        max_files: Upper bound on the number of paths returned. Zero or a
            negative value yields an empty list.

    Returns:
        The matching paths, at most ``max_files`` of them.
    """
    # Without this guard the cap would only be checked after the first
    # append, so a limit of 0 (or less) would still return one file.
    if max_files <= 0:
        return []

    files: List[Path] = []
    for path in root.rglob("*"):
        if not path.is_file():
            continue
        if path.suffix.lower() not in SUPPORTED_EXTENSIONS:
            continue
        files.append(path)
        # Stop walking as soon as the cap is reached.
        if len(files) >= max_files:
            break
    return files
def main() -> int:
    """Convert every supported file under ``--folder`` and upload its text.

    Each discovered file is converted with ``convert_file_to_text`` and the
    result is sent to the backend with ``upload_text``. A failure on one file
    is printed and recorded, and processing continues with the next file.

    Returns:
        Process exit status: 0 when no supported file was found or every file
        was uploaded, 1 when at least one conversion or upload failed.

    Raises:
        SystemExit: If ``--folder`` does not resolve to an existing directory.
    """
    args = parse_args()

    root = Path(args.folder).expanduser().resolve()
    if not root.is_dir():
        raise SystemExit(f"Folder not found: {root}")

    files = find_files(root, args.max_files)
    if not files:
        print(f"No supported files found in {root}")
        return 0

    print(f"Found {len(files)} file(s) to ingest in {root} (max {args.max_files}).")

    successes = 0
    failures: List[Dict[str, Any]] = []

    for idx, file_path in enumerate(files, start=1):
        print(f"[{idx}/{len(files)}] Converting {file_path}...")

        # Broad catch is deliberate: one bad document must not abort the batch.
        try:
            text = convert_file_to_text(file_path)
        except Exception as exc:  # noqa: BLE001
            print(f" Conversion failed: {exc}")
            failures.append({"path": str(file_path), "error": str(exc)})
            continue

        try:
            response = upload_text(
                backend_url=args.backend_url,
                title=file_path.name,
                source=args.source,
                text=text,
                namespace=args.namespace,
                metadata={
                    "original_path": str(file_path),
                    "extension": file_path.suffix.lower(),
                },
                api_key=args.api_key,
            )
        except Exception as exc:  # noqa: BLE001
            print(f" Upload failed: {exc}")
            failures.append({"path": str(file_path), "error": str(exc)})
            continue

        # Count the success outside the upload ``try`` so that a problem while
        # rendering the response cannot also register the file as a failure.
        successes += 1
        try:
            rendered = json.dumps(response, indent=2)
        except (TypeError, ValueError):
            # NOTE(review): the response is presumably JSON-like; fall back to
            # repr() if it turns out not to be serializable.
            rendered = repr(response)
        print(f" Uploaded successfully: {rendered}")

    print()
    print(f"Ingestion complete. Successes: {successes}, Failures: {len(failures)}")
    if failures:
        print("Failures:")
        for item in failures:
            print(f"- {item['path']}: {item['error']}")

    # Signal partial failure to the calling shell / CI job.
    return 1 if failures else 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())