Spaces:
Running
Running
File size: 3,263 Bytes
"""Standalone ingestion script: parse PDFs, chunk, embed, and persist to Qdrant.

Usage:
    python -m scripts.ingest [--docs-dir DOCS_DIR] [--strategy recursive]
"""
import argparse
import logging
import sys
from pathlib import Path
# Put the project root (the directory two levels above this file) at the
# front of sys.path, unless it is already listed, so `src.*` imports resolve.
_PROJECT_ROOT = str(Path(__file__).resolve().parents[1])
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
from src.config import load_settings
from src.models import ChunkStrategy
from src.provider import create_embeddings
from src.ingestion.pipeline import IngestionPipeline
from src.retrieval.embedder import Embedder
from src.retrieval.vector_store import VectorStore
# Module-level logger named after this module; level and format are set up
# by the logging.basicConfig() call inside main().
logger = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Both options default to ``None`` so the caller can distinguish
    "not supplied" from an explicit value and apply its own fallback.

    Returns:
        Namespace exposing ``docs_dir`` and ``strategy``.
    """
    # Valid --strategy values are exactly the ChunkStrategy member values.
    strategy_choices = [member.value for member in ChunkStrategy]

    cli = argparse.ArgumentParser(
        description="Ingest PDF documents into the Qdrant vector store.",
    )
    cli.add_argument(
        "--docs-dir",
        type=str,
        default=None,
        help="Path to the directory containing PDFs (default: <project_root>/docs).",
    )
    cli.add_argument(
        "--strategy",
        type=str,
        choices=strategy_choices,
        default=None,
        help="Chunking strategy (default: from CHUNK_STRATEGY env or 'recursive').",
    )

    namespace = cli.parse_args()
    return namespace
def main() -> None:
    """Run the full ingestion pipeline and persist results to Qdrant.

    Steps:
        1. Parse CLI arguments, load settings, and configure logging.
        2. Parse and chunk the PDFs found in the docs directory.
        3. Embed the chunk texts.
        4. Add the chunks and their vectors to the vector store.

    The chunking strategy is resolved in this order: the ``--strategy``
    flag, then the ``CHUNK_STRATEGY`` environment variable, then
    ``"recursive"`` -- matching what the ``--strategy`` help text promises.

    Returns early, after logging a warning, when no chunks are produced.

    Raises:
        SystemExit: with status 1 when the resolved strategy (in practice a
            bad ``CHUNK_STRATEGY`` value, since argparse already validates
            the flag) is not a valid ``ChunkStrategy``.
    """
    # Local import: only this function needs the environment lookup.
    import os

    args = parse_args()
    settings = load_settings()

    logging.basicConfig(
        # Fall back to INFO when settings.log_level is not a known level name.
        level=getattr(logging, settings.log_level.upper(), logging.INFO),
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    docs_dir = args.docs_dir or str(Path(_PROJECT_ROOT) / "docs")

    # Precedence: CLI flag > CHUNK_STRATEGY env var > built-in default.
    env_strategy = os.environ.get("CHUNK_STRATEGY", "").strip()
    strategy_value = args.strategy or env_strategy or "recursive"
    try:
        strategy = ChunkStrategy(strategy_value)
    except ValueError:
        # The env value bypasses argparse's `choices` validation, so report
        # a bad value clearly instead of letting a traceback escape.
        logger.error(
            "Invalid chunk strategy %r; expected one of: %s",
            strategy_value,
            ", ".join(s.value for s in ChunkStrategy),
        )
        sys.exit(1)

    logger.info("=== Doc Assistant — Ingestion ===")
    logger.info("Docs directory : %s", docs_dir)
    logger.info("Chunk strategy : %s", strategy.value)
    logger.info("Chunk size : %d", settings.chunk_size)
    logger.info("Chunk overlap : %d", settings.chunk_overlap)
    logger.info("Qdrant path : %s", settings.qdrant_path)

    # 1. Parse and chunk PDFs
    pipeline = IngestionPipeline(
        strategy=strategy,
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
    )
    chunks = pipeline.ingest_directory(docs_dir)
    if not chunks:
        logger.warning("No chunks produced — nothing to ingest.")
        return

    # 2. Embed chunks
    embeddings_model = create_embeddings(settings)
    embedder = Embedder(embeddings_model)
    texts = [chunk.text for chunk in chunks]
    logger.info("Embedding %d chunks ...", len(texts))
    vectors = embedder.embed_batch(texts)

    # 3. Store in Qdrant (persistent path from config)
    store = VectorStore(
        path=settings.qdrant_path,
        collection_name=settings.collection_name,
        dimension=settings.embedding_dimension,
        url=settings.qdrant_url,
    )
    store.add_chunks(chunks, vectors)
    logger.info("=== Ingestion complete: %d chunks indexed ===", len(chunks))
# Run the pipeline only when executed as a script (e.g. `python -m
# scripts.ingest`), not when this module is imported.
if __name__ == "__main__":
    main()
|