Spaces:

DenysKovalML
/

scientific-rag

Sleeping

App Files Community

DenysKovalML committed on Dec 14, 2025

Commit

8ddd8e2

1 Parent(s): 6c023b4

fix: add batching to qdrant index

Browse files

Files changed (2) hide show

src/scientific_rag/cli.py +4 -0
src/scientific_rag/scripts/index_qdrant.py +53 -25

src/scientific_rag/cli.py CHANGED Viewed

@@ -24,6 +24,7 @@ def index(
     embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
     upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
     create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
 ) -> None:
     """Embed chunks and upload to Qdrant."""
     chunks_path = Path(chunks_file) if chunks_file else None
@@ -32,6 +33,7 @@ def index(
         embedding_batch_size=embedding_batch_size,
         upload_batch_size=upload_batch_size,
         create_collection=create_collection,
     )
@@ -41,6 +43,7 @@ def pipeline(
     embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
     upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
     create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
 ) -> None:
     """Run complete pipeline: chunk → embed → index."""
     logger.info("Step 1/2: Chunking data")
@@ -52,6 +55,7 @@ def pipeline(
         embedding_batch_size=embedding_batch_size,
         upload_batch_size=upload_batch_size,
         create_collection=create_collection,
     )

     embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
     upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
     create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
+    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
 ) -> None:
     """Embed chunks and upload to Qdrant."""
     chunks_path = Path(chunks_file) if chunks_file else None
         embedding_batch_size=embedding_batch_size,
         upload_batch_size=upload_batch_size,
         create_collection=create_collection,
+        process_batch_size=process_batch_size,
     )
     embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
     upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
     create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
+    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
 ) -> None:
     """Run complete pipeline: chunk → embed → index."""
     logger.info("Step 1/2: Chunking data")
         embedding_batch_size=embedding_batch_size,
         upload_batch_size=upload_batch_size,
         create_collection=create_collection,
+        process_batch_size=process_batch_size,
     )

src/scientific_rag/scripts/index_qdrant.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import json
 from pathlib import Path
@@ -10,17 +11,21 @@ from scientific_rag.infrastructure.qdrant import QdrantService
 from scientific_rag.settings import settings
-def load_chunks(chunks_file: Path) -> list[PaperChunk]:
-    """Load chunks from JSON file."""
-    logger.info(f"Loading chunks from {chunks_file}")
     with open(chunks_file, encoding="utf-8") as f:
         chunks_data = json.load(f)
-    chunks = [PaperChunk(**chunk_data) for chunk_data in chunks_data]
-    logger.info(f"Loaded {len(chunks)} chunks")
-    return chunks
 def embed_chunks(
@@ -51,18 +56,17 @@ def index_chunks_to_qdrant(
     chunks: list[PaperChunk],
     qdrant_service: QdrantService,
     batch_size: int = 100,
 ) -> int:
     """Upload chunks to Qdrant in batches."""
-    logger.info(f"Indexing {len(chunks)} chunks to Qdrant")
     total_uploaded = 0
-    for i in tqdm(range(0, len(chunks), batch_size), desc="Uploading to Qdrant"):
         batch = chunks[i : i + batch_size]
         uploaded = qdrant_service.upsert_chunks(batch)
         total_uploaded += uploaded
-    logger.success(f"Indexed {total_uploaded} chunks to Qdrant")
     return total_uploaded
@@ -71,8 +75,17 @@ def index_qdrant(
     embedding_batch_size: int = 32,
     upload_batch_size: int = 100,
     create_collection: bool = True,
 ) -> dict[str, int]:
-    """Complete pipeline to index chunks to Qdrant."""
     if chunks_file is None:
         chunks_file = Path(settings.root_dir) / "data" / "processed" / f"chunks_{settings.dataset_split}.json"
     else:
@@ -85,27 +98,42 @@ def index_qdrant(
     if create_collection:
         qdrant_service.create_collection(vector_size=encoder.embedding_dim)
-    chunks = load_chunks(chunks_file)
-    chunks = embed_chunks(
-        chunks=chunks,
-        batch_size=embedding_batch_size,
-        show_progress=True,
-    )
-    total_uploaded = index_chunks_to_qdrant(
-        chunks=chunks,
-        qdrant_service=qdrant_service,
-        batch_size=upload_batch_size,
-    )
     collection_info = qdrant_service.get_collection_info()
     stats = {
-        "chunks_loaded": len(chunks),
         "chunks_uploaded": total_uploaded,
         "collection_points": collection_info.get("points_count", 0),
         "collection_vectors": collection_info.get("index_vectors_count", 0),
     }
-    logger.info(f"Indexing complete: {stats}")
     return stats

+from collections.abc import Iterator
 import json
 from pathlib import Path
 from scientific_rag.settings import settings
+def load_chunks_generator(chunks_file: Path, batch_size: int = 10000) -> Iterator[list[PaperChunk]]:
+    logger.info(f"Loading chunks from {chunks_file} in batches of {batch_size}")
     with open(chunks_file, encoding="utf-8") as f:
         chunks_data = json.load(f)
+    total_chunks = len(chunks_data)
+    logger.info(f"Found {total_chunks} chunks in file")
+    for i in range(0, total_chunks, batch_size):
+        batch_data = chunks_data[i : i + batch_size]
+        batch_chunks = [PaperChunk(**chunk_data) for chunk_data in batch_data]
+        yield batch_chunks
+    del chunks_data
 def embed_chunks(
     chunks: list[PaperChunk],
     qdrant_service: QdrantService,
     batch_size: int = 100,
+    show_progress: bool = True,
 ) -> int:
     """Upload chunks to Qdrant in batches."""
     total_uploaded = 0
+    iterator = tqdm(range(0, len(chunks), batch_size), desc="Uploading to Qdrant", disable=not show_progress)
+    for i in iterator:
         batch = chunks[i : i + batch_size]
         uploaded = qdrant_service.upsert_chunks(batch)
         total_uploaded += uploaded
     return total_uploaded
     embedding_batch_size: int = 32,
     upload_batch_size: int = 100,
     create_collection: bool = True,
+    process_batch_size: int = 10000,
 ) -> dict[str, int]:
+    """Complete pipeline to index chunks to Qdrant.
+    Args:
+        chunks_file: Path to chunks JSON file
+        embedding_batch_size: Batch size for embedding generation
+        upload_batch_size: Batch size for Qdrant upload
+        create_collection: Whether to create the collection
+        process_batch_size: Process chunks in batches of this size to manage memory
+    """
     if chunks_file is None:
         chunks_file = Path(settings.root_dir) / "data" / "processed" / f"chunks_{settings.dataset_split}.json"
     else:
     if create_collection:
         qdrant_service.create_collection(vector_size=encoder.embedding_dim)
+    logger.info("Processing chunks in streaming batches to manage memory...")
+    total_uploaded = 0
+    batch_num = 0
+    for batch_chunks in load_chunks_generator(chunks_file, batch_size=process_batch_size):
+        batch_num += 1
+        batch_start = (batch_num - 1) * process_batch_size
+        batch_end = batch_start + len(batch_chunks)
+        logger.info(f"Batch {batch_num}: Embedding chunks {batch_start}-{batch_end} ({len(batch_chunks)} chunks)...")
+        batch_chunks = embed_chunks(
+            chunks=batch_chunks,
+            batch_size=embedding_batch_size,
+            show_progress=True,
+        )
+        logger.info(f"Batch {batch_num}: Uploading chunks {batch_start}-{batch_end} to Qdrant...")
+        batch_uploaded = index_chunks_to_qdrant(
+            chunks=batch_chunks,
+            qdrant_service=qdrant_service,
+            batch_size=upload_batch_size,
+            show_progress=True,
+        )
+        total_uploaded += batch_uploaded
+        logger.success(f"Batch {batch_num} complete: {batch_uploaded} chunks uploaded (Total: {total_uploaded})")
+    logger.info("Getting final statistics...")
     collection_info = qdrant_service.get_collection_info()
     stats = {
         "chunks_uploaded": total_uploaded,
         "collection_points": collection_info.get("points_count", 0),
         "collection_vectors": collection_info.get("index_vectors_count", 0),
     }
+    logger.success(f"Indexing complete: {stats}")
     return stats