Spaces:
Sleeping
Sleeping
File size: 3,336 Bytes
50fec80 95f9304 50fec80 8ddd8e2 50fec80 8ddd8e2 50fec80 8ddd8e2 50fec80 8ddd8e2 50fec80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
from pathlib import Path
from typing import Optional

import typer
from loguru import logger

from scientific_rag.scripts.chunk_data import chunk_data
from scientific_rag.scripts.index_qdrant import index_qdrant
app = typer.Typer(name="scientific-rag", help="Scientific RAG data pipeline CLI", add_completion=False)
@app.command()
def chunk(
    batch_size: int = typer.Option(
        10000, "--batch-size", "-b", help="Papers per batch"
    ),
) -> None:
    """Read the source papers and write out text chunks.

    Thin CLI wrapper: all the work happens in ``chunk_data``.
    """
    chunk_data(batch_size=batch_size)
@app.command()
def index(
    # Fix: the option is optional (defaults to None), so the annotation must be
    # Optional[str], not str — the original annotation lied about the type.
    chunks_file: Optional[str] = typer.Option(None, "--chunks-file", "-f", help="Path to chunks JSON file"),
    embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
    upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
    create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
) -> None:
    """Embed chunks and upload to Qdrant.

    When ``--chunks-file`` is omitted, ``index_qdrant`` falls back to its own
    default location (it receives ``chunks_file=None``).
    """
    # Convert the CLI string to a Path only when one was actually supplied.
    chunks_path = Path(chunks_file) if chunks_file else None
    index_qdrant(
        chunks_file=chunks_path,
        embedding_batch_size=embedding_batch_size,
        upload_batch_size=upload_batch_size,
        create_collection=create_collection,
        process_batch_size=process_batch_size,
    )
@app.command()
def pipeline(
    chunk_batch_size: int = typer.Option(10000, "--chunk-batch-size", "-cb"),
    embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
    upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
    create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
) -> None:
    """Run the full pipeline end to end: chunk the data, then embed and index it."""
    # Stage 1: produce chunks from the raw papers.
    logger.info("Step 1/2: Chunking data")
    chunk_data(batch_size=chunk_batch_size)

    # Stage 2: embed the freshly produced chunks (default location, hence
    # chunks_file=None) and push them into Qdrant.
    logger.info("Step 2/2: Indexing to Qdrant")
    index_qdrant(
        chunks_file=None,
        embedding_batch_size=embedding_batch_size,
        upload_batch_size=upload_batch_size,
        create_collection=create_collection,
        process_batch_size=process_batch_size,
    )
@app.command()
def info() -> None:
    """Print the pipeline configuration and the state of the Qdrant collection."""
    # Imported lazily so the other commands don't pay for Qdrant/settings setup.
    from scientific_rag.infrastructure.qdrant import QdrantService
    from scientific_rag.settings import settings

    logger.info(f"Dataset: {settings.dataset_name} ({settings.dataset_split})")
    logger.info(f"Chunk: size={settings.chunk_size}, overlap={settings.chunk_overlap}")
    logger.info(f"Embedding: {settings.embedding_model_name}")
    logger.info(f"Qdrant: {settings.qdrant_url} / {settings.qdrant_collection_name}")

    try:
        service = QdrantService()
        collection_info = service.get_collection_info()
        if not collection_info["exists"]:
            logger.warning(f"Collection '{settings.qdrant_collection_name}' does not exist")
        else:
            logger.info(
                f"Collection: {collection_info['points_count']} points, {collection_info['indexed_vectors_count']} indexed vectors, status={collection_info['status']}"
            )
    except Exception as exc:
        # Best-effort status report: a down/unreachable Qdrant is logged, not fatal.
        logger.error(f"Qdrant connection failed: {exc}")
# Entry point when the module is executed directly (e.g. `python cli.py`).
if __name__ == "__main__":
    app()
|