Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from loguru import logger | |
| import typer | |
| from scientific_rag.scripts.chunk_data import chunk_data | |
| from scientific_rag.scripts.index_qdrant import index_qdrant | |
# Root Typer application for the CLI; the functions below are its commands.
app = typer.Typer(
    name="scientific-rag",
    help="Scientific RAG data pipeline CLI",
    add_completion=False,
)
# Registered explicitly: without the decorator the Typer app never learns
# about this function and cannot dispatch to it.
# NOTE: Typer uses the docstring as the command's --help text, so maintainer
# notes are kept in comments and the docstring is left short.
@app.command()
def chunk(
    batch_size: int = typer.Option(10000, "--batch-size", "-b", help="Papers per batch"),
) -> None:
    """Process papers and generate chunks."""
    # Thin CLI wrapper: the option value is forwarded unchanged.
    chunk_data(batch_size=batch_size)
# Registered explicitly: without the decorator the Typer app never learns
# about this function and cannot dispatch to it.
# NOTE: Typer uses the docstring as the command's --help text, so maintainer
# notes are kept in comments and the docstring is left short.
@app.command()
def index(
    chunks_file: str = typer.Option(None, "--chunks-file", "-f", help="Path to chunks JSON file"),
    embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
    upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
    create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
) -> None:
    """Embed chunks and upload to Qdrant."""
    # An omitted (or empty) --chunks-file is forwarded as None; how
    # index_qdrant chooses its input in that case is not visible in this file.
    chunks_path = Path(chunks_file) if chunks_file else None

    # All remaining options are passed through unchanged.
    index_qdrant(
        chunks_file=chunks_path,
        embedding_batch_size=embedding_batch_size,
        upload_batch_size=upload_batch_size,
        create_collection=create_collection,
        process_batch_size=process_batch_size,
    )
# Registered explicitly: without the decorator the Typer app never learns
# about this function and cannot dispatch to it.
# NOTE: Typer uses the docstring as the command's --help text, so maintainer
# notes are kept in comments and the docstring is left short.
@app.command()
def pipeline(
    chunk_batch_size: int = typer.Option(10000, "--chunk-batch-size", "-cb"),
    embedding_batch_size: int = typer.Option(32, "--embedding-batch-size", "-eb"),
    upload_batch_size: int = typer.Option(100, "--upload-batch-size", "-ub"),
    create_collection: bool = typer.Option(True, "--create-collection/--no-create-collection"),
    process_batch_size: int = typer.Option(10000, "--process-batch-size", "-pb", help="Process chunks in batches"),
) -> None:
    """Run complete pipeline: chunk → embed → index."""
    # Step 1: chunking. An exception here propagates, so indexing is skipped.
    logger.info("Step 1/2: Chunking data")
    chunk_data(batch_size=chunk_batch_size)

    # Step 2: indexing. chunks_file is always None here; the pipeline offers
    # no option to point at a specific chunks file.
    logger.info("Step 2/2: Indexing to Qdrant")
    index_qdrant(
        chunks_file=None,
        embedding_batch_size=embedding_batch_size,
        upload_batch_size=upload_batch_size,
        create_collection=create_collection,
        process_batch_size=process_batch_size,
    )
# Registered explicitly: without the decorator the Typer app never learns
# about this function and cannot dispatch to it.
# NOTE: Typer uses the docstring as the command's --help text, so maintainer
# notes are kept in comments and the docstring is left short.
@app.command()
def info() -> None:
    """Show pipeline configuration and Qdrant status."""
    # Imported inside the function rather than at module level; presumably to
    # keep the other commands from loading these modules (verify before moving).
    from scientific_rag.infrastructure.qdrant import QdrantService
    from scientific_rag.settings import settings

    logger.info(f"Dataset: {settings.dataset_name} ({settings.dataset_split})")
    logger.info(f"Chunk: size={settings.chunk_size}, overlap={settings.chunk_overlap}")
    logger.info(f"Embedding: {settings.embedding_model_name}")
    logger.info(f"Qdrant: {settings.qdrant_url} / {settings.qdrant_collection_name}")

    # Best-effort status probe: any failure is logged, never raised, so the
    # configuration summary above is always shown.
    try:
        qdrant = QdrantService()
        # Named collection_info so the local does not shadow this command's own name.
        collection_info = qdrant.get_collection_info()
        if collection_info["exists"]:
            logger.info(
                f"Collection: {collection_info['points_count']} points, {collection_info['indexed_vectors_count']} indexed vectors, status={collection_info['status']}"
            )
        else:
            logger.warning(f"Collection '{settings.qdrant_collection_name}' does not exist")
    except Exception as e:
        logger.error(f"Qdrant connection failed: {e}")
if __name__ == "__main__":
    # Run the Typer application only when this file is executed as a script.
    app()