#!/usr/bin/env python3
"""CLI script to ingest medical corpus into ChromaDB vector store.

Usage:
    # Ingest seed corpus (JSON files from data/medical_corpus/)
    python -m scripts.ingest

    # Ingest with reset (clear existing data first)
    python -m scripts.ingest --reset

    # Scrape from a specific source
    python -m scripts.ingest --scrape japi

    # Ingest a PDF file
    python -m scripts.ingest --pdf /path/to/textbook.pdf

    # Show corpus statistics
    python -m scripts.ingest --stats
"""

import argparse
import logging
import sys
from pathlib import Path
from typing import Optional, Sequence

# Add backend to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.core.rag.vector_store import MedicalVectorStore
from app.core.rag.retriever import MedicalRetriever

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("ingest")


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing every supported CLI flag."""
    parser = argparse.ArgumentParser(description="Clinical-Mind RAG Corpus Ingestion")
    parser.add_argument("--reset", action="store_true", help="Reset vector store before ingesting")
    parser.add_argument("--stats", action="store_true", help="Show corpus statistics")
    parser.add_argument("--scrape", type=str, help="Scrape source (japi, ijmr, ijem)")
    parser.add_argument("--pdf", type=str, help="Ingest a PDF file")
    parser.add_argument("--corpus-dir", type=str, help="Custom corpus directory path")
    parser.add_argument("--query", type=str, help="Test query against the vector store")
    parser.add_argument("--specialty", type=str, help="Filter by specialty (for query)")
    return parser


def _show_stats(retriever: MedicalRetriever) -> None:
    """Print the corpus statistics reported by ``retriever.get_corpus_stats()``."""
    stats = retriever.get_corpus_stats()
    print("\n=== Clinical-Mind RAG Corpus Statistics ===")
    print(f" Total documents: {stats['total_documents']}")
    print(f" Total cases: {stats['total_cases']}")
    print(f" Specialties: {', '.join(stats['specialties'])}")
    print(f" Status: {stats['status']}")
    print()


def _save_and_reingest(store, scraper, cases, filename: str, empty_message: str) -> None:
    """Save extracted cases under ``filename`` and re-ingest the corpus.

    When ``cases`` is empty/falsy nothing is saved or ingested and
    ``empty_message`` is logged as a warning instead.
    """
    if not cases:
        logger.warning(empty_message)
        return

    scraper.save_scraped_cases(cases, filename)
    # Re-ingest all corpus (includes newly saved cases). The default corpus
    # location is used here; --corpus-dir is not consulted on this path.
    count = store.ingest_corpus()
    logger.info("Ingested %s total chunks", count)


def _scrape_and_ingest(store: MedicalVectorStore, source: str) -> None:
    """Scrape ``source``, save any cases found, and re-ingest the corpus."""
    # Imported lazily so the scraper module is only loaded when actually needed.
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Scraping from: %s", source)
    cases = scraper.scrape_source(source)
    _save_and_reingest(
        store,
        scraper,
        cases,
        f"{source}_scraped.json",
        "No cases scraped. Check source URL and network connectivity.",
    )


def _ingest_pdf(store: MedicalVectorStore, pdf_path: str) -> None:
    """Extract cases from the PDF at ``pdf_path``, save them, and re-ingest."""
    # Imported lazily so the scraper module is only loaded when actually needed.
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Processing PDF: %s", pdf_path)
    cases = scraper.ingest_pdf(pdf_path)
    _save_and_reingest(
        store,
        scraper,
        cases,
        f"pdf_{Path(pdf_path).stem}.json",
        "No cases extracted from PDF.",
    )


def _run_query(store: MedicalVectorStore, query_text: str, specialty: Optional[str]) -> None:
    """Run a test query (top 3 results) against the store and print each hit."""
    logger.info("Querying: '%s'", query_text)
    results = store.query(
        query_text=query_text,
        specialty=specialty,
        n_results=3,
    )
    print(f"\n=== Query Results ({len(results)} found) ===\n")
    for i, r in enumerate(results, 1):
        metadata = r['metadata']
        print(f"--- Result {i} (score: {r['relevance_score']:.3f}) ---")
        print(f" Title: {metadata.get('title', 'N/A')}")
        print(f" Specialty: {metadata.get('specialty', 'N/A')}")
        print(f" Type: {metadata.get('chunk_type', 'N/A')}")
        # Only the first 200 characters of the chunk are shown.
        print(f" Preview: {r['content'][:200]}...")
        print()


def _ingest_seed_corpus(
    store: MedicalVectorStore,
    retriever: MedicalRetriever,
    corpus_dir: Optional[str],
) -> None:
    """Ingest the seed corpus (optionally from ``corpus_dir``) and print a summary."""
    logger.info("Ingesting seed medical corpus into ChromaDB...")
    # An empty/absent --corpus-dir is passed through as None.
    count = store.ingest_corpus(corpus_dir or None)
    logger.info("Done! Ingested %s document chunks", count)

    # Show stats
    stats = retriever.get_corpus_stats()
    print("\n=== Ingestion Complete ===")
    print(f" Documents: {stats['total_documents']}")
    print(f" Cases: {stats['total_cases']}")
    print(f" Specialties: {', '.join(stats['specialties'])}")
    print()


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Entry point for the corpus ingestion CLI.

    Exactly one action runs per invocation, chosen in this order:
    ``--stats``, ``--scrape``, ``--pdf``, ``--query``, and otherwise the
    default seed-corpus ingestion. ``--reset`` clears the store before any
    action except ``--stats``, which returns before the reset is applied.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour when run as a script.
    """
    args = _build_parser().parse_args(argv)

    store = MedicalVectorStore()
    retriever = MedicalRetriever(store)

    if args.stats:
        _show_stats(retriever)
        return

    if args.reset:
        logger.info("Resetting vector store...")
        store.reset()
        logger.info("Vector store cleared")

    if args.scrape:
        _scrape_and_ingest(store, args.scrape)
        return

    if args.pdf:
        _ingest_pdf(store, args.pdf)
        return

    if args.query:
        _run_query(store, args.query, args.specialty)
        return

    # Default: ingest seed corpus
    _ingest_seed_corpus(store, retriever, args.corpus_dir)


if __name__ == "__main__":
    main()