#!/usr/bin/env python3
"""CLI script to ingest medical corpus into ChromaDB vector store.
Usage:
# Ingest seed corpus (JSON files from data/medical_corpus/)
python -m scripts.ingest
# Ingest with reset (clear existing data first)
python -m scripts.ingest --reset
# Scrape from a specific source
python -m scripts.ingest --scrape japi
# Ingest a PDF file
python -m scripts.ingest --pdf /path/to/textbook.pdf
# Show corpus statistics
python -m scripts.ingest --stats
"""
import argparse
import logging
import sys
from pathlib import Path
# Add backend to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.core.rag.vector_store import MedicalVectorStore
from app.core.rag.retriever import MedicalRetriever
# Console logging for the CLI: timestamped records tagged with the logger name,
# e.g. "2024-01-01 12:00:00 [ingest] INFO: ...". Applies process-wide since
# basicConfig configures the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("ingest")
def _print_stats(retriever: MedicalRetriever) -> None:
    """Print corpus statistics (document/case counts, specialties, status) to stdout."""
    stats = retriever.get_corpus_stats()
    print("\n=== Clinical-Mind RAG Corpus Statistics ===")
    print(f" Total documents: {stats['total_documents']}")
    print(f" Total cases: {stats['total_cases']}")
    print(f" Specialties: {', '.join(stats['specialties'])}")
    print(f" Status: {stats['status']}")
    print()


def _scrape_and_ingest(store: MedicalVectorStore, source: str) -> None:
    """Scrape one source, persist the cases as JSON, and re-ingest the full corpus."""
    # Lazy import: scraper dependencies are only needed for this code path.
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Scraping from: %s", source)
    cases = scraper.scrape_source(source)
    if cases:
        scraper.save_scraped_cases(cases, f"{source}_scraped.json")
        # Re-ingest all corpus (includes newly scraped)
        count = store.ingest_corpus()
        logger.info("Ingested %s total chunks", count)
    else:
        logger.warning("No cases scraped. Check source URL and network connectivity.")


def _ingest_pdf_file(store: MedicalVectorStore, pdf_path: str) -> None:
    """Extract cases from a PDF, persist them as JSON, and re-ingest the full corpus."""
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Processing PDF: %s", pdf_path)
    cases = scraper.ingest_pdf(pdf_path)
    if cases:
        pdf_name = Path(pdf_path).stem
        scraper.save_scraped_cases(cases, f"pdf_{pdf_name}.json")
        count = store.ingest_corpus()
        logger.info("Ingested %s total chunks", count)
    else:
        logger.warning("No cases extracted from PDF.")


def _run_query(store: MedicalVectorStore, query_text: str, specialty: str | None) -> None:
    """Run a test query against the vector store and print the top results."""
    logger.info("Querying: '%s'", query_text)
    results = store.query(
        query_text=query_text,
        specialty=specialty,
        n_results=3,
    )
    print(f"\n=== Query Results ({len(results)} found) ===\n")
    for i, r in enumerate(results, 1):
        print(f"--- Result {i} (score: {r['relevance_score']:.3f}) ---")
        print(f" Title: {r['metadata'].get('title', 'N/A')}")
        print(f" Specialty: {r['metadata'].get('specialty', 'N/A')}")
        print(f" Type: {r['metadata'].get('chunk_type', 'N/A')}")
        print(f" Preview: {r['content'][:200]}...")
        print()


def main():
    """Parse CLI arguments and dispatch to the requested corpus operation.

    One operation runs per invocation: --stats, --scrape, --pdf, and --query
    each return after completing. --reset is not exclusive: it clears the
    store first, then the remaining flags (or the default seed-corpus
    ingestion) run against the emptied store.
    """
    parser = argparse.ArgumentParser(description="Clinical-Mind RAG Corpus Ingestion")
    parser.add_argument("--reset", action="store_true", help="Reset vector store before ingesting")
    parser.add_argument("--stats", action="store_true", help="Show corpus statistics")
    parser.add_argument("--scrape", type=str, help="Scrape source (japi, ijmr, ijem)")
    parser.add_argument("--pdf", type=str, help="Ingest a PDF file")
    parser.add_argument("--corpus-dir", type=str, help="Custom corpus directory path")
    parser.add_argument("--query", type=str, help="Test query against the vector store")
    parser.add_argument("--specialty", type=str, help="Filter by specialty (for query)")
    args = parser.parse_args()

    store = MedicalVectorStore()
    retriever = MedicalRetriever(store)

    if args.stats:
        _print_stats(retriever)
        return

    if args.reset:
        logger.info("Resetting vector store...")
        store.reset()
        logger.info("Vector store cleared")

    if args.scrape:
        _scrape_and_ingest(store, args.scrape)
        return

    if args.pdf:
        _ingest_pdf_file(store, args.pdf)
        return

    if args.query:
        _run_query(store, args.query, args.specialty)
        return

    # Default: ingest seed corpus
    logger.info("Ingesting seed medical corpus into ChromaDB...")
    count = store.ingest_corpus(args.corpus_dir or None)
    logger.info("Done! Ingested %s document chunks", count)

    # Show stats
    stats = retriever.get_corpus_stats()
    print("\n=== Ingestion Complete ===")
    print(f" Documents: {stats['total_documents']}")
    print(f" Cases: {stats['total_cases']}")
    print(f" Specialties: {', '.join(stats['specialties'])}")
    print()
# Standard script entry-point guard: run main() only when executed directly
# (e.g. `python -m scripts.ingest`), not when imported as a module.
if __name__ == "__main__":
    main()