File size: 4,729 Bytes
69832ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""CLI script to ingest medical corpus into ChromaDB vector store.

Usage:
    # Ingest seed corpus (JSON files from data/medical_corpus/)
    python -m scripts.ingest

    # Ingest with reset (clear existing data first)
    python -m scripts.ingest --reset

    # Scrape from a specific source
    python -m scripts.ingest --scrape japi

    # Ingest a PDF file
    python -m scripts.ingest --pdf /path/to/textbook.pdf

    # Show corpus statistics
    python -m scripts.ingest --stats
"""

import argparse
import logging
import sys
from pathlib import Path

# Add backend to path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.core.rag.vector_store import MedicalVectorStore
from app.core.rag.retriever import MedicalRetriever

# Configure root logging once for the whole CLI run; modules imported above
# (vector_store, retriever, scraper) inherit this handler/format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
# Script-local logger; named "ingest" rather than __name__ so log lines are
# stable whether the script is run as a module or directly.
logger = logging.getLogger("ingest")


def _print_stats(retriever) -> None:
    """Print corpus statistics (document/case counts, specialties, status)."""
    stats = retriever.get_corpus_stats()
    print("\n=== Clinical-Mind RAG Corpus Statistics ===")
    print(f"  Total documents:  {stats['total_documents']}")
    print(f"  Total cases:      {stats['total_cases']}")
    print(f"  Specialties:      {', '.join(stats['specialties'])}")
    print(f"  Status:           {stats['status']}")
    print()


def _scrape_and_ingest(store, source: str) -> None:
    """Scrape *source* (e.g. 'japi'), save results, and re-ingest the corpus."""
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Scraping from: %s", source)
    cases = scraper.scrape_source(source)
    if cases:
        scraper.save_scraped_cases(cases, f"{source}_scraped.json")
        # Re-ingest all corpus (includes newly scraped)
        count = store.ingest_corpus()
        logger.info("Ingested %d total chunks", count)
    else:
        logger.warning("No cases scraped. Check source URL and network connectivity.")


def _ingest_pdf(store, pdf_path: str) -> None:
    """Extract cases from a PDF at *pdf_path*, save them, and re-ingest."""
    from app.core.rag.scraper import MedicalCorpusScraper

    scraper = MedicalCorpusScraper()
    logger.info("Processing PDF: %s", pdf_path)
    cases = scraper.ingest_pdf(pdf_path)
    if cases:
        pdf_name = Path(pdf_path).stem
        scraper.save_scraped_cases(cases, f"pdf_{pdf_name}.json")
        count = store.ingest_corpus()
        logger.info("Ingested %d total chunks", count)
    else:
        logger.warning("No cases extracted from PDF.")


def _run_query(store, query_text: str, specialty) -> None:
    """Run a test query against the vector store and print the top 3 hits."""
    logger.info("Querying: '%s'", query_text)
    results = store.query(
        query_text=query_text,
        specialty=specialty,
        n_results=3,
    )
    print(f"\n=== Query Results ({len(results)} found) ===\n")
    for i, r in enumerate(results, 1):
        print(f"--- Result {i} (score: {r['relevance_score']:.3f}) ---")
        print(f"  Title:     {r['metadata'].get('title', 'N/A')}")
        print(f"  Specialty: {r['metadata'].get('specialty', 'N/A')}")
        print(f"  Type:      {r['metadata'].get('chunk_type', 'N/A')}")
        print(f"  Preview:   {r['content'][:200]}...")
        print()


def main():
    """Entry point for the Clinical-Mind RAG corpus ingestion CLI.

    Dispatches on action flags in priority order: --stats, then (optional
    --reset), --scrape, --pdf, --query. With no action flag, ingests the
    seed corpus from the default (or --corpus-dir) location and prints a
    summary. Each action delegates to a private helper above.
    """
    parser = argparse.ArgumentParser(description="Clinical-Mind RAG Corpus Ingestion")
    parser.add_argument("--reset", action="store_true", help="Reset vector store before ingesting")
    parser.add_argument("--stats", action="store_true", help="Show corpus statistics")
    parser.add_argument("--scrape", type=str, help="Scrape source (japi, ijmr, ijem)")
    parser.add_argument("--pdf", type=str, help="Ingest a PDF file")
    parser.add_argument("--corpus-dir", type=str, help="Custom corpus directory path")
    parser.add_argument("--query", type=str, help="Test query against the vector store")
    parser.add_argument("--specialty", type=str, help="Filter by specialty (for query)")
    args = parser.parse_args()

    store = MedicalVectorStore()
    retriever = MedicalRetriever(store)

    # --stats is read-only and takes priority over every other flag.
    if args.stats:
        _print_stats(retriever)
        return

    # --reset is not exclusive: it may precede scrape/pdf/default ingestion.
    if args.reset:
        logger.info("Resetting vector store...")
        store.reset()
        logger.info("Vector store cleared")

    if args.scrape:
        _scrape_and_ingest(store, args.scrape)
        return

    if args.pdf:
        _ingest_pdf(store, args.pdf)
        return

    if args.query:
        _run_query(store, args.query, args.specialty)
        return

    # Default: ingest seed corpus. Empty-string --corpus-dir falls back to
    # the store's default directory, matching the original behavior.
    logger.info("Ingesting seed medical corpus into ChromaDB...")
    count = store.ingest_corpus(args.corpus_dir or None)
    logger.info("Done! Ingested %d document chunks", count)

    # Show stats
    stats = retriever.get_corpus_stats()
    print("\n=== Ingestion Complete ===")
    print(f"  Documents: {stats['total_documents']}")
    print(f"  Cases:     {stats['total_cases']}")
    print(f"  Specialties: {', '.join(stats['specialties'])}")
    print()


# Support both `python -m scripts.ingest` and direct execution.
if __name__ == "__main__":
    main()