File size: 4,064 Bytes
8aa3867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117b594
 
 
8aa3867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import json
import chromadb
from chromadb.utils import embedding_functions

class MindGuardVectorDB:
    """
    Build the MindGuard RAG knowledge base inside a persistent ChromaDB store.

    The builder reads a JSON file of coping strategies, turns every entry into
    a (document, metadata, id) triple and upserts the batch into a ChromaDB
    collection that embeds the documents with a SentenceTransformer model.

    Attributes:
        script_dir: Absolute directory of this source file.
        project_root: ``script_dir`` resolved two levels up.
        knowledge_base_path: JSON file holding the coping strategies.
        chroma_db_dir: Directory where ChromaDB persists its data.
        chroma_client: The ``chromadb.PersistentClient`` bound to ``chroma_db_dir``.
        embedding_fn: Embedding function handed to the collection.
        collection: The ChromaDB collection that receives the strategies.
    """

    def __init__(
        self,
        *,
        knowledge_base_path=None,
        chroma_db_dir=None,
        model_name="BAAI/bge-base-en-v1.5",
        collection_name="clinical_guidelines",
    ):
        """
        Resolve the project paths and connect to the ChromaDB collection.

        Calling the constructor without arguments behaves exactly as before;
        the keyword-only parameters exist so tests and other pipelines can
        point the builder elsewhere.

        Args:
            knowledge_base_path: JSON file to ingest. Defaults to
                ``<project_root>/data/knowledge_base/coping_strategies.json``.
            chroma_db_dir: Persistence directory. Defaults to
                ``<project_root>/artifacts/chroma_db``. Created if missing.
            model_name: SentenceTransformer model used for the embeddings.
            collection_name: Name of the collection to create or load.
        """
        print("🗄️ Initializing MindGuard Vector Database Builder...")

        # --- STRICT ARCHITECTURE PATHING ---
        # Everything is anchored on this file's location, so the script works
        # regardless of the current working directory.
        self.script_dir = os.path.dirname(os.path.abspath(__file__))
        self.project_root = os.path.abspath(os.path.join(self.script_dir, "../../"))

        if knowledge_base_path is None:
            knowledge_base_path = os.path.join(
                self.project_root, "data", "knowledge_base", "coping_strategies.json"
            )
        if chroma_db_dir is None:
            chroma_db_dir = os.path.join(self.project_root, "artifacts", "chroma_db")
        self.knowledge_base_path = knowledge_base_path
        self.chroma_db_dir = chroma_db_dir

        # Ensure the Chroma DB output folder exists
        os.makedirs(self.chroma_db_dir, exist_ok=True)

        # --- INITIALIZE CHROMADB ---
        # PersistentClient keeps the database on disk under chroma_db_dir, so
        # the data outlives this process.
        self.chroma_client = chromadb.PersistentClient(path=self.chroma_db_dir)

        # Embedding engine: a SentenceTransformer model (BAAI/bge-base-en-v1.5
        # by default) that turns each document into a vector.
        # NOTE(review): a collection that was already populated with another
        # model presumably cannot be mixed with this one - confirm before
        # changing model_name on an existing database.
        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

        # Create or load the target collection
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_fn
        )
        print(f"✅ Connected to ChromaDB at: {self.chroma_db_dir}")

    @staticmethod
    def _build_metadata(strategy):
        """Map one knowledge-base entry onto the flat metadata dict stored in ChromaDB."""
        metadata = {
            "emotion": strategy["primary_emotion"],
            "risk_level": strategy["target_risk_level"],
            "category": strategy["category"],
            "strategy": strategy["strategy_name"],
        }
        tags = strategy["tags"]
        # The tags are stored as one comma-separated string. A bare string is
        # kept as-is (joining it would split it into single characters) and
        # every list item is stringified so a stray number cannot break join().
        if isinstance(tags, str):
            metadata["tags"] = tags
        else:
            metadata["tags"] = ", ".join(str(tag) for tag in tags)
        return metadata

    def build_database(self):
        """
        Read the knowledge-base JSON and upsert its entries into the collection.

        Every entry must provide the keys ``content``, ``id``,
        ``primary_emotion``, ``target_risk_level``, ``category``,
        ``strategy_name`` and ``tags``.

        Raises:
            OSError: If the knowledge-base file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            TypeError: If the top-level JSON value is not a list.
            KeyError: If an entry lacks one of the required keys.
        """
        print(f"📖 Reading clinical data from: {self.knowledge_base_path}...")

        # 1. Read the JSON file
        with open(self.knowledge_base_path, 'r', encoding='utf-8') as file:
            cbt_data = json.load(file)

        if not isinstance(cbt_data, list):
            raise TypeError(
                f"Expected a JSON list of strategies in {self.knowledge_base_path}, "
                f"got {type(cbt_data).__name__}"
            )

        # An empty batch has nothing to embed; sending it on would only make
        # the upsert call fail its input validation.
        if not cbt_data:
            print("⚠️ Knowledge base is empty - nothing to embed.")
            return

        # 2. Prepare the parallel lists for ChromaDB insertion
        documents = [strategy["content"] for strategy in cbt_data]  # text the LLM will read
        metadatas = [self._build_metadata(strategy) for strategy in cbt_data]
        ids = [strategy["id"] for strategy in cbt_data]  # custom IDs taken from the JSON

        print("⚙️ Embedding text into mathematical vectors... (This may take a moment to download the model on the first run)")

        # 3. Inject into the Vector Database
        # Upsert means "Update or Insert": re-running the script refreshes
        # entries that share an ID instead of adding them a second time.
        self.collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        print(f"✅ Successfully embedded {len(documents)} clinical coping strategies into ChromaDB!")
        print("The RAG Knowledge Base is now primed and ready for the Retriever.")

# --- SCRIPT ENTRY POINT ---
# When the file is run directly (not imported), connect to ChromaDB and
# ingest the knowledge base in one go.
if __name__ == "__main__":
    MindGuardVectorDB().build_database()