File size: 4,274 Bytes
01d5a5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import chromadb
import os
import sys

# Add project root to path to import from lpm_kernel
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension, reinitialize_chroma_collections

def _missing_collection_errors():
    """Return the exception types that mean "this collection does not exist".

    ``get_collection`` has historically signalled a missing collection with
    ``ValueError``; some newer chromadb releases use dedicated classes from
    ``chromadb.errors`` instead. The classes are looked up with ``getattr`` so
    that whichever ones the installed version provides are honoured and the
    absent ones are simply skipped.
    """
    error_types = [ValueError]
    errors_module = getattr(chromadb, "errors", None)
    for error_name in ("InvalidCollectionException", "NotFoundError"):
        error_type = getattr(errors_module, error_name, None)
        if isinstance(error_type, type) and issubclass(error_type, Exception):
            error_types.append(error_type)
    return tuple(error_types)


def _collection_dimension(collection):
    """Return the ``dimension`` stored in *collection*'s metadata, or ``None``.

    ``collection.metadata`` may be ``None`` (collection created without
    metadata), so it is normalised to an empty dict before the lookup instead
    of raising ``AttributeError``.
    """
    return (collection.metadata or {}).get("dimension")


def init_chroma_db():
    """Ensure the ChromaDB store and its collections exist with the right dimension.

    Steps:
      1. Resolve the persistence directory from ``CHROMA_PERSIST_DIRECTORY``
         (default ``./data/chroma_db``) and create it if needed.
      2. Determine the embedding dimension from the user's LLM configuration,
         falling back to 1536 when no embedding model is configured or the
         lookup fails.
      3. Check the ``documents`` and ``document_chunks`` collections; if any
         existing one records a different dimension, call
         ``reinitialize_chroma_collections`` with the new dimension.
      4. Create whichever of the collections is still missing.

    Progress and problems are reported with ``print``. Errors raised while
    talking to ChromaDB are caught and reported rather than propagated, so the
    caller can continue; only ``os.makedirs`` runs outside a ``try`` block.

    Returns:
        None
    """
    chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")

    # Make sure the persistence directory exists before opening the client.
    os.makedirs(chroma_path, exist_ok=True)

    # Fallback used when no embedding model is configured (OpenAI dimension).
    default_dimension = 1536

    # Get embedding model dimension from user config
    try:
        user_llm_config = UserLLMConfigService().get_available_llm()

        if user_llm_config and user_llm_config.embedding_model_name:
            # Detect dimension based on model name
            dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
            print(f"Detected embedding dimension: {dimension} for model: {user_llm_config.embedding_model_name}")
        else:
            dimension = default_dimension
            print(f"No embedding model configured, using default dimension: {dimension}")
    except Exception as e:
        # Best effort: any failure in the config lookup falls back to the default.
        dimension = default_dimension
        print(f"Error detecting embedding dimension, using default: {dimension}. Error: {e}")

    try:
        client = chromadb.PersistentClient(path=chroma_path)
        missing_errors = _missing_collection_errors()
        collections_to_init = ["documents", "document_chunks"]
        dimension_mismatch_detected = False

        # Check all collections for dimension mismatches first
        for collection_name in collections_to_init:
            try:
                collection = client.get_collection(name=collection_name)
            except missing_errors:
                # Collection doesn't exist yet, will be created later
                continue

            print(f"Collection '{collection_name}' already exists")

            # Check if existing collection has the correct dimension
            existing_dimension = _collection_dimension(collection)
            if existing_dimension != dimension:
                print(f"Warning: Existing '{collection_name}' collection has dimension {existing_dimension}, but current model requires {dimension}")
                dimension_mismatch_detected = True

        # Handle dimension mismatch if detected in any collection
        if dimension_mismatch_detected:
            print("Automatically reinitializing ChromaDB collections with the new dimension...")
            if reinitialize_chroma_collections(dimension):
                print("Successfully reinitialized ChromaDB collections with the new dimension")
            else:
                # Report the directory actually in use, which may differ from the default.
                print(f"Failed to reinitialize ChromaDB collections, you may need to manually delete the {chroma_path} directory")

        # Create or get collections with the correct dimension
        for collection_name in collections_to_init:
            try:
                collection = client.get_collection(name=collection_name)
            except missing_errors:
                # Create collection if it doesn't exist
                client.create_collection(
                    name=collection_name,
                    metadata={
                        "hnsw:space": "cosine",
                        "dimension": dimension,
                    },
                )
                print(f"Successfully created collection '{collection_name}' with dimension {dimension}")
                continue

            # Verify dimension after possible reinitialization
            existing_dimension = _collection_dimension(collection)
            if existing_dimension != dimension:
                print(f"Error: Collection '{collection_name}' still has incorrect dimension after reinitialization: {existing_dimension} vs {dimension}")

        print(f"ChromaDB initialized at {chroma_path}")
    except Exception as e:
        # Deliberately not re-raised so the following startup steps still run;
        # per the original note, ChromaRepository will create collections if needed.
        print(f"An error occurred while initializing ChromaDB: {e}")

# Script entry point: run the ChromaDB initialisation only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    init_chroma_db()