secondme-api / docker /app /init_chroma.py
Gemini
feat: add detailed logging
01d5a5d
import chromadb
import os
import sys
# Add project root to path to import from lpm_kernel
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from lpm_kernel.api.services.user_llm_config_service import UserLLMConfigService
from lpm_kernel.file_data.chroma_utils import detect_embedding_model_dimension, reinitialize_chroma_collections
def _resolve_embedding_dimension():
    """Return the embedding dimension to use for the Chroma collections.

    Asks UserLLMConfigService for the available LLM config and, when it names
    an embedding model, passes that name to detect_embedding_model_dimension.
    Falls back to 1536 when no embedding model is configured or when any step
    raises.
    """
    # Default to OpenAI dimension if no config is found or an error occurs
    default_dimension = 1536
    try:
        user_llm_config = UserLLMConfigService().get_available_llm()
        if user_llm_config and user_llm_config.embedding_model_name:
            # Detect dimension based on model name
            dimension = detect_embedding_model_dimension(user_llm_config.embedding_model_name)
            print(f"Detected embedding dimension: {dimension} for model: {user_llm_config.embedding_model_name}")
            return dimension
        print(f"No embedding model configured, using default dimension: {default_dimension}")
    except Exception as e:
        # Best effort: a broken config must not prevent initialization
        print(f"Error detecting embedding dimension, using default: {default_dimension}. Error: {e}")
    return default_dimension


def _stored_dimension(collection):
    """Return the "dimension" entry of a collection's metadata, or None if absent."""
    # A collection created without metadata reports metadata as None, so guard
    # before calling .get(); a missing value is then treated as a mismatch,
    # exactly like metadata that lacks the "dimension" key.
    return (collection.metadata or {}).get("dimension")


def init_chroma_db() -> None:
    """Ensure the ChromaDB persist directory and its collections exist.

    Steps:
      1. Read the persist directory from CHROMA_PERSIST_DIRECTORY (default
         "./data/chroma_db") and create it if needed.
      2. Resolve the embedding dimension for the configured model.
      3. Compare the "dimension" metadata of the existing "documents" and
         "document_chunks" collections against it; on any mismatch, call
         reinitialize_chroma_collections(dimension) and report whether it
         returned a truthy value.
      4. Create whichever of the two collections does not exist, using cosine
         space and the resolved dimension.

    Progress and problems are reported with print(). Errors raised while
    talking to Chroma are printed and swallowed; only a failure to create the
    persist directory propagates to the caller.
    """
    chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
    # ensure the directory is correct
    os.makedirs(chroma_path, exist_ok=True)

    # Get embedding model dimension from user config
    dimension = _resolve_embedding_dimension()

    try:
        client = chromadb.PersistentClient(path=chroma_path)
        collections_to_init = ["documents", "document_chunks"]
        dimension_mismatch_detected = False

        # Check all collections for dimension mismatches first
        for collection_name in collections_to_init:
            try:
                collection = client.get_collection(name=collection_name)
            except ValueError:
                # Collection doesn't exist yet, will be created later.
                # NOTE(review): the exception type for a missing collection
                # depends on the chromadb version - confirm against the pinned
                # release. Any other type falls through to the outer handler.
                continue
            print(f"Collection '{collection_name}' already exists")
            # Check if existing collection has the correct dimension
            stored = _stored_dimension(collection)
            if stored != dimension:
                print(f"Warning: Existing '{collection_name}' collection has dimension {stored}, but current model requires {dimension}")
                dimension_mismatch_detected = True

        # Handle dimension mismatch if detected in any collection
        if dimension_mismatch_detected:
            print("Automatically reinitializing ChromaDB collections with the new dimension...")
            if reinitialize_chroma_collections(dimension):
                print("Successfully reinitialized ChromaDB collections with the new dimension")
            else:
                print("Failed to reinitialize ChromaDB collections, you may need to manually delete the data/chroma_db directory")

        # Create or get collections with the correct dimension
        for collection_name in collections_to_init:
            try:
                collection = client.get_collection(name=collection_name)
            except ValueError:
                # Create collection if it doesn't exist
                client.create_collection(
                    name=collection_name,
                    metadata={
                        "hnsw:space": "cosine",
                        "dimension": dimension,
                    },
                )
                print(f"Successfully created collection '{collection_name}' with dimension {dimension}")
            else:
                # Verify dimension after possible reinitialization
                stored = _stored_dimension(collection)
                if stored != dimension:
                    print(f"Error: Collection '{collection_name}' still has incorrect dimension after reinitialization: {stored} vs {dimension}")

        print(f"ChromaDB initialized at {chroma_path}")
    except Exception as e:
        print(f"An error occurred while initializing ChromaDB: {e}")
        # no exception for following process
        # ChromaRepository will create collection if needed
# Script entry point: run the initialization only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    init_chroma_db()