Spaces:
Sleeping
Sleeping
Gemini
fix: Explicitly disable ChromaDB telemetry. Set anonymized_telemetry=False in the ChromaDB client initialization in both chroma_utils.py and vector_repository.py to resolve persistent posthog errors.
fa21e69
| from typing import Optional, Dict, Any, List, Tuple | |
| import os | |
| import chromadb | |
| from chromadb.config import Settings | |
| import logging | |
| from lpm_kernel.configs.logging import get_train_process_logger | |
| logger = get_train_process_logger() | |
def get_embedding_dimension(embedding: List[float]) -> int:
    """
    Report how many components an embedding vector has.

    Args:
        embedding: The embedding vector to measure.

    Returns:
        The number of elements in ``embedding``.
    """
    dimension = len(embedding)
    return dimension
def detect_embedding_model_dimension(model_name: str) -> Optional[int]:
    """
    Detect the dimension of an embedding model based on its name.

    This is a fallback method for when a sample embedding is not available.
    Matching ignores letter case and surrounding whitespace. An exact match
    against the known-model table is tried first; otherwise the longest known
    model name contained in ``model_name`` wins, so tagged or prefixed names
    such as ``"nomic-embed-text:latest"`` still resolve.

    Args:
        model_name: The name of the embedding model.

    Returns:
        The dimension of the embedding model. Unknown, empty or non-string
        names fall back to 1536 (the OpenAI default) and a warning is logged,
        so this implementation never actually returns None; the ``Optional``
        annotation is kept only for compatibility with existing callers.
    """
    default_dimension = 1536

    # Common embedding model dimensions (keys are lowercase)
    model_dimensions = {
        # OpenAI models
        "text-embedding-ada-002": 1536,
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        # Ollama models
        "snowflake-arctic-embed": 768,
        "snowflake-arctic-embed:110m": 768,
        "nomic-embed-text": 768,
        "nomic-embed-text:v1.5": 768,
        "mxbai-embed-large": 1024,
        "mxbai-embed-large:v1": 1024,
    }

    # Guard against None / non-string input instead of raising TypeError in
    # the substring checks below.
    normalized = model_name.strip().lower() if isinstance(model_name, str) else ""

    if normalized:
        # Try to find exact match
        if normalized in model_dimensions:
            return model_dimensions[normalized]

        # Try to find partial match; check the longest keys first so the most
        # specific known name wins regardless of dict ordering.
        for known_name in sorted(model_dimensions, key=len, reverse=True):
            if known_name in normalized:
                return model_dimensions[known_name]

    # Default to OpenAI dimension if unknown
    logger.warning(
        f"Unknown embedding model: {model_name}, defaulting to {default_dimension} dimensions"
    )
    return default_dimension
def _collection_exists(client, name: str) -> bool:
    """Return True if ``client`` already holds a collection called ``name``."""
    # Depending on the ChromaDB release, list_collections() yields either
    # Collection objects or plain name strings; normalise both to a name.
    for entry in client.list_collections():
        entry_name = entry if isinstance(entry, str) else entry.name
        if entry_name == name:
            return True
    return False


def _recreate_collection(client, name: str, dimension: int) -> bool:
    """Drop collection ``name`` if present, then create it with ``dimension`` in its metadata."""
    try:
        # Check if collection exists before attempting to delete. Using the
        # collection listing avoids depending on which exception type
        # get_collection() raises for a missing collection.
        if _collection_exists(client, name):
            client.delete_collection(name=name)
            logger.info(f"Deleted '{name}' collection")
        else:
            logger.info(f"'{name}' collection does not exist, will create new")
    except Exception as e:
        logger.error(f"Error deleting '{name}' collection: {str(e)}", exc_info=True)
        return False

    try:
        client.create_collection(
            name=name,
            metadata={
                "hnsw:space": "cosine",
                "dimension": dimension,
            },
        )
        logger.info(f"Created '{name}' collection with dimension {dimension}")
    except Exception as e:
        logger.error(f"Error creating '{name}' collection: {str(e)}", exc_info=True)
        return False

    return True


def reinitialize_chroma_collections(dimension: int = 1536) -> bool:
    """
    Reinitialize ChromaDB collections with a new dimension.

    Opens the persistent ChromaDB store at ``CHROMA_PERSIST_DIRECTORY``
    (default ``./data/chroma_db``) with anonymized telemetry disabled, then
    deletes (if present) and recreates the ``documents`` and
    ``document_chunks`` collections. Each collection is created with cosine
    distance (``hnsw:space``) and ``dimension`` recorded in its metadata.
    Finally both collections are read back and their recorded dimension is
    compared with the requested one.

    Note: existing data in both collections is discarded.

    Args:
        dimension: The new dimension for the collections.

    Returns:
        True if successful, False otherwise. Failures are logged and never
        raised to the caller.
    """
    collection_names = ("documents", "document_chunks")

    try:
        chroma_path = os.getenv("CHROMA_PERSIST_DIRECTORY", "./data/chroma_db")
        settings = Settings(anonymized_telemetry=False)
        client = chromadb.PersistentClient(path=chroma_path, settings=settings)

        # Delete and recreate each collection; stop at the first failure.
        for name in collection_names:
            if not _recreate_collection(client, name, dimension):
                return False

        # Verify collections were created with correct dimension
        try:
            # Fetch every collection before checking any of them, so a missing
            # collection is reported as a verification error.
            collections = [
                (name, client.get_collection(name=name)) for name in collection_names
            ]
            for name, collection in collections:
                # metadata may be None if the backend stored none; treat that
                # as "no dimension recorded" rather than crashing.
                actual_dimension = (collection.metadata or {}).get("dimension")
                if actual_dimension != dimension:
                    logger.error(
                        f"Verification failed: '{name}' collection has incorrect dimension: {actual_dimension} vs {dimension}"
                    )
                    return False
            logger.info(f"Verification successful: Both collections have correct dimension: {dimension}")
        except Exception as e:
            logger.error(f"Error verifying collections: {str(e)}", exc_info=True)
            return False

        return True
    except Exception as e:
        logger.error(f"Error reinitializing ChromaDB collections: {str(e)}", exc_info=True)
        return False