# utils/chroma_utils.py
"""
ChromaDBManager: A utility class for managing ChromaDB configurations, embeddings, and logging.
This module provides an interface to configure and interact with a ChromaDB document store
using Hugging Face embedding models. It supports persistent storage configuration, model selection,
and logging setup. Designed for integration in local development environments or cloud deployments.
Design Assumptions:
- Configuration values (e.g., DB path, collection name) are loaded from a `.env` file.
- Hugging Face API keys are stored securely in the macOS keychain.
- Logging is configured per class and can output to both the console and file.
- Embedding models are specified via a `models.txt` file, which is automatically created if missing.
- The default models support a range of needs: small, medium, multilingual, and e5 variants.
Core Logic:
- Loads config values using `dotenv_values`.
- Ensures the persistence path exists or is created.
- Initializes a ChromaDB instance with the chosen embedding model.
- Logs system and model configuration for traceability.
- Supports both local and remote Hugging Face embeddings.
Instructions for Use:
1. Create a `.env` file in the repo root with keys like:
CHROMA_DB_PATH=data/chroma_db
CHROMA_DB_COLLECTION=documents
2. Ensure your Hugging Face API key is stored in your keyring under the appropriate label.
3. Run the script using the CLI, or import `ChromaDBManager` into your application.
4. Optionally, configure the logging level via the YAML config file
(config/chroma_config.yml, key: log_level).
Important:
To test the module, run it from the root directory using the `-m` flag:
python -m utils.chroma_utils
Do not run the script directly from an IDE or its file path, or relative paths and module imports may break.
Attributes:
db_path (str): Path to the local ChromaDB storage.
collection_name (str): Default ChromaDB collection name.
model_mapping (dict): Maps user-friendly model names to Hugging Face model IDs.
"""
# Standard library
import hashlib
import json
import logging
import os
import platform
import re
import sys
import warnings
from pathlib import Path

# 3rd-party libraries
import chromadb
import keyring
import numpy as np
import yaml
from dotenv import dotenv_values
from huggingface_hub import login
from sentence_transformers import SentenceTransformer

# Project utilities (absolute imports)
from utils.metadata_utils import enhance_metadata
from utils.logging_utils import setup_logging
# Silence FutureWarning noise emitted by third-party libraries at import time.
warnings.filterwarnings("ignore", category=FutureWarning)
# Module-level logger; writes to a log file named after this source file
# (project `setup_logging` helper configures handlers/format).
logger = setup_logging(
    logger_name=__name__,
    log_filename=f"{Path(__file__).stem}.log"
)
class ChromaDBManager:
    """Manage a persistent ChromaDB document store backed by a SentenceTransformer.

    Responsibilities:
    - Load configuration (DB path, collection name) from a `.env` file.
    - Authenticate with Hugging Face (env var, macOS keyring, or keys file).
    - Resolve and load the embedding model from a JSON mapping file.
    - Provide add/query helpers over the underlying ChromaDB collection.

    The class is a process-wide singleton: `__new__` always returns the same
    instance, and the `_initialized` flag makes repeated `__init__` calls no-ops.
    """

    _instance = None  # singleton storage used by __new__

    def __new__(cls, *args, **kwargs):
        # The original code kept `_instance` and an `_initialized` guard but never
        # overrode __new__, so every construction still built a fresh object and
        # the guard could never fire. This makes the intended singleton work.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, model_size="medium", keys_file="keys.txt", models_file="models.txt",
                 dataset_repo=None, db_path=None):
        """Initialize the manager (no-op on every construction after the first).

        Args:
            model_size (str): Key into the model mapping ("small", "medium", ...).
            keys_file (str): Hugging Face keys file name (resolved under config/).
            models_file (str): Model-mapping JSON file name (under config/).
            dataset_repo (str, optional): HF dataset repo; falls back to HF_DATASET_REPO.
            db_path (str, optional): Override for CHROMA_DB_PATH from .env.
        """
        if getattr(self, "_initialized", False):
            return
        self.logger = setup_logging(
            logger_name="ChromaDBManager",
            log_filename="ChromaDBManager.log",
        )
        self.logger.info("Initializing ChromaDBManager")
        # Explicit constructor arguments win over .env values.
        env = dotenv_values(".env")
        self.db_path = db_path or env.get("CHROMA_DB_PATH", "data/chroma_db")
        self.collection_name = env.get("CHROMA_DB_COLLECTION", "documents")
        self.dataset_repo = dataset_repo or env.get("HF_DATASET_REPO")
        self.logger.info(f"Using Chroma DB path: {self.db_path}")
        self.logger.info(f"Default collection: {self.collection_name}")
        if self.dataset_repo:
            self.logger.info(f"Using dataset repository: {self.dataset_repo}")
        self._authenticate_huggingface(keys_file)
        self._ensure_db_directory_exists()
        self._load_and_initialize_model(model_size, models_file)
        self.client = chromadb.PersistentClient(path=str(self.db_path))
        self.collection = self.client.get_or_create_collection(name=self.collection_name)
        self.logger.info(f"ChromaDBManager initialized with model: {self.model_name}")
        self._initialized = True

    def _load_repo_configuration(self):
        """Load DB path/collection from .env, resolving the path against the repo root.

        Returns:
            dict: ``{"db_path": str, "custom_settings": {"default_collection": str}}``

        NOTE(review): not called from __init__ (which resolves db_path relative to
        the CWD instead); kept for external callers.
        """
        config = dotenv_values(".env")
        relative_path = config.get("CHROMA_DB_PATH", "data/chroma_db")
        # Project root is the parent of the utils/ directory containing this file.
        project_dir = Path(__file__).resolve().parent.parent
        self.db_path = project_dir / relative_path
        self.collection_name = config.get("CHROMA_DB_COLLECTION", "documents")
        self.logger.info(f"Using Chroma DB path from .env: {self.db_path}")
        self.logger.info(f"Using default collection: {self.collection_name}")
        # Best-effort: make sure the persistence directory exists.
        try:
            os.makedirs(self.db_path, exist_ok=True)
        except Exception as e:
            self.logger.warning(f"Failed to ensure DB path exists: {e}")
        return {
            "db_path": str(self.db_path),
            "custom_settings": {
                "default_collection": self.collection_name,
            },
        }

    def _authenticate_huggingface(self, keys_file=None):
        """Authenticate with Hugging Face.

        Token sources, in priority order:
        1. Environment variables HF_API_KEY / HF_TOKEN / HUGGINGFACE_TOKEN
        2. macOS keyring (service "HF_API_KEY", username "rressler")
        3. A local keys file (resolved under config/ when the path is relative)

        Returns:
            bool: True on successful login, False otherwise.
        """
        # Always leave self.hf_token defined: the original only assigned it on the
        # success path or inside the outer except, so the "no token found" path
        # could leave the attribute missing entirely.
        self.hf_token = None
        try:
            token = (
                os.environ.get("HF_API_KEY")
                or os.environ.get("HF_TOKEN")
                or os.environ.get("HUGGINGFACE_TOKEN")
            )
            # Keyring lookup is attempted only on macOS.
            if not token and platform.system() == "Darwin":
                try:
                    token = keyring.get_password("HF_API_KEY", "rressler")
                    if token:
                        self.logger.info("Using Hugging Face API key from macOS keyring")
                except Exception as e:
                    self.logger.warning(f"Keyring access failed: {e}")
            # Fall back to a keys file (default location: <root>/config/<name>).
            if not token and keys_file:
                try:
                    keys_path = Path(keys_file)
                    if not keys_path.is_absolute():
                        keys_path = Path(__file__).resolve().parent.parent / "config" / keys_path.name
                    if keys_path.exists():
                        with open(keys_path, "r") as f:
                            for line in f:
                                if line.strip().startswith("HF_API_KEY="):
                                    _, token = line.strip().split("=", 1)
                                    token = token.strip()
                                    # Skip the template placeholder value.
                                    if token and token != "your_api_key":
                                        self.logger.info("Using Hugging Face API key from keys file")
                                        break
                except Exception as e:
                    self.logger.warning(f"Error reading keys file: {e}")
            if token:
                try:
                    login(token=token)
                    self.hf_token = token
                    self.logger.info("Hugging Face authentication successful")
                    return True
                except Exception as e:
                    self.logger.error(f"Failed to authenticate with Hugging Face: {e}")
            else:
                self.logger.warning("No Hugging Face API token available from any source")
        except Exception as e:
            self.logger.error(f"Unexpected error during authentication: {e}")
        return False

    def _ensure_db_directory_exists(self):
        """Create the database directory if it does not already exist."""
        if os.path.exists(self.db_path):
            self.logger.info(f"Database directory already exists at: {self.db_path}")
            return
        try:
            os.makedirs(self.db_path, exist_ok=True)
            self.logger.info(f"Created database directory at: {self.db_path}")
        except Exception as e:
            self.logger.error(f"Error creating database directory: {e}")
            raise

    def _load_model_mapping(self, models_file="models.txt"):
        """Load the model-size → HF-model-id mapping from a JSON file.

        Creates config/<models_file> seeded with the defaults when missing and
        falls back to the defaults on any read/parse error.

        Args:
            models_file (str): File name inside the project's config/ directory.

        Returns:
            dict: Mapping of size keys to Hugging Face model identifiers.
        """
        default_mapping = {
            "small": "sentence-transformers/all-MiniLM-L6-v2",
            "medium": "sentence-transformers/all-mpnet-base-v2",
            "large": "sentence-transformers/all-roberta-large-v1",
            "multilingual": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "e5": "intfloat/e5-large-v2",
        }
        # (Removed a dead `Print(...)` call that followed the final return: `Print`
        # is undefined and `e` was out of scope — it could never execute.)
        try:
            project_root = Path(__file__).resolve().parent.parent
            config_dir = project_root / "config"
            config_dir.mkdir(parents=True, exist_ok=True)
            models_path = config_dir / models_file
            if not models_path.exists():
                # Seed a template file so users can customise the mapping later.
                with models_path.open("w") as f:
                    json.dump(default_mapping, f, indent=2)
                self.logger.info(f"Template models file created at {models_path}")
                return default_mapping
            with models_path.open("r") as f:
                model_mapping = json.load(f)
            if isinstance(model_mapping, dict) and model_mapping:
                self.logger.info(f"Loaded {len(model_mapping)} models from {models_path}")
                return model_mapping
            self.logger.warning(f"Invalid or empty model mapping in {models_path}. Using defaults.")
            return default_mapping
        except Exception as e:
            self.logger.error(f"Failed to load model mapping: {e}. Using defaults.")
            return default_mapping

    def _load_and_initialize_model(self, model_size="medium", models_file="models.txt"):
        """Resolve *model_size* via the mapping file and load the SentenceTransformer.

        Falls back to "medium" for unknown sizes, and ultimately to
        all-MiniLM-L6-v2 if model loading fails for any reason.

        Args:
            model_size (str): Size key into the model mapping.
            models_file (str): Model-mapping file name.
        """
        try:
            model_mapping = self._load_model_mapping(models_file)
            if model_size not in model_mapping:
                self.logger.warning(f"Model size '{model_size}' not found. Falling back to 'medium'.")
                model_size = "medium"
            model_path = model_mapping[model_size]
            self.model_name = model_path  # kept for logging/inspection
            self.logger.info(f"Initializing embedding model: {model_path}")
            self.model = SentenceTransformer(model_path)
            if hasattr(self.model, "get_sentence_embedding_dimension"):
                self.logger.info(
                    f"Model embedding dimension: {self.model.get_sentence_embedding_dimension()}"
                )
        except Exception as e:
            self.logger.error(f"Error initializing embedding model: {e}")
            self.logger.warning("Falling back to default small model")
            default_model = "sentence-transformers/all-MiniLM-L6-v2"
            self.model = SentenceTransformer(default_model)
            self.model_name = default_model

    def generate_valid_id(self, text):
        """Sanitize *text* into a short ID: strip punctuation, spaces → "_", max 20 chars."""
        if text is None:
            text = "untitled"
        sanitized = re.sub(r"[^\w\s]", "", str(text))
        return sanitized.replace(" ", "_")[:20]

    def get_collection(self, name="documents"):
        """Get or create a collection by name, printing a short status report."""
        try:
            collection = self.client.get_or_create_collection(name=name)
            print(f"✅ Collection '{name}' successfully loaded.")
            # count() instead of len(collection.get()['documents']): the original
            # fetched every stored document just to count them.
            print(f"📄 Number of docs: {collection.count()}")
            return collection
        except Exception as e:
            self.logger.error(f"Error getting/creating collection {name}: {e}")
            raise

    def embed_text(self, text):
        """Return the embedding vector (list of floats) for *text* (None → "")."""
        try:
            if text is None:
                text = ""
            return self.model.encode(str(text)).tolist()
        except Exception as e:
            self.logger.error(f"Error generating embeddings: {e}")
            raise

    def add_document(self, text, metadata, doc_id=None, collection_name="documents"):
        """Add a document to the specified collection with enhanced metadata.

        Args:
            text (str): Document text content to embed and store.
            metadata (dict): Metadata associated with the document.
            doc_id (str, optional): Document ID; derived from title + content hash
                when not provided.
            collection_name (str, optional): Target collection name.

        Returns:
            str: Document ID of the added document.

        Raises:
            ValueError: If *text* is None or blank (original raised AttributeError
                for None).
        """
        if text is None or not str(text).strip():
            raise ValueError("Cannot add an empty document.")
        collection = self.get_collection(collection_name)
        embedding = self.embed_text(text)
        title = metadata.get("title", "untitled")
        base_id = self.generate_valid_id(title)
        if doc_id is None:
            # hashlib, not built-in hash(): hash() is salted per interpreter run
            # (PYTHONHASHSEED), so re-adding the same text in a new process got a
            # different ID and duplicated the document instead of upserting it.
            digest = int(hashlib.md5(str(text).encode("utf-8")).hexdigest(), 16)
            doc_id = f"{base_id}_{digest % 10000}"
        enhanced_metadata = enhance_metadata(metadata)
        self.logger.debug(f"Enhanced metadata for document '{base_id}': {enhanced_metadata}")
        collection.upsert(
            documents=[text],
            embeddings=[embedding],
            metadatas=[enhanced_metadata],
            ids=[doc_id],
        )
        self.logger.info(f"Added document to '{collection_name}' with ID: {doc_id} | Title: {title}")
        return doc_id

    def query(self, query_text, n_results=5, collection_name="documents"):
        """Embed *query_text* and return the top *n_results* matches."""
        collection = self.get_collection(collection_name)
        query_embedding = self.embed_text(query_text)
        return collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
        )
# Create a singleton instance
# NOTE(review): constructed at import time — importing this module triggers
# HF authentication, model download, and DB directory creation as side effects.
chroma_manager = ChromaDBManager()
# Convenience functions
def get_chroma_manager(model_size="medium", keys_file="keys.txt", models_file="models.txt", db_path=None):
    """Return the cached ChromaDBManager, building it on first call.

    The instance is memoised on the function object itself; every later call
    hands back that same instance and silently ignores the arguments.

    Args:
        model_size (str, optional): Size of the embedding model.
        keys_file (str, optional): Path to the keys file.
        models_file (str, optional): Path to the models mapping file.
        db_path (str, optional): Path to the ChromaDB database.
    """
    cached = getattr(get_chroma_manager, "_instance", None)
    if cached is not None:
        return cached
    get_chroma_manager._instance = ChromaDBManager(
        model_size=model_size,
        keys_file=keys_file,
        models_file=models_file,
        db_path=db_path,
    )
    return get_chroma_manager._instance
def query_documents(query_text, n_results=3):
    """Module-level shortcut: similarity-query the default collection via the singleton."""
    return chroma_manager.query(query_text, n_results)
def add_document(text, metadata, doc_id=None):
    """Module-level shortcut: store *text* with *metadata* in the default collection."""
    return chroma_manager.add_document(text, metadata, doc_id)
def setup_logger(level=logging.INFO):
    """Configure root logging once; on later calls only adjust the level.

    If the root logger already has handlers (configured elsewhere), just update
    its level; otherwise install the standard basicConfig handler and format.
    """
    root = logging.getLogger()
    if root.handlers:
        # Logging already set up elsewhere — only change verbosity.
        root.setLevel(level)
    else:
        logging.basicConfig(
            level=level,
            format="%(asctime)s [%(levelname)s] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
def init_chroma_db_manager(config: dict) -> ChromaDBManager:
    """Build a ChromaDBManager from a YAML-style config dict.

    Recognised keys: "model", "keys_file", "models_file", "repo" (optional HF
    dataset repo) and "db_path" (optional .env override). Missing keys fall
    back to the same defaults as the constructor.
    """
    kwargs = {
        "model_size": config.get("model", "medium"),
        "keys_file": config.get("keys_file", "keys.txt"),
        "models_file": config.get("models_file", "models.txt"),
        "dataset_repo": config.get("repo"),
        "db_path": config.get("db_path"),
    }
    return ChromaDBManager(**kwargs)
def main():
    """CLI entry point: read the log level from config/chroma_config.yml and boot the manager."""
    config_path = "config/chroma_config.yml"
    log_level = 'INFO'  # used when the YAML file is missing or unreadable
    try:
        with open(config_path, 'r') as fh:
            cfg = yaml.safe_load(fh) or {}
        log_level = cfg.get('log_level', 'INFO').upper()
    except Exception as e:
        print(f"Could not load {config_path}, using default log level {log_level}: {e}")
    # Set up logging (logging accepts level names as strings).
    setup_logger(level=log_level)
    # Initialize ChromaDBManager (no CLI args, configuration file only).
    chroma = ChromaDBManager()
    print("ChromaDBManager initialized successfully.")


if __name__ == "__main__":
    main()