|
|
""" |
|
|
Model loading and embedding interface for the Rabbinic embedding benchmark. |
|
|
|
|
|
Supports: |
|
|
- Curated models from Hugging Face (sentence-transformers) |
|
|
- Any Hugging Face sentence-transformer model |
|
|
- API-based models (OpenAI, Voyage AI, Google Gemini, Cohere)
|
|
""" |
|
|
|
|
|
import os |
|
|
from abc import ABC, abstractmethod |
|
|
from typing import Optional |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
# Registry of curated local (sentence-transformers) models, keyed by
# HuggingFace model ID. Each entry's fields:
#   name           - human-readable display name for UIs
#   description    - short blurb shown next to the name in dropdowns
#   type           - always "local" in this registry (loaded via sentence-transformers)
#   query_prefix   - string prepended to query-side texts (asymmetric models)
#   passage_prefix - string prepended to passage-side texts
#   max_length     - optional per-model override of the max sequence length
# Insertion order matters: it defines the order of UI dropdown choices.
CURATED_MODELS = {
    "intfloat/multilingual-e5-large": {
        "name": "Multilingual E5 Large",
        "description": "Strong multilingual model from Microsoft, 560M params",
        "type": "local",
        # E5 models are trained with these literal prefixes; omitting them
        # degrades retrieval quality.
        "query_prefix": "query: ",
        "passage_prefix": "passage: ",
    },
    "intfloat/multilingual-e5-base": {
        "name": "Multilingual E5 Base",
        "description": "Smaller multilingual E5, 278M params",
        "type": "local",
        "query_prefix": "query: ",
        "passage_prefix": "passage: ",
    },
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2": {
        "name": "Multilingual MPNet",
        "description": "Classic multilingual sentence transformer, 278M params",
        "type": "local",
        # Symmetric model: no prefixes needed.
        "query_prefix": "",
        "passage_prefix": "",
    },
    "BAAI/bge-m3": {
        "name": "BGE-M3",
        "description": "Multi-lingual, multi-functionality, multi-granularity model from BAAI",
        "type": "local",
        "query_prefix": "",
        "passage_prefix": "",
    },
    "intfloat/e5-mistral-7b-instruct": {
        "name": "E5 Mistral 7B",
        "description": "Large instruction-tuned embedding model, 7B params (requires GPU)",
        "type": "local",
        # Instruction-tuned: queries carry an instruction, passages do not.
        "query_prefix": "Instruct: Retrieve semantically similar text\nQuery: ",
        "passage_prefix": "",
    },
    "Alibaba-NLP/gte-multilingual-base": {
        "name": "GTE Multilingual Base",
        "description": "General Text Embeddings multilingual model from Alibaba",
        "type": "local",
        "query_prefix": "",
        "passage_prefix": "",
    },
    "google/embeddinggemma-300m": {
        "name": "EmbeddingGemma",
        "description": "Google's 300M param embedding model, 100+ languages, 768d (requires HF token + license)",
        "type": "local",
        "query_prefix": "task: search result | query: ",
        "passage_prefix": "title: none | text: ",
        # Supports longer inputs than the default 512-token cap.
        "max_length": 2048,
    },
}
|
|
|
|
|
|
|
|
# Registry of API-based (remote) embedding models, keyed by
# "<provider>/<model-or-variant>". Each entry's fields:
#   name        - human-readable display name for UIs
#   description - short blurb shown next to the name in dropdowns
#   type        - provider tag used for dispatch: "openai" | "voyage" | "gemini" | "cohere"
#   model_name  - the model name sent to the provider's API (variants of the
#                 same underlying model share one model_name)
#   dimensions  - embedding dimensionality expected from / requested of the API
# Insertion order matters: it defines the order of UI dropdown choices.
API_MODELS = {
    "openai/text-embedding-3-large": {
        "name": "OpenAI text-embedding-3-large",
        "description": "OpenAI's best embedding model, 3072 dimensions (API key required)",
        "type": "openai",
        "model_name": "text-embedding-3-large",
        "dimensions": 3072,
    },
    "openai/text-embedding-3-small": {
        "name": "OpenAI text-embedding-3-small",
        "description": "OpenAI's efficient embedding model, 1536 dimensions (API key required)",
        "type": "openai",
        "model_name": "text-embedding-3-small",
        "dimensions": 1536,
    },
    "openai/text-embedding-ada-002": {
        "name": "OpenAI Ada 002",
        "description": "OpenAI's legacy embedding model, 1536 dimensions (API key required)",
        "type": "openai",
        "model_name": "text-embedding-ada-002",
        "dimensions": 1536,
    },
    "voyage/voyage-3.5": {
        "name": "Voyage AI voyage-3.5",
        "description": "Voyage AI's latest embedding model (API key required)",
        "type": "voyage",
        "model_name": "voyage-3.5",
        "dimensions": 1024,
    },
    "voyage/voyage-3.5-lite": {
        "name": "Voyage AI voyage-3.5-lite",
        "description": "Voyage AI's efficient embedding model (API key required)",
        "type": "voyage",
        "model_name": "voyage-3.5-lite",
        "dimensions": 1024,
    },
    "voyage/voyage-3": {
        "name": "Voyage AI voyage-3",
        "description": "Voyage AI's general purpose embedding model (API key required)",
        "type": "voyage",
        "model_name": "voyage-3",
        "dimensions": 1024,
    },
    "voyage/voyage-3-lite": {
        "name": "Voyage AI voyage-3-lite",
        "description": "Voyage AI's lightweight embedding model (API key required)",
        "type": "voyage",
        "model_name": "voyage-3-lite",
        "dimensions": 512,
    },
    "voyage/voyage-multilingual-2": {
        "name": "Voyage AI voyage-multilingual-2",
        "description": "Voyage AI's multilingual embedding model, optimized for non-English (API key required)",
        "type": "voyage",
        "model_name": "voyage-multilingual-2",
        "dimensions": 1024,
    },
    # The three gemini entries below are dimension variants of one model:
    # same model_name, different requested output_dimensionality.
    "gemini/gemini-embedding-001": {
        "name": "Gemini Embedding 001",
        "description": "Google's Gemini embedding model, 3072 dimensions (API key required)",
        "type": "gemini",
        "model_name": "gemini-embedding-001",
        "dimensions": 3072,
    },
    "gemini/gemini-embedding-001-768": {
        "name": "Gemini Embedding 001 (768d)",
        "description": "Google's Gemini embedding model, 768 dimensions (API key required)",
        "type": "gemini",
        "model_name": "gemini-embedding-001",
        "dimensions": 768,
    },
    "gemini/gemini-embedding-001-1536": {
        "name": "Gemini Embedding 001 (1536d)",
        "description": "Google's Gemini embedding model, 1536 dimensions (API key required)",
        "type": "gemini",
        "model_name": "gemini-embedding-001",
        "dimensions": 1536,
    },
    "cohere/embed-multilingual-v3.0": {
        "name": "Cohere embed-multilingual-v3.0",
        "description": "Cohere's multilingual embedding model, 100+ languages (API key required)",
        "type": "cohere",
        "model_name": "embed-multilingual-v3.0",
        "dimensions": 1024,
    },
    "cohere/embed-multilingual-light-v3.0": {
        "name": "Cohere embed-multilingual-light-v3.0",
        "description": "Cohere's lightweight multilingual model (API key required)",
        "type": "cohere",
        "model_name": "embed-multilingual-light-v3.0",
        "dimensions": 384,
    },
}
|
|
|
|
|
|
|
|
# Combined registry of every known model (local + API) for unified lookup.
ALL_MODELS = {**CURATED_MODELS, **API_MODELS}
|
|
|
|
|
|
|
|
class BaseEmbeddingModel(ABC):
    """Abstract base class for embedding models.

    Subclasses must implement :meth:`encode` and the ``name`` /
    ``description`` properties, and populate ``model_id`` and
    ``embedding_dim``.
    """

    # HuggingFace-style or provider-prefixed identifier of the model.
    model_id: str
    # Dimensionality of the vectors produced by encode().
    embedding_dim: int

    @abstractmethod
    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 32,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """Encode texts to embeddings."""
        pass

    @property
    @abstractmethod
    def name(self) -> str:
        """Get display name for the model."""
        pass

    @property
    @abstractmethod
    def description(self) -> str:
        """Get description for the model."""
        pass

    def encode_pairs(
        self,
        he_texts: list[str],
        en_texts: list[str],
        batch_size: int = 32,
        show_progress: bool = True,
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Encode parallel Hebrew/English text pairs.

        The Hebrew/Aramaic side is embedded as queries and the English side
        as passages, so asymmetric models apply the appropriate prefixes or
        input types to each half.

        Args:
            he_texts: Hebrew/Aramaic source texts
            en_texts: English translations
            batch_size: Batch size for encoding
            show_progress: Whether to show progress bar

        Returns:
            Tuple of (hebrew_embeddings, english_embeddings)
        """
        shared = {"batch_size": batch_size, "show_progress": show_progress}
        hebrew_vectors = self.encode(he_texts, is_query=True, **shared)
        english_vectors = self.encode(en_texts, is_query=False, **shared)
        return hebrew_vectors, english_vectors
|
|
|
|
|
|
|
|
class EmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for sentence-transformer models with consistent interface.
    """

    def __init__(
        self,
        model_id: str,
        device: Optional[str] = None,
        max_length: int = 512,
        hf_token: Optional[str] = None,
    ):
        """
        Initialize the embedding model.

        Args:
            model_id: Hugging Face model ID
            device: Device to use ('cuda', 'cpu', or None for auto)
            max_length: Maximum sequence length for tokenization
            hf_token: HuggingFace token for gated models (or uses HF_TOKEN env var)
        """
        from sentence_transformers import SentenceTransformer
        import torch

        self.model_id = model_id

        # Auto-select GPU when available.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # Fall back to a generic prefix-free config for models outside the
        # curated list.
        self.config = CURATED_MODELS.get(model_id, {
            "name": model_id.split("/")[-1],
            "description": "Custom model",
            "type": "local",
            "query_prefix": "",
            "passage_prefix": "",
        })

        # Curated entries may override the caller-supplied max length.
        self.max_length = self.config.get("max_length", max_length)

        hf_token = hf_token or os.environ.get("HF_TOKEN")

        print(f"Loading model: {model_id} on {device}")

        # Only allow remote code execution for explicitly trusted publishers.
        # BUG FIX: "Alibaba-NLP/" added — the curated gte-multilingual-base
        # model uses custom modeling code and fails to load without
        # trust_remote_code=True.
        trusted_publishers = ["nvidia/", "google/", "Alibaba-NLP/"]
        trust_remote_code = any(model_id.startswith(pub) for pub in trusted_publishers)

        # fp16 on GPU to halve memory; default (fp32) weights on CPU.
        model_kwargs = {"torch_dtype": torch.float16} if device == "cuda" else {}
        self.model = SentenceTransformer(
            model_id,
            device=device,
            model_kwargs=model_kwargs,
            trust_remote_code=trust_remote_code,
            token=hf_token,
        )

        # Cap the model's sequence length at our configured maximum.
        if hasattr(self.model, "max_seq_length"):
            self.model.max_seq_length = min(self.max_length, self.model.max_seq_length)

        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f"Model loaded. Embedding dimension: {self.embedding_dim}")

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 32,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings.

        Args:
            texts: List of texts to encode
            is_query: Whether these are queries (vs passages) for asymmetric models
            batch_size: Batch size for encoding
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        # Asymmetric models (e.g. E5) need role-specific prefixes.
        prefix = self.config["query_prefix"] if is_query else self.config["passage_prefix"]
        if prefix:
            texts = [prefix + t for t in texts]

        return self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            normalize_embeddings=normalize,
            convert_to_numpy=True,
        )

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
|
|
|
|
|
|
|
|
class OpenAIEmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for OpenAI embedding API with consistent interface.
    """

    # Per-input token limit for OpenAI embedding endpoints.
    MAX_TOKENS = 8191

    def __init__(
        self,
        model_id: str,
        api_key: Optional[str] = None,
    ):
        """
        Initialize the OpenAI embedding model.

        Args:
            model_id: Model ID in format 'openai/model-name'
            api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
        """
        try:
            from openai import OpenAI
        except ImportError:
            raise ImportError(
                "OpenAI package not installed. Install with: pip install openai"
            )

        self.model_id = model_id

        api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenAI API key required. Set OPENAI_API_KEY environment variable "
                "or pass api_key parameter."
            )

        self.client = OpenAI(api_key=api_key)

        # Fall back to a generic config for unregistered OpenAI model IDs.
        self.config = API_MODELS.get(model_id, {
            "name": model_id,
            "description": "OpenAI embedding model",
            "type": "openai",
            "model_name": model_id.replace("openai/", ""),
            "dimensions": 1536,
        })

        self._model_name = self.config["model_name"]
        self.embedding_dim = self.config["dimensions"]

        # Tokenizer used for accurate truncation: prefer the model-specific
        # encoding, fall back to cl100k_base, then to character counting.
        self._encoding = None
        try:
            import tiktoken
            self._encoding = tiktoken.encoding_for_model(self._model_name)
        except Exception:
            try:
                import tiktoken
                self._encoding = tiktoken.get_encoding("cl100k_base")
            except Exception:
                print("Warning: tiktoken not available, using character-based truncation")

        print(f"Initialized OpenAI embedding model: {self._model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def _truncate_text(self, text: str) -> str:
        """Truncate text to fit within token limit."""
        if self._encoding is not None:
            # Exact truncation using the tokenizer.
            tokens = self._encoding.encode(text)
            if len(tokens) > self.MAX_TOKENS:
                tokens = tokens[:self.MAX_TOKENS]
                return self._encoding.decode(tokens)
            return text
        else:
            # Conservative character-based fallback (~3 chars/token assumed
            # — TODO confirm this is conservative enough for Hebrew text).
            max_chars = self.MAX_TOKENS * 3
            if len(text) > max_chars:
                return text[:max_chars]
            return text

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 100,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings using OpenAI API.

        Args:
            texts: List of texts to encode
            is_query: Not used for OpenAI (symmetric embeddings)
            batch_size: Batch size for API calls
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings (OpenAI already normalizes)

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        import time

        all_embeddings = []
        total_batches = (len(texts) + batch_size - 1) // batch_size

        for i in range(0, len(texts), batch_size):
            # BUG FIX: texts were previously sent untruncated, so inputs over
            # MAX_TOKENS caused hard API errors and _truncate_text was dead
            # code. Truncate every text before sending.
            batch = [self._truncate_text(t) for t in texts[i:i + batch_size]]
            batch_num = i // batch_size + 1

            if show_progress:
                print(f" Encoding batch {batch_num}/{total_batches}...")

            # Retry transient API failures with exponential backoff.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    response = self.client.embeddings.create(
                        model=self._model_name,
                        input=batch,
                    )
                    batch_embeddings = [item.embedding for item in response.data]
                    all_embeddings.extend(batch_embeddings)
                    break
                except Exception as e:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt
                        print(f" API error, retrying in {wait_time}s: {e}")
                        time.sleep(wait_time)
                    else:
                        raise RuntimeError(f"OpenAI API error after {max_retries} retries: {e}")

            # Small pause between batches to be gentle on rate limits.
            if i + batch_size < len(texts):
                time.sleep(0.1)

        embeddings = np.array(all_embeddings, dtype=np.float32)

        # Re-normalize defensively even though OpenAI returns unit vectors.
        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-10)

        return embeddings

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
|
|
|
|
|
|
|
|
class VoyageEmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for Voyage AI embedding API with consistent interface.
    """

    def __init__(
        self,
        model_id: str,
        api_key: Optional[str] = None,
    ):
        """
        Initialize the Voyage AI embedding model.

        Args:
            model_id: Model ID in format 'voyage/model-name'
            api_key: Voyage API key (or uses VOYAGE_API_KEY env var)
        """
        try:
            import voyageai
        except ImportError:
            raise ImportError(
                "Voyage AI package not installed. Install with: pip install voyageai"
            )

        self.model_id = model_id

        api_key = api_key or os.environ.get("VOYAGE_API_KEY")
        if not api_key:
            raise ValueError(
                "Voyage API key required. Set VOYAGE_API_KEY environment variable "
                "or pass api_key parameter."
            )

        self.client = voyageai.Client(api_key=api_key)

        # Unregistered IDs get a generic fallback config.
        fallback = {
            "name": model_id,
            "description": "Voyage AI embedding model",
            "type": "voyage",
            "model_name": model_id.replace("voyage/", ""),
            "dimensions": 1024,
        }
        self.config = API_MODELS.get(model_id, fallback)

        self._model_name = self.config["model_name"]
        self.embedding_dim = self.config["dimensions"]

        print(f"Initialized Voyage AI embedding model: {self._model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 128,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings using Voyage AI API.

        Args:
            texts: List of texts to encode
            is_query: Whether these are queries (Voyage supports input_type)
            batch_size: Batch size for API calls
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        import time

        # Voyage distinguishes query vs document embeddings via input_type.
        input_type = "query" if is_query else "document"

        chunks = [texts[pos:pos + batch_size] for pos in range(0, len(texts), batch_size)]
        total_batches = len(chunks)
        collected: list = []

        for chunk_idx, chunk in enumerate(chunks, start=1):
            if show_progress:
                print(f" Encoding batch {chunk_idx}/{total_batches}...")

            # Retry transient API failures with exponential backoff.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    result = self.client.embed(
                        chunk,
                        model=self._model_name,
                        input_type=input_type,
                    )
                    collected.extend(result.embeddings)
                    break
                except Exception as e:
                    if attempt >= max_retries - 1:
                        raise RuntimeError(f"Voyage AI API error after {max_retries} retries: {e}")
                    wait_time = 2 ** attempt
                    print(f" API error, retrying in {wait_time}s: {e}")
                    time.sleep(wait_time)

            # Gentle pacing between batches.
            if chunk_idx < total_batches:
                time.sleep(0.1)

        embeddings = np.array(collected, dtype=np.float32)

        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-10)

        return embeddings

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
|
|
|
|
|
|
|
|
class GeminiEmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for Google Gemini embedding API with consistent interface.
    """

    def __init__(
        self,
        model_id: str,
        api_key: Optional[str] = None,
    ):
        """
        Initialize the Gemini embedding model.

        Args:
            model_id: Model ID in format 'gemini/model-name'
            api_key: Gemini API key (optional - can use GEMINI_API_KEY env var
                or Google Cloud Application Default Credentials)
        """
        try:
            from google import genai
        except ImportError:
            raise ImportError(
                "Google GenAI package not installed. Install with: pip install google-genai"
            )

        self.model_id = model_id

        api_key = api_key or os.environ.get("GEMINI_API_KEY")

        # Without an explicit key, the client falls back to Application
        # Default Credentials.
        self.client = genai.Client(api_key=api_key) if api_key else genai.Client()

        # Unregistered IDs get a generic config; the "-768"/"-1536" suffixes
        # are this module's dimension-variant markers, not part of the real
        # API model name, so strip them.
        fallback = {
            "name": model_id,
            "description": "Gemini embedding model",
            "type": "gemini",
            "model_name": model_id.replace("gemini/", "").split("-768")[0].split("-1536")[0],
            "dimensions": 3072,
        }
        self.config = API_MODELS.get(model_id, fallback)

        self._model_name = self.config["model_name"]
        self.embedding_dim = self.config["dimensions"]

        print(f"Initialized Gemini embedding model: {self._model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 20,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings using Gemini API.

        Args:
            texts: List of texts to encode
            is_query: Whether these are queries (uses RETRIEVAL_QUERY vs RETRIEVAL_DOCUMENT)
            batch_size: Batch size for API calls (smaller for Gemini to avoid rate limits)
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        import time
        import random
        from google.genai import types

        # Task type steers Gemini's asymmetric retrieval embeddings.
        task_type = "RETRIEVAL_QUERY" if is_query else "RETRIEVAL_DOCUMENT"

        chunks = [texts[pos:pos + batch_size] for pos in range(0, len(texts), batch_size)]
        total_batches = len(chunks)
        collected: list = []

        # Gemini rate limits aggressively, so use more retries and jittered
        # exponential backoff.
        max_retries = 8
        base_delay = 2.0

        for chunk_idx, chunk in enumerate(chunks, start=1):
            if show_progress:
                print(f" Encoding batch {chunk_idx}/{total_batches}...")

            for attempt in range(max_retries):
                try:
                    request_config = types.EmbedContentConfig(
                        task_type=task_type,
                        output_dimensionality=self.embedding_dim,
                    )
                    result = self.client.models.embed_content(
                        model=self._model_name,
                        contents=chunk,
                        config=request_config,
                    )
                    collected.extend(e.values for e in result.embeddings)
                    break
                except Exception as e:
                    message = str(e)
                    rate_limited = "429" in message or "RESOURCE_EXHAUSTED" in message

                    if attempt >= max_retries - 1:
                        raise RuntimeError(f"Gemini API error after {max_retries} retries: {e}")

                    # Extra jitter when rate-limited to spread retries out.
                    if rate_limited:
                        wait_time = base_delay * (2 ** attempt) + random.uniform(1, 5)
                        print(f" Rate limited, waiting {wait_time:.1f}s before retry {attempt + 2}/{max_retries}...")
                    else:
                        wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
                        print(f" API error, retrying in {wait_time:.1f}s: {e}")
                    time.sleep(wait_time)

            # Longer pause between batches than other providers, again for
            # rate-limit pressure.
            if chunk_idx < total_batches:
                time.sleep(0.5)

        embeddings = np.array(collected, dtype=np.float32)

        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-10)

        return embeddings

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
|
|
|
|
|
|
|
|
class CohereEmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for Cohere embedding API with consistent interface.
    """

    def __init__(
        self,
        model_id: str,
        api_key: Optional[str] = None,
    ):
        """
        Initialize the Cohere embedding model.

        Args:
            model_id: Model ID in format 'cohere/model-name'
            api_key: Cohere API key (or uses COHERE_API_KEY env var)
        """
        try:
            import cohere
        except ImportError:
            raise ImportError(
                "Cohere package not installed. Install with: pip install cohere"
            )

        self.model_id = model_id

        api_key = api_key or os.environ.get("COHERE_API_KEY")
        if not api_key:
            raise ValueError(
                "Cohere API key required. Set COHERE_API_KEY environment variable "
                "or pass api_key parameter."
            )

        self.client = cohere.Client(api_key=api_key)

        # Unregistered IDs get a generic fallback config.
        fallback = {
            "name": model_id,
            "description": "Cohere embedding model",
            "type": "cohere",
            "model_name": model_id.replace("cohere/", ""),
            "dimensions": 1024,
        }
        self.config = API_MODELS.get(model_id, fallback)

        self._model_name = self.config["model_name"]
        self.embedding_dim = self.config["dimensions"]

        print(f"Initialized Cohere embedding model: {self._model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 96,
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings using Cohere API.

        Args:
            texts: List of texts to encode
            is_query: Whether these are queries (uses search_query vs search_document)
            batch_size: Batch size for API calls
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings

        Returns:
            numpy array of shape (len(texts), embedding_dim)
        """
        import time

        # Cohere distinguishes query vs document embeddings via input_type.
        input_type = "search_query" if is_query else "search_document"

        chunks = [texts[pos:pos + batch_size] for pos in range(0, len(texts), batch_size)]
        total_batches = len(chunks)
        collected: list = []

        for chunk_idx, chunk in enumerate(chunks, start=1):
            if show_progress:
                print(f" Encoding batch {chunk_idx}/{total_batches}...")

            # Retry transient API failures with exponential backoff.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    result = self.client.embed(
                        texts=chunk,
                        model=self._model_name,
                        input_type=input_type,
                    )
                    collected.extend(result.embeddings)
                    break
                except Exception as e:
                    if attempt >= max_retries - 1:
                        raise RuntimeError(f"Cohere API error after {max_retries} retries: {e}")
                    wait_time = 2 ** attempt
                    print(f" API error, retrying in {wait_time}s: {e}")
                    time.sleep(wait_time)

            # Gentle pacing between batches.
            if chunk_idx < total_batches:
                time.sleep(0.1)

        embeddings = np.array(collected, dtype=np.float32)

        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-10)

        return embeddings

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
|
|
|
|
|
|
|
|
def get_curated_model_choices() -> list[tuple[str, str]]:
    """
    Get list of curated local models for UI dropdown.

    Returns:
        List of (model_id, display_name) tuples
    """
    choices = []
    for mid, meta in CURATED_MODELS.items():
        choices.append((mid, f"{meta['name']} - {meta['description']}"))
    return choices
|
|
|
|
|
|
|
|
def get_api_model_choices() -> list[tuple[str, str]]:
    """
    Get list of API-based models for UI dropdown.

    Returns:
        List of (model_id, display_name) tuples
    """
    choices = []
    for mid, meta in API_MODELS.items():
        choices.append((mid, f"{meta['name']} - {meta['description']}"))
    return choices
|
|
|
|
|
|
|
|
def get_all_model_choices() -> list[tuple[str, str]]:
    """
    Get list of all models (local + API) for UI dropdown.

    Returns:
        List of (model_id, display_name) tuples
    """
    # Local models come first, matching the registries' ordering.
    return [*get_curated_model_choices(), *get_api_model_choices()]
|
|
|
|
|
|
|
|
def is_api_model(model_id: str) -> bool:
    """Check if a model ID is an API-based model."""
    model_id = model_id.strip()

    # Registered API models are always API-based.
    if model_id in API_MODELS:
        return True

    # Unregistered IDs still count as API models when they carry a known
    # provider prefix.
    return model_id.startswith(("openai/", "voyage/", "gemini/", "cohere/"))
|
|
|
|
|
|
|
|
def load_model(
    model_id: str,
    device: Optional[str] = None,
    api_key: Optional[str] = None,
    hf_token: Optional[str] = None,
) -> BaseEmbeddingModel:
    """
    Load an embedding model by ID.

    Args:
        model_id: Model ID (HuggingFace model ID or API model like 'openai/text-embedding-3-large')
        device: Device to use (for local models only)
        api_key: API key (for API-based models, or uses environment variable)
        hf_token: HuggingFace token for gated local models (or uses HF_TOKEN env var)

    Returns:
        Loaded embedding model instance
    """
    model_id = model_id.strip()

    # Anything that isn't an API model is loaded locally via
    # sentence-transformers.
    if not is_api_model(model_id):
        return EmbeddingModel(model_id, device=device, hf_token=hf_token)

    model_type = API_MODELS.get(model_id, {}).get("type", "")

    # Dispatch by registered type or provider prefix; order matches the
    # original checks (voyage, gemini, cohere, openai).
    providers = {
        "voyage": VoyageEmbeddingModel,
        "gemini": GeminiEmbeddingModel,
        "cohere": CohereEmbeddingModel,
        "openai": OpenAIEmbeddingModel,
    }
    for provider, wrapper_cls in providers.items():
        if model_type == provider or model_id.startswith(provider + "/"):
            return wrapper_cls(model_id, api_key=api_key)

    raise ValueError(f"Unknown API model type: {model_id}")
|
|
|
|
|
|
|
|
def validate_model_id(model_id: str) -> tuple[bool, str]:
    """
    Check if a model ID is valid and loadable.

    Args:
        model_id: The model ID to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    if not model_id or not model_id.strip():
        return False, "Model ID cannot be empty"

    model_id = model_id.strip()

    # Registered models and provider-prefixed API IDs are always accepted.
    registered = model_id in CURATED_MODELS or model_id in API_MODELS
    provider_prefixed = model_id.startswith(("openai/", "voyage/", "gemini/", "cohere/"))
    if registered or provider_prefixed:
        return True, ""

    # Anything else must at least look like a HuggingFace repo ID.
    if "/" not in model_id:
        return False, "Model ID should be in format 'organization/model-name'"

    return True, ""
|
|
|
|
|
|
|
|
def requires_api_key(model_id: str) -> bool:
    """Check if a model requires an API key."""
    # Every API-based (remote) model needs a key; local models never do.
    return is_api_model(model_id)
|
|
|
|
|
|
|
|
def api_key_optional(model_id: str) -> bool:
    """
    Check if an API key is optional for this model.

    Some providers (like Google Gemini) support Application Default Credentials
    as an alternative to explicit API keys.
    """
    # Only Gemini supports keyless (ADC) authentication in this module.
    return get_api_key_type(model_id) == "gemini"
|
|
|
|
|
|
|
|
def get_api_key_type(model_id: str) -> Optional[str]:
    """
    Get the provider whose API key a model requires.

    Args:
        model_id: The model ID

    Returns:
        One of 'openai', 'voyage', 'gemini', 'cohere', or None if no
        API key is needed (e.g. local models).
    """
    if not is_api_model(model_id):
        return None

    model_id = model_id.strip()
    model_config = API_MODELS.get(model_id, {})
    model_type = model_config.get("type", "")

    # A model maps to a provider either via its curated config "type" or via
    # an explicit "<provider>/" prefix in the ID. The tuple preserves the
    # original check order (voyage, gemini, cohere, openai).
    for provider in ("voyage", "gemini", "cohere", "openai"):
        if model_type == provider or model_id.startswith(f"{provider}/"):
            return provider

    return None
|
|
|
|
|
|
|
|
def get_api_key_env_var(model_id: str) -> Optional[str]:
    """
    Get the environment variable name for the API key required by a model.

    Args:
        model_id: The model ID

    Returns:
        Environment variable name or None
    """
    # Table lookup replaces the if/elif chain; dict.get returns None both
    # for an unknown provider and when no API key is needed at all.
    env_var_by_provider = {
        "openai": "OPENAI_API_KEY",
        "voyage": "VOYAGE_API_KEY",
        "gemini": "GEMINI_API_KEY",
        "cohere": "COHERE_API_KEY",
    }
    return env_var_by_provider.get(get_api_key_type(model_id))
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Test embedding model loading and encoding"
    )
    parser.add_argument(
        "--local",
        action="store_true",
        help="Test only local sentence-transformer models",
    )
    parser.add_argument(
        "--remote",
        action="store_true",
        help="Test only remote/API models (requires API keys)",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="Test a specific model ID",
    )

    args = parser.parse_args()

    # With neither flag given, test both categories. These are the
    # simplified equivalents of `args.local or (not args.local and not
    # args.remote)` and its mirror.
    test_local = args.local or not args.remote
    test_remote = args.remote or not args.local

    print("Testing model loading...")

    print("\nLocal models available:")
    for model_id, display in get_curated_model_choices():
        print(f"  - {display}")

    print("\nAPI models available:")
    for model_id, display in get_api_model_choices():
        print(f"  - {display}")

    # A semantically parallel Hebrew/English pair (Genesis 1:1) to sanity
    # check cross-lingual alignment.
    test_texts = [
        "讘专讗砖讬转 讘专讗 讗诇讛讬诐 讗转 讛砖诪讬诐 讜讗转 讛讗专抓",
        "In the beginning God created the heaven and the earth",
    ]

    def run_model_test(model_id: str, model_type: str) -> bool:
        """Load *model_id*, encode the test pair, and report similarity.

        Returns True on success, False if loading or encoding failed.
        """
        print(f"\n{'='*60}")
        print(f"Testing {model_type}: {model_id}")
        print("="*60)

        try:
            model = load_model(model_id)

            embeddings = model.encode(test_texts, show_progress=False)
            print(f"\nEncoded {len(test_texts)} texts")
            print(f"Embedding shape: {embeddings.shape}")

            # Normalize before the dot product so the printed value is a
            # true cosine similarity even for models whose embeddings are
            # not unit-length (a bare np.dot is cosine only for normalized
            # vectors).
            vec_a, vec_b = embeddings[0], embeddings[1]
            denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
            similarity = np.dot(vec_a, vec_b) / denom
            print(f"Cosine similarity between Hebrew and English: {similarity:.4f}")
            return True
        except Exception as e:
            # Best-effort smoke test: report the failure and let the caller
            # continue with any remaining models.
            print(f"Test failed: {e}")
            return False

    if args.model:
        run_model_test(args.model, "specified model")
    else:
        if test_local:
            run_model_test(
                "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
                "local sentence-transformer model"
            )

        if test_remote:
            # Each remote provider is exercised only when its key is set.
            if os.environ.get("OPENAI_API_KEY"):
                run_model_test(
                    "openai/text-embedding-3-small",
                    "OpenAI API model"
                )
            else:
                print("\n(Skipping OpenAI test - OPENAI_API_KEY not set)")

            if os.environ.get("VOYAGE_API_KEY"):
                run_model_test(
                    "voyage/voyage-3.5",
                    "Voyage AI API model"
                )
            else:
                print("\n(Skipping Voyage AI test - VOYAGE_API_KEY not set)")

            if os.environ.get("GEMINI_API_KEY"):
                run_model_test(
                    "gemini/gemini-embedding-001",
                    "Gemini API model"
                )
            else:
                print("\n(Skipping Gemini test - GEMINI_API_KEY not set)")