Spaces:
Sleeping
Sleeping
Lev Israel committed on
Commit ·
1787d7f
1
Parent(s): e962d38
Support Cohere
Browse files- .claude/settings.local.json +11 -1
- app.py +3 -0
- models.py +163 -3
- requirements.txt +1 -0
.claude/settings.local.json
CHANGED
|
@@ -2,7 +2,17 @@
|
|
| 2 |
"permissions": {
|
| 3 |
"allow": [
|
| 4 |
"Bash(python -m py_compile:*)",
|
| 5 |
-
"Bash(python:*)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
]
|
| 7 |
}
|
| 8 |
}
|
|
|
|
| 2 |
"permissions": {
|
| 3 |
"allow": [
|
| 4 |
"Bash(python -m py_compile:*)",
|
| 5 |
+
"Bash(python:*)",
|
| 6 |
+
"Bash(grep:*)",
|
| 7 |
+
"Bash(wc:*)",
|
| 8 |
+
"WebSearch",
|
| 9 |
+
"WebFetch(domain:docs.cohere.com)",
|
| 10 |
+
"WebFetch(domain:github.com)",
|
| 11 |
+
"WebFetch(domain:pypi.org)",
|
| 12 |
+
"WebFetch(domain:qdrant.tech)",
|
| 13 |
+
"WebFetch(domain:zilliz.com)",
|
| 14 |
+
"WebFetch(domain:docs.pinecone.io)",
|
| 15 |
+
"WebFetch(domain:cohere.com)"
|
| 16 |
]
|
| 17 |
}
|
| 18 |
}
|
app.py
CHANGED
|
@@ -379,6 +379,9 @@ def update_model_inputs_visibility(choice):
|
|
| 379 |
elif key_type == "gemini":
|
| 380 |
label = "Gemini API Key (optional if using gcloud)"
|
| 381 |
placeholder = f"Leave blank if using gcloud ADC, or enter API key / set {env_var}"
|
|
|
|
|
|
|
|
|
|
| 382 |
else:
|
| 383 |
label = "OpenAI API Key"
|
| 384 |
placeholder = f"Enter your OpenAI API key (or set {env_var} env var)"
|
|
|
|
| 379 |
elif key_type == "gemini":
|
| 380 |
label = "Gemini API Key (optional if using gcloud)"
|
| 381 |
placeholder = f"Leave blank if using gcloud ADC, or enter API key / set {env_var}"
|
| 382 |
+
elif key_type == "cohere":
|
| 383 |
+
label = "Cohere API Key"
|
| 384 |
+
placeholder = f"Enter your Cohere API key (or set {env_var} env var)"
|
| 385 |
else:
|
| 386 |
label = "OpenAI API Key"
|
| 387 |
placeholder = f"Enter your OpenAI API key (or set {env_var} env var)"
|
models.py
CHANGED
|
@@ -137,6 +137,20 @@ API_MODELS = {
|
|
| 137 |
"model_name": "gemini-embedding-001",
|
| 138 |
"dimensions": 1536,
|
| 139 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
}
|
| 141 |
|
| 142 |
# Merge all models for easy lookup
|
|
@@ -771,6 +785,140 @@ class GeminiEmbeddingModel(BaseEmbeddingModel):
|
|
| 771 |
return self.config.get("description", "")
|
| 772 |
|
| 773 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
def get_curated_model_choices() -> list[tuple[str, str]]:
|
| 775 |
"""
|
| 776 |
Get list of curated local models for UI dropdown.
|
|
@@ -822,7 +970,9 @@ def is_api_model(model_id: str) -> bool:
|
|
| 822 |
return True
|
| 823 |
if model_id.startswith("gemini/"):
|
| 824 |
return True
|
| 825 |
-
|
|
|
|
|
|
|
| 826 |
return False
|
| 827 |
|
| 828 |
|
|
@@ -854,6 +1004,8 @@ def load_model(
|
|
| 854 |
return VoyageEmbeddingModel(model_id, api_key=api_key)
|
| 855 |
elif model_type == "gemini" or model_id.startswith("gemini/"):
|
| 856 |
return GeminiEmbeddingModel(model_id, api_key=api_key)
|
|
|
|
|
|
|
| 857 |
elif model_type == "openai" or model_id.startswith("openai/"):
|
| 858 |
return OpenAIEmbeddingModel(model_id, api_key=api_key)
|
| 859 |
else:
|
|
@@ -897,7 +1049,11 @@ def validate_model_id(model_id: str) -> tuple[bool, str]:
|
|
| 897 |
# Check for Gemini models
|
| 898 |
if model_id.startswith("gemini/"):
|
| 899 |
return True, ""
|
| 900 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 901 |
# For custom models, check if it looks like a valid HF model ID
|
| 902 |
if "/" not in model_id:
|
| 903 |
return False, "Model ID should be in format 'organization/model-name'"
|
|
@@ -944,9 +1100,11 @@ def get_api_key_type(model_id: str) -> Optional[str]:
|
|
| 944 |
return "voyage"
|
| 945 |
elif model_type == "gemini" or model_id.startswith("gemini/"):
|
| 946 |
return "gemini"
|
|
|
|
|
|
|
| 947 |
elif model_type == "openai" or model_id.startswith("openai/"):
|
| 948 |
return "openai"
|
| 949 |
-
|
| 950 |
return None
|
| 951 |
|
| 952 |
|
|
@@ -967,6 +1125,8 @@ def get_api_key_env_var(model_id: str) -> Optional[str]:
|
|
| 967 |
return "VOYAGE_API_KEY"
|
| 968 |
elif key_type == "gemini":
|
| 969 |
return "GEMINI_API_KEY"
|
|
|
|
|
|
|
| 970 |
return None
|
| 971 |
|
| 972 |
|
|
|
|
| 137 |
"model_name": "gemini-embedding-001",
|
| 138 |
"dimensions": 1536,
|
| 139 |
},
|
| 140 |
+
"cohere/embed-multilingual-v3.0": {
|
| 141 |
+
"name": "Cohere embed-multilingual-v3.0",
|
| 142 |
+
"description": "Cohere's multilingual embedding model, 100+ languages (API key required)",
|
| 143 |
+
"type": "cohere",
|
| 144 |
+
"model_name": "embed-multilingual-v3.0",
|
| 145 |
+
"dimensions": 1024,
|
| 146 |
+
},
|
| 147 |
+
"cohere/embed-multilingual-light-v3.0": {
|
| 148 |
+
"name": "Cohere embed-multilingual-light-v3.0",
|
| 149 |
+
"description": "Cohere's lightweight multilingual model (API key required)",
|
| 150 |
+
"type": "cohere",
|
| 151 |
+
"model_name": "embed-multilingual-light-v3.0",
|
| 152 |
+
"dimensions": 384,
|
| 153 |
+
},
|
| 154 |
}
|
| 155 |
|
| 156 |
# Merge all models for easy lookup
|
|
|
|
| 785 |
return self.config.get("description", "")
|
| 786 |
|
| 787 |
|
| 788 |
+
class CohereEmbeddingModel(BaseEmbeddingModel):
    """
    Wrapper for Cohere embedding API with consistent interface.

    Mirrors the other API-backed models in this module (OpenAI, Voyage,
    Gemini): lazy import of the vendor SDK, API key from argument or
    environment, config lookup in API_MODELS, and an ``encode`` method
    returning a float32 numpy array.
    """

    def __init__(
        self,
        model_id: str,
        api_key: Optional[str] = None,
    ):
        """
        Initialize the Cohere embedding model.

        Args:
            model_id: Model ID in format 'cohere/model-name'
            api_key: Cohere API key (or uses COHERE_API_KEY env var)

        Raises:
            ImportError: if the ``cohere`` package is not installed.
            ValueError: if no API key is available.
        """
        try:
            import cohere
        except ImportError as e:
            # Chain the original ImportError so the real failure is visible.
            raise ImportError(
                "Cohere package not installed. Install with: pip install cohere"
            ) from e

        self.model_id = model_id

        # Get API key from parameter or environment
        api_key = api_key or os.environ.get("COHERE_API_KEY")
        if not api_key:
            raise ValueError(
                "Cohere API key required. Set COHERE_API_KEY environment variable "
                "or pass api_key parameter."
            )

        self.client = cohere.Client(api_key=api_key)

        # Get model config; fall back to a generic entry for unknown
        # 'cohere/...' IDs so custom model names still work.
        self.config = API_MODELS.get(model_id, {
            "name": model_id,
            "description": "Cohere embedding model",
            "type": "cohere",
            "model_name": model_id.replace("cohere/", ""),
            "dimensions": 1024,  # Default dimension
        })

        self._model_name = self.config["model_name"]
        self.embedding_dim = self.config["dimensions"]

        print(f"Initialized Cohere embedding model: {self._model_name}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def encode(
        self,
        texts: list[str],
        is_query: bool = False,
        batch_size: int = 96,  # Cohere supports up to 96 texts per request
        show_progress: bool = True,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode texts to embeddings using Cohere API.

        Args:
            texts: List of texts to encode
            is_query: Whether these are queries (uses search_query vs search_document)
            batch_size: Batch size for API calls
            show_progress: Whether to show progress bar
            normalize: Whether to L2-normalize embeddings

        Returns:
            numpy array of shape (len(texts), embedding_dim)

        Raises:
            RuntimeError: if the API still fails after all retries.
        """
        import time

        all_embeddings = []
        total_batches = (len(texts) + batch_size - 1) // batch_size

        # Cohere v3 models require input_type for asymmetric embeddings
        input_type = "search_query" if is_query else "search_document"

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_num = i // batch_size + 1

            if show_progress:
                print(f"  Encoding batch {batch_num}/{total_batches}...")

            # Retry with exponential backoff (1s, 2s) before giving up.
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    result = self.client.embed(
                        texts=batch,
                        model=self._model_name,
                        input_type=input_type,
                    )

                    # Extract embeddings from response
                    batch_embeddings = result.embeddings
                    all_embeddings.extend(batch_embeddings)
                    break

                except Exception as e:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt
                        print(f"  API error, retrying in {wait_time}s: {e}")
                        time.sleep(wait_time)
                    else:
                        # Chain the last API error as the cause.
                        raise RuntimeError(
                            f"Cohere API error after {max_retries} retries: {e}"
                        ) from e

            # Small delay to avoid rate limits
            if i + batch_size < len(texts):
                time.sleep(0.1)

        embeddings = np.array(all_embeddings, dtype=np.float32)

        # Normalize if requested; clamp norms to avoid division by zero.
        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.maximum(norms, 1e-10)

        return embeddings

    @property
    def name(self) -> str:
        """Get display name for the model."""
        return self.config.get("name", self.model_id)

    @property
    def description(self) -> str:
        """Get description for the model."""
        return self.config.get("description", "")
+
|
| 921 |
+
|
| 922 |
def get_curated_model_choices() -> list[tuple[str, str]]:
|
| 923 |
"""
|
| 924 |
Get list of curated local models for UI dropdown.
|
|
|
|
| 970 |
return True
|
| 971 |
if model_id.startswith("gemini/"):
|
| 972 |
return True
|
| 973 |
+
if model_id.startswith("cohere/"):
|
| 974 |
+
return True
|
| 975 |
+
|
| 976 |
return False
|
| 977 |
|
| 978 |
|
|
|
|
| 1004 |
return VoyageEmbeddingModel(model_id, api_key=api_key)
|
| 1005 |
elif model_type == "gemini" or model_id.startswith("gemini/"):
|
| 1006 |
return GeminiEmbeddingModel(model_id, api_key=api_key)
|
| 1007 |
+
elif model_type == "cohere" or model_id.startswith("cohere/"):
|
| 1008 |
+
return CohereEmbeddingModel(model_id, api_key=api_key)
|
| 1009 |
elif model_type == "openai" or model_id.startswith("openai/"):
|
| 1010 |
return OpenAIEmbeddingModel(model_id, api_key=api_key)
|
| 1011 |
else:
|
|
|
|
| 1049 |
# Check for Gemini models
|
| 1050 |
if model_id.startswith("gemini/"):
|
| 1051 |
return True, ""
|
| 1052 |
+
|
| 1053 |
+
# Check for Cohere models
|
| 1054 |
+
if model_id.startswith("cohere/"):
|
| 1055 |
+
return True, ""
|
| 1056 |
+
|
| 1057 |
# For custom models, check if it looks like a valid HF model ID
|
| 1058 |
if "/" not in model_id:
|
| 1059 |
return False, "Model ID should be in format 'organization/model-name'"
|
|
|
|
| 1100 |
return "voyage"
|
| 1101 |
elif model_type == "gemini" or model_id.startswith("gemini/"):
|
| 1102 |
return "gemini"
|
| 1103 |
+
elif model_type == "cohere" or model_id.startswith("cohere/"):
|
| 1104 |
+
return "cohere"
|
| 1105 |
elif model_type == "openai" or model_id.startswith("openai/"):
|
| 1106 |
return "openai"
|
| 1107 |
+
|
| 1108 |
return None
|
| 1109 |
|
| 1110 |
|
|
|
|
| 1125 |
return "VOYAGE_API_KEY"
|
| 1126 |
elif key_type == "gemini":
|
| 1127 |
return "GEMINI_API_KEY"
|
| 1128 |
+
elif key_type == "cohere":
|
| 1129 |
+
return "COHERE_API_KEY"
|
| 1130 |
return None
|
| 1131 |
|
| 1132 |
|
requirements.txt
CHANGED
|
@@ -19,4 +19,5 @@ openai>=1.0.0
|
|
| 19 |
tiktoken>=0.5.0
|
| 20 |
voyageai>=0.3.0
|
| 21 |
google-genai>=1.0.0
|
|
|
|
| 22 |
|
|
|
|
| 19 |
tiktoken>=0.5.0
|
| 20 |
voyageai>=0.3.0
|
| 21 |
google-genai>=1.0.0
|
| 22 |
+
cohere>=5.0.0
|
| 23 |
|