Lev Israel committed on
Commit
d1c390a
·
1 Parent(s): 1787d7f

Embedding Gemma

Browse files
Files changed (1) hide show
  1. models.py +28 -5
models.py CHANGED
@@ -56,6 +56,14 @@ CURATED_MODELS = {
56
  "query_prefix": "",
57
  "passage_prefix": "",
58
  },
 
 
 
 
 
 
 
 
59
  }
60
 
61
  # API-based models
@@ -233,6 +241,7 @@ class EmbeddingModel(BaseEmbeddingModel):
233
  model_id: str,
234
  device: Optional[str] = None,
235
  max_length: int = 512,
 
236
  ):
237
  """
238
  Initialize the embedding model.
@@ -241,12 +250,12 @@ class EmbeddingModel(BaseEmbeddingModel):
241
  model_id: Hugging Face model ID
242
  device: Device to use ('cuda', 'cpu', or None for auto)
243
  max_length: Maximum sequence length for tokenization
 
244
  """
245
  from sentence_transformers import SentenceTransformer
246
  import torch
247
 
248
  self.model_id = model_id
249
- self.max_length = max_length
250
 
251
  # Auto-detect device
252
  if device is None:
@@ -262,12 +271,18 @@ class EmbeddingModel(BaseEmbeddingModel):
262
  "passage_prefix": "",
263
  })
264
 
 
 
 
 
 
 
265
  # Load the model with float16 on CUDA to save VRAM
266
  # (12B model: float32 = 48GB, float16 = 24GB)
267
  print(f"Loading model: {model_id} on {device}")
268
 
269
  # Only trust remote code from known publishers (security measure)
270
- trusted_publishers = ["nvidia/"]
271
  trust_remote_code = any(model_id.startswith(pub) for pub in trusted_publishers)
272
 
273
  if device == "cuda":
@@ -276,13 +291,19 @@ class EmbeddingModel(BaseEmbeddingModel):
276
  device=device,
277
  model_kwargs={"torch_dtype": torch.float16},
278
  trust_remote_code=trust_remote_code,
 
279
  )
280
  else:
281
- self.model = SentenceTransformer(model_id, device=device, trust_remote_code=trust_remote_code)
 
 
 
 
 
282
 
283
  # Set max sequence length if supported
284
  if hasattr(self.model, "max_seq_length"):
285
- self.model.max_seq_length = min(max_length, self.model.max_seq_length)
286
 
287
  self.embedding_dim = self.model.get_sentence_embedding_dimension()
288
  print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
@@ -980,6 +1001,7 @@ def load_model(
980
  model_id: str,
981
  device: Optional[str] = None,
982
  api_key: Optional[str] = None,
 
983
  ) -> BaseEmbeddingModel:
984
  """
985
  Load an embedding model by ID.
@@ -988,6 +1010,7 @@ def load_model(
988
  model_id: Model ID (HuggingFace model ID or API model like 'openai/text-embedding-3-large')
989
  device: Device to use (for local models only)
990
  api_key: API key (for API-based models, or uses environment variable)
 
991
 
992
  Returns:
993
  Loaded embedding model instance
@@ -1012,7 +1035,7 @@ def load_model(
1012
  raise ValueError(f"Unknown API model type: {model_id}")
1013
 
1014
  # Otherwise, load as a local sentence-transformer model
1015
- return EmbeddingModel(model_id, device=device)
1016
 
1017
 
1018
  def validate_model_id(model_id: str) -> tuple[bool, str]:
 
56
  "query_prefix": "",
57
  "passage_prefix": "",
58
  },
59
+ "google/embeddinggemma-300m": {
60
+ "name": "EmbeddingGemma",
61
+ "description": "Google's 300M param embedding model, 100+ languages, 768d (requires HF token + license)",
62
+ "type": "local",
63
+ "query_prefix": "task: search result | query: ",
64
+ "passage_prefix": "title: none | text: ",
65
+ "max_length": 2048,
66
+ },
67
  }
68
 
69
  # API-based models
 
241
  model_id: str,
242
  device: Optional[str] = None,
243
  max_length: int = 512,
244
+ hf_token: Optional[str] = None,
245
  ):
246
  """
247
  Initialize the embedding model.
 
250
  model_id: Hugging Face model ID
251
  device: Device to use ('cuda', 'cpu', or None for auto)
252
  max_length: Maximum sequence length for tokenization
253
+ hf_token: HuggingFace token for gated models (or uses HF_TOKEN env var)
254
  """
255
  from sentence_transformers import SentenceTransformer
256
  import torch
257
 
258
  self.model_id = model_id
 
259
 
260
  # Auto-detect device
261
  if device is None:
 
271
  "passage_prefix": "",
272
  })
273
 
274
+ # Use config max_length if available, otherwise use parameter
275
+ self.max_length = self.config.get("max_length", max_length)
276
+
277
+ # Get HF token from parameter or environment (for gated models like EmbeddingGemma)
278
+ hf_token = hf_token or os.environ.get("HF_TOKEN")
279
+
280
  # Load the model with float16 on CUDA to save VRAM
281
  # (12B model: float32 = 48GB, float16 = 24GB)
282
  print(f"Loading model: {model_id} on {device}")
283
 
284
  # Only trust remote code from known publishers (security measure)
285
+ trusted_publishers = ["nvidia/", "google/"]
286
  trust_remote_code = any(model_id.startswith(pub) for pub in trusted_publishers)
287
 
288
  if device == "cuda":
 
291
  device=device,
292
  model_kwargs={"torch_dtype": torch.float16},
293
  trust_remote_code=trust_remote_code,
294
+ token=hf_token,
295
  )
296
  else:
297
+ self.model = SentenceTransformer(
298
+ model_id,
299
+ device=device,
300
+ trust_remote_code=trust_remote_code,
301
+ token=hf_token,
302
+ )
303
 
304
  # Set max sequence length if supported
305
  if hasattr(self.model, "max_seq_length"):
306
+ self.model.max_seq_length = min(self.max_length, self.model.max_seq_length)
307
 
308
  self.embedding_dim = self.model.get_sentence_embedding_dimension()
309
  print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
 
1001
  model_id: str,
1002
  device: Optional[str] = None,
1003
  api_key: Optional[str] = None,
1004
+ hf_token: Optional[str] = None,
1005
  ) -> BaseEmbeddingModel:
1006
  """
1007
  Load an embedding model by ID.
 
1010
  model_id: Model ID (HuggingFace model ID or API model like 'openai/text-embedding-3-large')
1011
  device: Device to use (for local models only)
1012
  api_key: API key (for API-based models, or uses environment variable)
1013
+ hf_token: HuggingFace token for gated local models (or uses HF_TOKEN env var)
1014
 
1015
  Returns:
1016
  Loaded embedding model instance
 
1035
  raise ValueError(f"Unknown API model type: {model_id}")
1036
 
1037
  # Otherwise, load as a local sentence-transformer model
1038
+ return EmbeddingModel(model_id, device=device, hf_token=hf_token)
1039
 
1040
 
1041
  def validate_model_id(model_id: str) -> tuple[bool, str]: