Spaces:
Sleeping
Sleeping
File size: 11,816 Bytes
f133a92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 |
"""Embedding utilities for QualiVec."""
import numpy as np
import pandas as pd
from typing import List, Dict, Any, Optional, Union
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import os
import time
class EmbeddingModel:
"""Handles text embedding for QualiVec."""
def __init__(self,
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
model_type: str = "huggingface",
device: Optional[str] = None,
cache_dir: Optional[str] = None,
verbose: bool = True):
"""Initialize the embedding model.
Args:
model_name: Name of the model to use (HuggingFace model or Gemini model).
model_type: Type of model ('huggingface' or 'gemini').
device: Device to use for computation ('cpu' or 'cuda'). Only for HuggingFace models.
cache_dir: Directory to cache models. Only for HuggingFace models.
verbose: Whether to print status messages.
"""
self.model_name = model_name
self.model_type = model_type.lower()
self.verbose = verbose
self.cache_dir = cache_dir
if self.model_type not in ["huggingface", "gemini"]:
raise ValueError(f"model_type must be 'huggingface' or 'gemini', got '{model_type}'")
if self.model_type == "huggingface":
# Determine device
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
self.device = device
if self.verbose:
print(f"Using device: {self.device}")
print(f"Loading HuggingFace model: {model_name}")
# Load model and tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
self.model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir).to(self.device)
if self.verbose:
print(f"HuggingFace model loaded successfully")
elif self.model_type == "gemini":
if self.verbose:
print(f"Initializing Gemini model: {model_name}")
# Import Gemini client
try:
from google import genai
# Get API key from environment variable
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
raise ValueError(
"GOOGLE_API_KEY environment variable not set. "
"Please set it with your Gemini API key."
)
self.genai_client = genai.Client(api_key="API_KEY")
if self.verbose:
print(f"Gemini client initialized successfully")
print(f"⚠️ Free tier limits: 1,500 requests/day, 100 texts per batch")
except ImportError:
raise ImportError("google-genai library is required for Gemini models. Install with: pip install google-genai")
def _mean_pooling(self, model_output, attention_mask):
"""Mean pooling operation to get sentence embeddings."""
token_embeddings = model_output[0]
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def embed_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
"""Generate embeddings for a list of texts.
Args:
texts: List of texts to embed.
batch_size: Batch size for processing.
Returns:
Numpy array of embeddings.
"""
if self.verbose:
print(f"Generating embeddings for {len(texts)} texts")
if self.model_type == "huggingface":
return self._embed_texts_huggingface(texts, batch_size)
elif self.model_type == "gemini":
return self._embed_texts_gemini(texts, batch_size)
else:
raise ValueError(f"Unsupported model_type: {self.model_type}")
def _embed_texts_huggingface(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
"""Generate embeddings using HuggingFace model.
Args:
texts: List of texts to embed.
batch_size: Batch size for processing.
Returns:
Numpy array of embeddings.
"""
embeddings = []
# Process in batches
for i in tqdm(range(0, len(texts), batch_size), disable=not self.verbose):
batch_texts = texts[i:i + batch_size]
# Tokenize
encoded_input = self.tokenizer(batch_texts, padding=True, truncation=True,
max_length=512, return_tensors='pt').to(self.device)
# Get model output
with torch.no_grad():
model_output = self.model(**encoded_input)
# Mean pooling
batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
# Normalize embeddings
batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
# Add to list
embeddings.append(batch_embeddings.cpu().numpy())
# Concatenate all batches
all_embeddings = np.vstack(embeddings)
if self.verbose:
print(f"Generated embeddings with shape: {all_embeddings.shape}")
return all_embeddings
def _embed_texts_gemini(self, texts: List[str], batch_size: int = 100) -> np.ndarray:
"""Generate embeddings using Gemini model with rate limiting.
Args:
texts: List of texts to embed.
batch_size: Batch size for processing (reduced to 100 to respect rate limits).
Returns:
Numpy array of embeddings.
"""
embeddings = []
# Process in batches with rate limiting
for i in tqdm(range(0, len(texts), batch_size), disable=not self.verbose):
batch_texts = texts[i:i + batch_size]
# Retry logic with exponential backoff
max_retries = 3
retry_delay = 2 # seconds
for attempt in range(max_retries):
try:
# Get embeddings from Gemini
result = self.genai_client.models.embed_content(
model=self.model_name,
contents=batch_texts # type: ignore
)
# Extract embeddings
if result.embeddings:
batch_embeddings = [emb.values for emb in result.embeddings]
embeddings.extend(batch_embeddings)
# Add delay between batches to respect rate limits (free tier: 1500 requests/day)
# With 100 texts per batch and ~60 second delay, we can process ~1440 texts/day
if i + batch_size < len(texts):
time.sleep(1) # 1 second delay between batches
break # Success, exit retry loop
except Exception as e:
error_msg = str(e)
if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg:
if attempt < max_retries - 1:
if self.verbose:
print(f"\nRate limit hit. Waiting {retry_delay} seconds before retry {attempt + 1}/{max_retries}...")
time.sleep(retry_delay)
retry_delay *= 2 # Exponential backoff
else:
raise Exception(
f"Gemini API quota exceeded. Free tier limits: 1500 requests/day.\n"
f"Error: {error_msg}\n\n"
f"Solutions:\n"
f"1. Wait and try again later (quota resets daily)\n"
f"2. Reduce the amount of data being processed\n"
f"3. Upgrade to a paid API plan\n"
f"4. Use HuggingFace models instead (no API limits)"
)
else:
raise # Re-raise non-quota errors
# Convert to numpy array
all_embeddings = np.array(embeddings)
if self.verbose:
print(f"Generated embeddings with shape: {all_embeddings.shape}")
return all_embeddings
def embed_dataframe(self,
df: pd.DataFrame,
text_column: str,
batch_size: int = 32) -> np.ndarray:
"""Generate embeddings for texts in a DataFrame column.
Args:
df: DataFrame containing texts.
text_column: Name of the column containing texts.
batch_size: Batch size for processing.
Returns:
Numpy array of embeddings.
"""
if text_column not in df.columns:
raise ValueError(f"Column '{text_column}' not found in DataFrame.")
texts = df[text_column].fillna("").tolist()
return self.embed_texts(texts, batch_size)
def embed_reference_vectors(self,
df: pd.DataFrame,
class_column: str = "class",
node_column: str = "matching_node",
batch_size: int = 32) -> Dict[str, Any]:
"""Generate embeddings for reference vectors.
Args:
df: DataFrame containing reference vectors.
class_column: Name of the column containing class labels.
node_column: Name of the column containing matching nodes.
batch_size: Batch size for processing.
Returns:
Dictionary with class info and embeddings.
"""
required_columns = [class_column, node_column]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
raise ValueError(f"Required columns {missing_columns} not found in DataFrame.")
# Get texts and generate embeddings
texts = df[node_column].fillna("").tolist()
embeddings = self.embed_texts(texts, batch_size)
# Create result dictionary
result = {
"classes": df[class_column].tolist(),
"nodes": df[node_column].tolist(),
"embeddings": embeddings,
"class_to_idx": {cls: i for i, cls in enumerate(df[class_column].unique())}
}
if self.verbose:
print(f"Generated embeddings for {len(result['classes'])} reference vectors")
print(f"Unique classes: {len(result['class_to_idx'])}")
return result
|