# nivakaran's picture
# Upload folder using huggingface_hub
# 16ec2cf verified
"""
models/anomaly-detection/src/utils/vectorizer.py
Text vectorization using language-specific BERT models (downloaded locally)
"""
import os
import logging
from typing import Dict, List, Optional, Tuple
from pathlib import Path
import numpy as np
logger = logging.getLogger("vectorizer")
# Transformers
try:
from transformers import AutoTokenizer, AutoModel
import torch
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
logger.warning("Transformers not available. Install with: pip install transformers torch")
# Sentence Transformers for fallback
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
class MultilingualVectorizer:
    """
    Vectorizer using language-specific BERT models.

    Downloads and caches models locally from HuggingFace. Models are loaded
    lazily on first use per language and kept in memory afterwards.

    Models:
        - English: distilbert-base-uncased (fast, accurate)
        - Sinhala: keshan/SinhalaBERTo (specialized)
        - Tamil: l3cube-pune/tamil-bert (specialized)
    """

    MODEL_MAP = {
        "english": "distilbert-base-uncased",
        "sinhala": "keshan/SinhalaBERTo",
        "tamil": "l3cube-pune/tamil-bert",
    }

    # Hidden-state width shared by the configured models; also the size of the
    # zero vector returned for empty input or on vectorization failure.
    EMBEDDING_DIM = 768

    def __init__(self, models_cache_dir: Optional[str] = None, device: Optional[str] = None):
        """
        Initialize the multilingual vectorizer.

        Args:
            models_cache_dir: Directory to cache downloaded models
                (defaults to <repo>/models_cache next to this package).
            device: 'cuda' or 'cpu' (auto-detected if None).
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)
        # Point HuggingFace at the local cache. NOTE(review): transformers may
        # already be imported by the time this runs, so these env vars are a
        # best-effort hint; the explicit cache_dir= in _load_model is what
        # actually controls where models land.
        os.environ["TRANSFORMERS_CACHE"] = self.models_cache_dir
        os.environ["HF_HOME"] = self.models_cache_dir
        # Auto-detect device when not given explicitly.
        if device is not None:
            self.device = device
        elif TRANSFORMERS_AVAILABLE and torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        logger.info("[Vectorizer] Using device: %s", self.device)
        # Lazily-loaded per-language models: {lang: (tokenizer, model)}
        self.models: Dict[str, Tuple] = {}
        # Reserved for a sentence-transformers fallback; currently unused.
        self.fallback_model = None

    def _load_model(self, language: str) -> Tuple:
        """
        Load language-specific model from cache or download.

        Unknown languages map to the English model; if a non-English model
        fails to load we fall back to English before giving up.

        Returns:
            Tuple of (tokenizer, model)

        Raises:
            RuntimeError: If the transformers library is not installed.
        """
        if language in self.models:
            return self.models[language]
        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers library not available")
        model_name = self.MODEL_MAP.get(language, self.MODEL_MAP["english"])
        logger.info("[Vectorizer] Loading model: %s", model_name)
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            )
            model = AutoModel.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            ).to(self.device)
            model.eval()  # inference mode only; no training here
            self.models[language] = (tokenizer, model)
            logger.info("[Vectorizer] ✓ Loaded %s (%s)", model_name, language)
            return tokenizer, model
        except Exception as e:
            logger.error("[Vectorizer] Failed to load %s: %s", model_name, e)
            # Fallback to English model before propagating the error.
            if language != "english":
                logger.info("[Vectorizer] Falling back to English model")
                return self._load_model("english")
            raise

    def _get_embedding(self, text: str, tokenizer, model) -> np.ndarray:
        """
        Get embedding vector using attention-mask-aware mean pooling.

        Args:
            text: Input text
            tokenizer: HuggingFace tokenizer
            model: HuggingFace model

        Returns:
            768-dim numpy array
        """
        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers not available")
        # Tokenize (truncate to the model's 512-token context window)
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)
        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over sequence length, ignoring padding positions
        attention_mask = inputs["attention_mask"]
        hidden_states = outputs.last_hidden_state
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
        # Clamp avoids division by zero for fully-masked sequences
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_embedding = sum_embeddings / sum_mask
        return mean_embedding.cpu().numpy().flatten()

    def vectorize(self, text: str, language: str = "english") -> np.ndarray:
        """
        Convert text to vector embedding.

        Empty/blank text and any vectorization error yield a zero vector so
        callers never have to handle exceptions from this method.

        Args:
            text: Input text
            language: 'english', 'sinhala', 'tamil', or 'unknown'

        Returns:
            768-dim numpy array
        """
        if not text or not text.strip():
            return np.zeros(self.EMBEDDING_DIM)
        # Map unknown to english
        if language == "unknown":
            language = "english"
        try:
            tokenizer, model = self._load_model(language)
            return self._get_embedding(text, tokenizer, model)
        except Exception as e:
            logger.error("[Vectorizer] Error vectorizing: %s", e)
            # Return zeros as a deliberate best-effort fallback
            return np.zeros(self.EMBEDDING_DIM)

    def vectorize_batch(
        self,
        texts: List[str],
        languages: Optional[List[str]] = None
    ) -> np.ndarray:
        """
        Batch vectorization for multiple texts.

        Args:
            texts: List of text strings
            languages: Optional list of language codes (same length as texts)

        Returns:
            numpy array of shape (n_texts, 768)

        Raises:
            ValueError: If languages is given but its length differs from texts.
        """
        if languages is None:
            languages = ["english"] * len(texts)
        elif len(languages) != len(texts):
            # Previously zip() silently truncated on mismatch; fail loudly instead.
            raise ValueError(
                f"languages length ({len(languages)}) must match texts length ({len(texts)})"
            )
        if not texts:
            # np.array([]) would be shape (0,); preserve the documented 2-D shape.
            return np.zeros((0, self.EMBEDDING_DIM))
        return np.array([self.vectorize(t, lang) for t, lang in zip(texts, languages)])

    def download_all_models(self):
        """Pre-download all language models (best-effort; failures are logged)."""
        for language in self.MODEL_MAP:
            try:
                logger.info("[Vectorizer] Pre-downloading %s model...", language)
                self._load_model(language)
            except Exception as e:
                logger.warning("[Vectorizer] Failed to download %s: %s", language, e)
# Module-level singleton, created lazily by get_vectorizer()
_vectorizer: Optional[MultilingualVectorizer] = None


def get_vectorizer(models_cache_dir: Optional[str] = None) -> MultilingualVectorizer:
    """
    Return the process-wide vectorizer, creating it on first call.

    Note: models_cache_dir is only honored on the first call; subsequent
    calls return the already-built instance unchanged.
    """
    global _vectorizer
    if _vectorizer is not None:
        return _vectorizer
    _vectorizer = MultilingualVectorizer(models_cache_dir)
    return _vectorizer
def vectorize_text(text: str, language: str = "english") -> np.ndarray:
    """
    Convenience function: vectorize *text* via the singleton vectorizer.

    Args:
        text: Input text
        language: Language code

    Returns:
        768-dim numpy array
    """
    vectorizer = get_vectorizer()
    return vectorizer.vectorize(text, language)