"""
models/anomaly-detection/src/utils/vectorizer.py

Text vectorization using language-specific BERT models (downloaded locally).
"""

import os
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

logger = logging.getLogger("vectorizer")

# Optional heavy dependencies: the module degrades gracefully if they are missing.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    logger.warning("Transformers not available. Install with: pip install transformers torch")

# sentence-transformers is only probed for availability; nothing in this module uses it.
try:
    from sentence_transformers import SentenceTransformer
    SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False


class MultilingualVectorizer:
    """
    Vectorizer using language-specific BERT models.
    Downloads and caches models locally from HuggingFace.

    Models:
        - English: distilbert-base-uncased (fast, accurate)
        - Sinhala: keshan/SinhalaBERTo (specialized)
        - Tamil: l3cube-pune/tamil-bert (specialized)
    """

    MODEL_MAP = {
        "english": "distilbert-base-uncased",
        "sinhala": "keshan/SinhalaBERTo",
        "tamil": "l3cube-pune/tamil-bert",
    }
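
    # All three checkpoints are base-size encoders, so each is expected to emit
    # 768-dim hidden states; the zero-vector fallbacks below assume that width.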

    def __init__(self, models_cache_dir: Optional[str] = None, device: Optional[str] = None):
        """
        Initialize the multilingual vectorizer.

        Args:
            models_cache_dir: Directory to cache downloaded models
            device: 'cuda' or 'cpu' (auto-detected if None)
        """
        self.models_cache_dir = models_cache_dir or str(
            Path(__file__).parent.parent.parent / "models_cache"
        )
        Path(self.models_cache_dir).mkdir(parents=True, exist_ok=True)

        # Point HuggingFace caching at the local directory so model downloads
        # land inside the project rather than in the user's home cache.
        os.environ["TRANSFORMERS_CACHE"] = self.models_cache_dir
        os.environ["HF_HOME"] = self.models_cache_dir

        if device is None:
            # Short-circuit keeps the torch access safe when transformers is absent.
            if TRANSFORMERS_AVAILABLE and torch.cuda.is_available():
                self.device = "cuda"
            else:
                self.device = "cpu"
        else:
            self.device = device

        logger.info(f"[Vectorizer] Using device: {self.device}")

        # Lazily populated per-language cache of (tokenizer, model) pairs.
        self.models: Dict[str, Tuple] = {}
        self.fallback_model = None

    def _load_model(self, language: str) -> Tuple:
        """
        Load a language-specific model from cache, downloading it if needed.

        Returns:
            Tuple of (tokenizer, model)
        """
        if language in self.models:
            return self.models[language]

        model_name = self.MODEL_MAP.get(language, self.MODEL_MAP["english"])

        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers library not available")

        logger.info(f"[Vectorizer] Loading model: {model_name}")

        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            )
            model = AutoModel.from_pretrained(
                model_name,
                cache_dir=self.models_cache_dir
            ).to(self.device)
            model.eval()

            self.models[language] = (tokenizer, model)
            logger.info(f"[Vectorizer] ✓ Loaded {model_name} ({language})")
            return tokenizer, model

        except Exception as e:
            logger.error(f"[Vectorizer] Failed to load {model_name}: {e}")
            # Fall back to the English model rather than failing outright.
            if language != "english":
                logger.info("[Vectorizer] Falling back to English model")
                return self._load_model("english")
            raise

    def _get_embedding(self, text: str, tokenizer, model) -> np.ndarray:
        """
        Compute an embedding vector using masked mean pooling.

        Args:
            text: Input text
            tokenizer: HuggingFace tokenizer
            model: HuggingFace model

        Returns:
            768-dim numpy array
        """
        if not TRANSFORMERS_AVAILABLE:
            raise RuntimeError("Transformers not available")

        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)

        attention_mask = inputs["attention_mask"]
        hidden_states = outputs.last_hidden_state

        # Mean pooling: average the token embeddings, ignoring padding positions.
        # The attention mask is broadcast to the hidden size, and the clamp guards
        # against division by zero if an input somehow contains no real tokens.
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_embedding = sum_embeddings / sum_mask

        return mean_embedding.cpu().numpy().flatten()

    def vectorize(self, text: str, language: str = "english") -> np.ndarray:
        """
        Convert text to a vector embedding.

        Args:
            text: Input text
            language: 'english', 'sinhala', 'tamil', or 'unknown'

        Returns:
            768-dim numpy array (zeros on empty input or failure)
        """
        if not text or not text.strip():
            return np.zeros(768)

        # Texts of unknown language are routed to the English model.
        if language == "unknown":
            language = "english"

        try:
            tokenizer, model = self._load_model(language)
            return self._get_embedding(text, tokenizer, model)
        except Exception as e:
            logger.error(f"[Vectorizer] Error vectorizing: {e}")
            return np.zeros(768)

    def vectorize_batch(
        self,
        texts: List[str],
        languages: Optional[List[str]] = None
    ) -> np.ndarray:
        """
        Batch vectorization for multiple texts.

        Args:
            texts: List of text strings
            languages: Optional list of language codes (same length as texts)

        Returns:
            numpy array of shape (n_texts, 768)
        """
        if languages is None:
            languages = ["english"] * len(texts)

        embeddings = []
        for text, lang in zip(texts, languages):
            emb = self.vectorize(text, lang)
            embeddings.append(emb)

        return np.array(embeddings)
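
    # Note: the loop above embeds texts one at a time, which keeps the code simple
    # and memory-safe. If throughput becomes a concern, a possible optimization
    # (a sketch, not part of the original design) is to group texts by language,
    # tokenize each group as a list so the tokenizer pads to a common length, and
    # run one forward pass per group, reusing the masked mean pooling above.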

    def download_all_models(self):
        """Pre-download all language models."""
        for language in self.MODEL_MAP.keys():
            try:
                logger.info(f"[Vectorizer] Pre-downloading {language} model...")
                self._load_model(language)
            except Exception as e:
                logger.warning(f"[Vectorizer] Failed to download {language}: {e}")


# Module-level singleton so repeated callers share the loaded models.
_vectorizer: Optional[MultilingualVectorizer] = None


def get_vectorizer(models_cache_dir: Optional[str] = None) -> MultilingualVectorizer:
    """Get or create the singleton vectorizer instance."""
    global _vectorizer
    if _vectorizer is None:
        _vectorizer = MultilingualVectorizer(models_cache_dir)
    return _vectorizer


def vectorize_text(text: str, language: str = "english") -> np.ndarray:
    """
    Convenience function for text vectorization.

    Args:
        text: Input text
        language: Language code

    Returns:
        768-dim numpy array
    """
    return get_vectorizer().vectorize(text, language)
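

if __name__ == "__main__":
    # Illustrative smoke test (an assumption, not part of the original module):
    # embeds two short strings and prints the embedding shapes. The first run
    # downloads distilbert-base-uncased, so it needs network access; without
    # transformers installed, vectorize() falls back to zero vectors instead.
    logging.basicConfig(level=logging.INFO)
    emb_en = vectorize_text("Server latency spiked at 03:00 UTC.", "english")
    emb_unknown = vectorize_text("unlabelled log line", "unknown")
    print(f"english embedding shape: {emb_en.shape}")            # expected: (768,)
    print(f"unknown-language embedding shape: {emb_unknown.shape}")  # routed to English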