# raij-ai / models.py
# chore: sync from GitHub 2026-03-11 20:36:29 UTC (github-actions[bot], e3a42e5)
import os
# Lazy-loaded singletons - each model is loaded on first use, not at startup.
_embedder = None  # sentence-transformers embedder (populated by get_embedder)
_image_classifier = None # dict: {model, processor, text_features, logit_scale, categories} (populated by get_image_classifier)
_audio_model = None  # NeMo ASR model (populated by get_audio_model)
_absa_pipeline = None  # transformers text-classification pipeline (populated by get_absa_pipeline)
_spacy_nlp = None  # spaCy pipeline (populated by get_spacy_nlp)
def get_embedder():
    """Return the shared sentence-embedding model, loading it on first call.

    Subsequent calls return the cached module-level singleton.
    """
    global _embedder
    if _embedder is not None:
        return _embedder
    from langchain_community.embeddings import HuggingFaceEmbeddings
    print("Loading embedder model...")
    _embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    )
    print("βœ… Embedder loaded")
    return _embedder
def get_image_classifier():
    """
    Lazily build and cache the CLIP zero-shot classification bundle.

    On first call this loads CLIP, applies int8 dynamic quantization
    (CPU-only deployment), and pre-computes unit-normalized text embeddings
    for every category so per-request cost is the image encoder only.
    Scoring replicates the HF pipeline exactly:
        logits = logit_scale * image_features @ text_features.T
        probs  = softmax(logits)
    Returns a dict: {model, processor, text_features, logit_scale, categories}.
    """
    global _image_classifier
    if _image_classifier is not None:
        return _image_classifier

    import torch
    from transformers import CLIPModel, CLIPProcessor
    from utils import load_categories

    print("Loading image model...")
    checkpoint = "openai/clip-vit-large-patch14"
    clip_processor = CLIPProcessor.from_pretrained(checkpoint)
    clip_model = CLIPModel.from_pretrained(checkpoint)
    clip_model.eval()
    # Always on CPU: int8 dynamic quantization of Linear layers cuts memory
    # and speeds up inference.
    clip_model = torch.quantization.quantize_dynamic(
        clip_model, {torch.nn.Linear}, dtype=torch.qint8
    )

    labels = load_categories()
    tokenized = clip_processor(text=labels, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        txt_feats = clip_model.get_text_features(**tokenized)
        txt_feats = txt_feats / txt_feats.norm(dim=-1, keepdim=True)
        temperature = clip_model.logit_scale.exp()

    _image_classifier = {
        "model": clip_model,
        "processor": clip_processor,
        "text_features": txt_feats,   # [N_categories, D] β€” cached, unit-norm
        "logit_scale": temperature,   # learned temperature scalar
        "categories": labels,
    }
    print(f"βœ… Image model loaded β€” {len(labels)} category embeddings cached")
    return _image_classifier
def classify_image(img):
    """
    Zero-shot classify a PIL image against the cached category embeddings.

    Only the image encoder runs per request; text features and the logit
    scale come pre-computed from get_image_classifier().
    Returns (predicted_category, confidence_score).
    """
    import torch
    bundle = get_image_classifier()
    pixel_inputs = bundle["processor"](images=img, return_tensors="pt")
    with torch.no_grad():
        img_feats = bundle["model"].get_image_features(**pixel_inputs)
        img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)
    # Exact same math as the HuggingFace pipeline: scaled cosine logits -> softmax.
    scores = (bundle["logit_scale"] * (img_feats @ bundle["text_features"].T)) \
        .softmax(dim=-1).squeeze(0)  # [N_categories]
    top = int(scores.argmax())
    return bundle["categories"][top], float(scores[top])
def get_audio_model():
    """
    Lazily load and cache the NeMo ASR model (nvidia/parakeet-tdt_ctc-110m).

    After loading, transcribes one second of silence as a warmup so the
    first real request doesn't pay the cold-start cost.
    """
    global _audio_model
    if _audio_model is not None:
        return _audio_model

    import os
    import tempfile
    import wave
    import nemo.collections.asr as nemo_asr

    print("Loading audio model (nvidia/parakeet-tdt_ctc-110m)...")
    _audio_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")
    _audio_model.eval()

    # Warmup file: 16 kHz mono, 16-bit -> 32000 zero bytes == 1 s of silence.
    # delete=False so the file can be reopened by wave/NeMo; removed in finally.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        silence_path = tmp.name
    try:
        with wave.open(silence_path, 'wb') as writer:
            writer.setnchannels(1)
            writer.setsampwidth(2)
            writer.setframerate(16000)
            writer.writeframes(b'\x00' * 32000)
        _audio_model.transcribe([silence_path])
    finally:
        os.unlink(silence_path)
    print("βœ… Audio model loaded")
    return _audio_model
def get_absa_pipeline():
    """Return the cached DeBERTa-v3 Aspect-Based Sentiment Analysis pipeline,
    loading it on first call."""
    global _absa_pipeline
    if _absa_pipeline is not None:
        return _absa_pipeline
    from transformers import pipeline
    print("Loading ABSA model...")
    _absa_pipeline = pipeline(
        "text-classification",
        model="yangheng/deberta-v3-base-absa-v1.1",
    )
    print("βœ… ABSA model loaded")
    return _absa_pipeline
def get_spacy_nlp():
    """Return the cached spaCy English pipeline (noun-chunk / aspect
    extraction), loading it on first call."""
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp
    import spacy
    print("Loading spaCy model...")
    _spacy_nlp = spacy.load("en_core_web_sm")
    print("βœ… spaCy model loaded")
    return _spacy_nlp
# Keep backward-compatible aliases that load on first attribute access
class _LazyProxy:
def __init__(self, loader):
object.__setattr__(self, '_loader', loader)
object.__setattr__(self, '_obj', None)
def _load(self):
if object.__getattribute__(self, '_obj') is None:
obj = object.__getattribute__(self, '_loader')()
object.__setattr__(self, '_obj', obj)
return object.__getattribute__(self, '_obj')
def __getattr__(self, name):
return getattr(self._load(), name)
def __call__(self, *args, **kwargs):
return self._load()(*args, **kwargs)
# Backward-compatible module-level names. Each is a proxy that triggers the
# corresponding get_* loader on first attribute access or call, so importing
# this module stays cheap.
EMBEDDER = _LazyProxy(get_embedder)
AUDIO_MODEL = _LazyProxy(get_audio_model)
ABSA_PIPELINE = _LazyProxy(get_absa_pipeline)
SPACY_NLP = _LazyProxy(get_spacy_nlp)