| | import os |
| |
|
| | |
# Lazily-initialized process-wide singletons. Each starts as None and is
# populated exactly once by its matching get_*() accessor below, then reused
# for the life of the process so heavyweight models load only once.
_embedder = None          # sentence-embedding model (see get_embedder)
_image_classifier = None  # dict bundling CLIP model + cached text features (see get_image_classifier)
_audio_model = None       # NeMo ASR model (see get_audio_model)
_absa_pipeline = None     # aspect-based sentiment pipeline (see get_absa_pipeline)
_spacy_nlp = None         # spaCy English pipeline (see get_spacy_nlp)
| |
|
| |
|
def get_embedder():
    """Return the shared sentence-embedding model, loading it on first call.

    Wraps the multilingual MiniLM sentence-transformer in LangChain's
    HuggingFaceEmbeddings. The instance is cached in the module-level
    ``_embedder`` singleton so the (slow) model load happens once per process.
    """
    global _embedder
    if _embedder is None:
        # Deferred import: keeps importing this module cheap when the
        # embedder is never used.
        from langchain_community.embeddings import HuggingFaceEmbeddings
        print("Loading embedder model...")
        _embedder = HuggingFaceEmbeddings(
            model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
        )
        # NOTE: original source had a mojibake-mangled emoji here; restored.
        print("✅ Embedder loaded")
    return _embedder
| |
|
| |
|
def get_image_classifier():
    """
    Loads CLIP and pre-computes text embeddings for all categories once at startup.

    Per-request cost = image encoder only (no text encoding).
    Scoring replicates the pipeline exactly:
        logits = logit_scale * image_features @ text_features.T
        probs = softmax(logits)

    Returns:
        dict with keys "model", "processor", "text_features" (L2-normalized,
        one row per category), "logit_scale" (scalar tensor), "categories".
    """
    global _image_classifier
    if _image_classifier is None:
        import torch
        from transformers import CLIPModel, CLIPProcessor
        from utils import load_categories

        print("Loading image model...")
        model_name = "openai/clip-vit-large-patch14"
        processor = CLIPProcessor.from_pretrained(model_name)
        model = CLIPModel.from_pretrained(model_name)
        model.eval()

        # Dynamic int8 quantization of the Linear layers: smaller footprint
        # and faster CPU inference at a small accuracy cost.
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )

        # Encode every category label once; per-request work is then just
        # the image encoder plus one matrix multiply in classify_image().
        categories = load_categories()
        text_inputs = processor(
            text=categories, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            text_features = model.get_text_features(**text_inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            # FIX: compute logit_scale under no_grad too — previously the
            # cached tensor carried autograd history for the process lifetime.
            logit_scale = model.logit_scale.exp()

        _image_classifier = {
            "model": model,
            "processor": processor,
            "text_features": text_features,
            "logit_scale": logit_scale,
            "categories": categories,
        }
        print(f"✅ Image model loaded — {len(categories)} category embeddings cached")
    return _image_classifier
| |
|
| |
|
def classify_image(img):
    """
    Classify a PIL image against the cached category text embeddings.

    Only the image encoder runs per request — the text side was pre-computed
    by get_image_classifier(). Returns (predicted_category, confidence_score).
    """
    import torch
    clf = get_image_classifier()
    inputs = clf["processor"](images=img, return_tensors="pt")
    with torch.no_grad():
        feats = clf["model"].get_image_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)

    # Same scoring as the HF zero-shot pipeline: scaled cosine sims -> softmax.
    logits = clf["logit_scale"] * (feats @ clf["text_features"].T)
    scores = logits.softmax(dim=-1).squeeze(0)
    winner = int(scores.argmax())
    return clf["categories"][winner], float(scores[winner])
| |
|
| |
|
def get_audio_model():
    """
    Loads nvidia/parakeet-tdt_ctc-110m via NeMo ASR.

    Runs a silent warmup pass so the first real request isn't slow; the
    loaded model is cached in the module-level ``_audio_model`` singleton.
    """
    global _audio_model
    if _audio_model is None:
        # FIX: dropped redundant local `import os` — os is already imported
        # at module level.
        import wave
        import tempfile
        import nemo.collections.asr as nemo_asr

        print("Loading audio model (nvidia/parakeet-tdt_ctc-110m)...")
        _audio_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")
        _audio_model.eval()

        # Warmup: transcribe 1 second of 16 kHz 16-bit mono silence so any
        # one-time graph/setup cost is paid now, not on the first request.
        # delete=False because the path is reopened by name below
        # (required on Windows, where an open NamedTemporaryFile is locked).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            warmup_path = f.name
        try:
            with wave.open(warmup_path, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)      # 16-bit samples
                wf.setframerate(16000)
                wf.writeframes(b'\x00' * 32000)  # 16000 frames * 2 bytes = 1 s
            _audio_model.transcribe([warmup_path])
        finally:
            # Always remove the temp file, even if warmup transcription fails.
            os.unlink(warmup_path)
        print("✅ Audio model loaded")
    return _audio_model
| |
|
| |
|
def get_absa_pipeline():
    """DeBERTa-v3 fine-tuned for Aspect-Based Sentiment Analysis."""
    global _absa_pipeline
    if _absa_pipeline is not None:
        return _absa_pipeline

    # First use: build the pipeline once and cache it module-wide.
    from transformers import pipeline
    print("Loading ABSA model...")
    _absa_pipeline = pipeline(
        "text-classification",
        model="yangheng/deberta-v3-base-absa-v1.1",
    )
    print("✅ ABSA model loaded")
    return _absa_pipeline
| |
|
| |
|
def get_spacy_nlp():
    """spaCy English model for noun-chunk (aspect) extraction."""
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp

    # First use: load the small English pipeline once and cache it.
    import spacy
    print("Loading spaCy model...")
    _spacy_nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded")
    return _spacy_nlp
| |
|
| |
|
| | |
| | class _LazyProxy: |
| | def __init__(self, loader): |
| | object.__setattr__(self, '_loader', loader) |
| | object.__setattr__(self, '_obj', None) |
| |
|
| | def _load(self): |
| | if object.__getattribute__(self, '_obj') is None: |
| | obj = object.__getattribute__(self, '_loader')() |
| | object.__setattr__(self, '_obj', obj) |
| | return object.__getattribute__(self, '_obj') |
| |
|
| | def __getattr__(self, name): |
| | return getattr(self._load(), name) |
| |
|
| | def __call__(self, *args, **kwargs): |
| | return self._load()(*args, **kwargs) |
| |
|
| |
|
# Module-level lazy handles: importing this module stays cheap, and each
# model loads only on first attribute access or call of its proxy.
# (The image classifier is consumed via get_image_classifier()/classify_image()
# directly, so it gets no proxy here.)
EMBEDDER = _LazyProxy(get_embedder)
AUDIO_MODEL = _LazyProxy(get_audio_model)
ABSA_PIPELINE = _LazyProxy(get_absa_pipeline)
SPACY_NLP = _LazyProxy(get_spacy_nlp)