| | import os |
| |
|
| | |
# Lazily-initialized process-wide singletons. Each starts as None and is
# populated exactly once by its matching get_*() accessor below, then reused
# for the life of the process so heavyweight models load only once.
_embedder = None          # sentence-embedding model (see get_embedder)
_image_classifier = None  # dict bundling CLIP model + cached text features (see get_image_classifier)
_audio_model = None       # NeMo ASR model (see get_audio_model)
_absa_pipeline = None     # aspect-based sentiment pipeline (see get_absa_pipeline)
_spacy_nlp = None         # spaCy English pipeline (see get_spacy_nlp)
| |
|
| |
|
def get_embedder():
    """Return the shared sentence-embedding model, loading it on first call.

    Wraps the multilingual MiniLM sentence-transformer in LangChain's
    HuggingFaceEmbeddings. The instance is cached in the module-level
    ``_embedder`` singleton so the (slow) model load happens once per process.
    """
    global _embedder
    if _embedder is None:
        # Deferred import: keeps importing this module cheap when the
        # embedder is never used.
        from langchain_community.embeddings import HuggingFaceEmbeddings
        print("Loading embedder model...")
        _embedder = HuggingFaceEmbeddings(
            model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
        )
        # NOTE: original source had a mojibake-mangled emoji here; restored.
        print("✅ Embedder loaded")
    return _embedder
| |
|
| |
|
def get_image_classifier():
    """
    Loads CLIP and pre-computes text embeddings for all categories once at startup.

    Per-request cost = image encoder only (no text encoding).
    Scoring replicates the pipeline exactly:
        logits = logit_scale * image_features @ text_features.T
        probs = softmax(logits)

    Returns:
        dict with keys "model", "processor", "text_features" (L2-normalized,
        one row per category), "logit_scale" (scalar tensor), "categories".
    """
    global _image_classifier
    if _image_classifier is None:
        import torch
        from transformers import CLIPModel, CLIPProcessor
        from utils import load_categories

        print("Loading image model...")
        model_name = "openai/clip-vit-large-patch14"
        processor = CLIPProcessor.from_pretrained(model_name)
        model = CLIPModel.from_pretrained(model_name)
        model.eval()

        # Dynamic int8 quantization of the Linear layers: smaller footprint
        # and faster CPU inference at a small accuracy cost.
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )

        # Encode every category label once; per-request work is then just
        # the image encoder plus one matrix multiply in classify_image().
        categories = load_categories()
        text_inputs = processor(
            text=categories, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            text_features = model.get_text_features(**text_inputs)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            # FIX: compute logit_scale under no_grad too — previously the
            # cached tensor carried autograd history for the process lifetime.
            logit_scale = model.logit_scale.exp()

        _image_classifier = {
            "model": model,
            "processor": processor,
            "text_features": text_features,
            "logit_scale": logit_scale,
            "categories": categories,
        }
        print(f"✅ Image model loaded — {len(categories)} category embeddings cached")
    return _image_classifier
| |
|
| |
|
def classify_image(img):
    """
    Classify a PIL image against the cached category text embeddings.

    Only the image encoder runs per request — the text side was pre-computed
    by get_image_classifier(). Returns (predicted_category, confidence_score).
    """
    import torch
    clf = get_image_classifier()
    inputs = clf["processor"](images=img, return_tensors="pt")
    with torch.no_grad():
        feats = clf["model"].get_image_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)

    # Same scoring as the HF zero-shot pipeline: scaled cosine sims -> softmax.
    logits = clf["logit_scale"] * (feats @ clf["text_features"].T)
    scores = logits.softmax(dim=-1).squeeze(0)
    winner = int(scores.argmax())
    return clf["categories"][winner], float(scores[winner])
| |
|
| |
|
def get_audio_model():
    """
    Loads nvidia/parakeet-tdt_ctc-110m via NeMo ASR.

    Runs a silent warmup pass so the first real request isn't slow; the
    loaded model is cached in the module-level ``_audio_model`` singleton.
    """
    global _audio_model
    if _audio_model is None:
        # FIX: dropped redundant local `import os` — os is already imported
        # at module level.
        import wave
        import tempfile
        import nemo.collections.asr as nemo_asr

        print("Loading audio model (nvidia/parakeet-tdt_ctc-110m)...")
        _audio_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/parakeet-tdt_ctc-110m")
        _audio_model.eval()

        # Warmup: transcribe 1 second of 16 kHz 16-bit mono silence so any
        # one-time graph/setup cost is paid now, not on the first request.
        # delete=False because the path is reopened by name below
        # (required on Windows, where an open NamedTemporaryFile is locked).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            warmup_path = f.name
        try:
            with wave.open(warmup_path, 'wb') as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)      # 16-bit samples
                wf.setframerate(16000)
                wf.writeframes(b'\x00' * 32000)  # 16000 frames * 2 bytes = 1 s
            _audio_model.transcribe([warmup_path])
        finally:
            # Always remove the temp file, even if warmup transcription fails.
            os.unlink(warmup_path)
        print("✅ Audio model loaded")
    return _audio_model
| |
|
| |
|
def get_absa_pipeline():
    """DeBERTa-v3 fine-tuned for Aspect-Based Sentiment Analysis."""
    global _absa_pipeline
    if _absa_pipeline is not None:
        return _absa_pipeline

    # First use: build the pipeline once and cache it module-wide.
    from transformers import pipeline
    print("Loading ABSA model...")
    _absa_pipeline = pipeline(
        "text-classification",
        model="yangheng/deberta-v3-base-absa-v1.1",
    )
    print("✅ ABSA model loaded")
    return _absa_pipeline
| |
|
| |
|
def get_spacy_nlp():
    """spaCy English model for noun-chunk (aspect) extraction."""
    global _spacy_nlp
    if _spacy_nlp is not None:
        return _spacy_nlp

    # First use: load the small English pipeline once and cache it.
    import spacy
    print("Loading spaCy model...")
    _spacy_nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded")
    return _spacy_nlp
| |
|
| |
|
| | |
| | class _LazyProxy: |
| | def __init__(self, loader): |
| | object.__setattr__(self, '_loader', loader) |
| | object.__setattr__(self, '_obj', None) |
| |
|
| | def _load(self): |
| | if object.__getattribute__(self, '_obj') is None: |
| | obj = object.__getattribute__(self, '_loader')() |
| | object.__setattr__(self, '_obj', obj) |
| | return object.__getattribute__(self, '_obj') |
| |
|
| | def __getattr__(self, name): |
| | return getattr(self._load(), name) |
| |
|
| | def __call__(self, *args, **kwargs): |
| | return self._load()(*args, **kwargs) |
| |
|
| |
|
# Module-level lazy handles: importing this module stays cheap, and each
# model loads only on first attribute access or call of its proxy.
# (The image classifier is consumed via get_image_classifier()/classify_image()
# directly, so it gets no proxy here.)
EMBEDDER = _LazyProxy(get_embedder)
AUDIO_MODEL = _LazyProxy(get_audio_model)
ABSA_PIPELINE = _LazyProxy(get_absa_pipeline)
SPACY_NLP = _LazyProxy(get_spacy_nlp)