# TerraSyncra/app/utils/model_manager.py
"""
Lazy Model Manager for CPU Optimization
Loads models on-demand instead of at import time.
"""
import os
import logging
import torch
from typing import Optional
from functools import lru_cache
logging.basicConfig(level=logging.INFO)
# Global model cache
_models = {
"expert_model": None,
"expert_tokenizer": None,
"translation_model": None,
"translation_tokenizer": None,
"embedder": None,
"lang_identifier": None,
"classifier": None,
}
_device = "cpu" # Force CPU for HuggingFace Spaces
def get_device():
"""Always return CPU for HuggingFace Spaces."""
return _device
def load_expert_model(model_name: str, use_quantization: bool = True):
"""
Lazy load expert model with optional quantization.
Args:
model_name: Model identifier
use_quantization: Use INT8 quantization for CPU (recommended)
"""
if _models["expert_model"] is not None:
return _models["expert_tokenizer"], _models["expert_model"]
from transformers import AutoTokenizer, AutoModelForCausalLM
from app.utils import config
logging.info(f"Loading expert model ({model_name})...")
# Get cache directory from config
cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
tokenizer = AutoTokenizer.from_pretrained(
model_name,
use_fast=True, # Use fast tokenizer
cache_dir=cache_dir
)
# Load model with CPU optimizations
model_kwargs = {
"torch_dtype": torch.float32, # Use float32 for CPU
"device_map": "cpu",
"low_cpu_mem_usage": True,
}
    # Note: float32 is the most compatible dtype on CPU.
    # BitsAndBytesConfig (INT8/INT4) is GPU-only, so it is skipped for this CPU
    # deployment; for a smaller footprint, consider a smaller model, ONNX Runtime,
    # or the optional dynamic-quantization sketch below (quantize_expert_model_dynamic).
logging.info("Loading model in float32 for CPU compatibility")
model = AutoModelForCausalLM.from_pretrained(
model_name,
cache_dir=cache_dir,
**model_kwargs
)
model.eval() # Set to evaluation mode
_models["expert_model"] = model
_models["expert_tokenizer"] = tokenizer
logging.info("Expert model loaded successfully")
return tokenizer, model
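# A minimal sketch of CPU-side INT8 quantization, assuming PyTorch dynamic
# quantization is acceptable for this model. It is not wired into
# load_expert_model(); callers may apply it to the returned model once the
# accuracy trade-off has been validated for their workload.
def quantize_expert_model_dynamic(model):
    """Return the model with Linear layers dynamically quantized to INT8 (CPU only)."""
    import torch.nn as nn
    quantized = torch.quantization.quantize_dynamic(
        model,             # the float32 model returned by load_expert_model()
        {nn.Linear},       # quantize only the Linear layers
        dtype=torch.qint8  # 8-bit dynamic quantization
    )
    quantized.eval()
    return quantized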
def load_translation_model(model_name: str):
"""Lazy load translation model."""
if _models["translation_model"] is not None:
return _models["translation_tokenizer"], _models["translation_model"]
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
from app.utils import config
logging.info(f"Loading translation model ({model_name})...")
cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
tokenizer = NllbTokenizer.from_pretrained(
model_name,
cache_dir=cache_dir
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_name,
torch_dtype=torch.float32, # CPU uses float32
cache_dir=cache_dir,
device_map="cpu",
low_cpu_mem_usage=True
)
model.eval()
_models["translation_model"] = model
_models["translation_tokenizer"] = tokenizer
logging.info("Translation model loaded successfully")
return tokenizer, model
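# Hedged usage sketch for the NLLB translation model loaded above. The FLORES-200
# language codes ("eng_Latn", "fra_Latn") and max_length are illustrative
# assumptions, not values taken from the application config.
def translate_example(text: str, src_lang: str = "eng_Latn", tgt_lang: str = "fra_Latn") -> str:
    """Translate one string; assumes load_translation_model() has already been called."""
    tokenizer = _models["translation_tokenizer"]
    model = _models["translation_model"]
    tokenizer.src_lang = src_lang  # source language for NLLB tokenization
    inputs = tokenizer(text, return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),  # force target language
        max_length=256,
    )
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]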
def load_embedder(model_name: str):
"""Lazy load sentence transformer embedder."""
if _models["embedder"] is not None:
return _models["embedder"]
from sentence_transformers import SentenceTransformer
from app.utils import config
logging.info(f"Loading embedder ({model_name})...")
cache_folder = getattr(config, 'hf_cache', '/models/huggingface')
embedder = SentenceTransformer(
model_name,
device=_device,
cache_folder=cache_folder
)
_models["embedder"] = embedder
logging.info("Embedder loaded successfully")
return embedder
def load_lang_identifier(repo_id: str, filename: str = "model.bin"):
"""Lazy load FastText language identifier."""
if _models["lang_identifier"] is not None:
return _models["lang_identifier"]
import fasttext
from huggingface_hub import hf_hub_download
from app.utils import config
logging.info(f"Loading language identifier ({repo_id})...")
cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
lang_model_path = hf_hub_download(
repo_id=repo_id,
filename=filename,
cache_dir=cache_dir
)
lang_identifier = fasttext.load_model(lang_model_path)
_models["lang_identifier"] = lang_identifier
logging.info("Language identifier loaded successfully")
return lang_identifier
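# Hedged sketch of how the FastText identifier is typically queried; the
# "__label__" prefix and exact label format depend on which model was downloaded,
# so the parsing below is an assumption.
def identify_language_example(text: str) -> str:
    """Return the top predicted language label; assumes load_lang_identifier() has been called."""
    labels, _scores = _models["lang_identifier"].predict(text.replace("\n", " "), k=1)
    return labels[0].replace("__label__", "")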
def load_classifier(classifier_path: str):
"""Lazy load intent classifier."""
if _models["classifier"] is not None:
return _models["classifier"]
import joblib
from pathlib import Path
logging.info(f"Loading classifier ({classifier_path})...")
if not Path(classifier_path).exists():
logging.warning(f"Classifier not found at {classifier_path}")
return None
try:
classifier = joblib.load(classifier_path)
_models["classifier"] = classifier
logging.info("Classifier loaded successfully")
return classifier
except Exception as e:
logging.error(f"Failed to load classifier: {e}")
return None
def clear_model_cache():
"""Clear all loaded models from memory."""
    # Reassign values in place (no del) so the dict is not mutated during iteration;
    # dropping the reference is enough for gc to reclaim the model.
    for key in _models:
        _models[key] = None
import gc
gc.collect()
logging.info("Model cache cleared")
def get_model_memory_usage():
"""Get approximate memory usage of loaded models."""
usage = {}
if _models["expert_model"] is not None:
        # Rough float32 estimate: ~4 bytes per parameter (Qwen 1.8B ≈ 7 GB)
        usage["expert_model"] = "~7 GB"
if _models["translation_model"] is not None:
usage["translation_model"] = "~2-5 GB"
if _models["embedder"] is not None:
usage["embedder"] = "~1 GB"
if _models["lang_identifier"] is not None:
usage["lang_identifier"] = "~200 MB"
return usage
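# Hedged smoke-test sketch for the lazy-loading flow. The embedding model id is an
# illustrative placeholder, not the value from app.utils.config; run it with
# `python -m app.utils.model_manager` from the project root so the
# `from app.utils import config` imports inside the loaders resolve.
if __name__ == "__main__":
    embedder = load_embedder("sentence-transformers/all-MiniLM-L6-v2")  # placeholder model id
    print(embedder.encode(["lazy loading keeps startup fast"]).shape)   # (1, embedding_dim)
    print(get_model_memory_usage())  # reports only the models loaded so far
    clear_model_cache()              # release everything before exit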