| """ | |
| Model Manager for handling local model loading and inference. | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| from typing import Dict, Any, Optional, List, Union | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| AutoModel, | |
| pipeline, | |
| BitsAndBytesConfig | |
| ) | |
| from sentence_transformers import SentenceTransformer | |

# Add parent directories to the import path so the config package
# resolves regardless of the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.dirname(current_dir)
root_dir = os.path.dirname(src_dir)
for path in [root_dir, src_dir]:
    if path not in sys.path:
        sys.path.insert(0, path)

try:
    from config.model_config import get_model_path, get_model_config, ModelConfig
except ImportError:
    # Fallback import paths
    try:
        from src.config.model_config import get_model_path, get_model_config, ModelConfig
    except ImportError:
        # Define a minimal fallback config
        from dataclasses import dataclass
        from typing import Literal

        ModelType = Literal["text-generation", "text-embedding", "vision", "multimodal"]
        DeviceType = Literal["auto", "cpu", "cuda"]

        @dataclass  # required so ModelConfig(...) below accepts keyword arguments
        class ModelConfig:
            model_id: str
            model_path: str
            model_type: ModelType
            device: DeviceType = "auto"
            quantize: bool = True
            use_safetensors: bool = True
            trust_remote_code: bool = True
            description: str = ""
            size_gb: float = 0.0
            recommended: bool = False

        # Fallback model configurations
        DEFAULT_MODELS = {
            "tiny-llama": ModelConfig(
                model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                model_path="./models/tiny-llama-1.1b-chat",
                model_type="text-generation",
                quantize=False,  # Disable quantization for HF Spaces
                description="Very small and fast model, good for quick testing",
                size_gb=1.1,
                recommended=True
            ),
            "mistral-7b": ModelConfig(
                model_id="microsoft/DialoGPT-small",  # Use a smaller model for HF Spaces
                model_path="./models/dialogpt-small",
                model_type="text-generation",
                quantize=False,
                description="Small conversational model",
                size_gb=0.5,
                recommended=True
            )
        }

        def get_model_config(model_name: str) -> Optional[ModelConfig]:
            return DEFAULT_MODELS.get(model_name)

        def get_model_path(model_name: str) -> str:
            config = get_model_config(model_name)
            if not config:
                raise ValueError(f"Unknown model: {model_name}")
            # For HF Spaces, use the model_id directly
            return config.model_id


class ModelManager:
    """Manages loading and using local models."""

    def __init__(self, device: Optional[str] = None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.models: Dict[str, Any] = {}
        self.tokenizers: Dict[str, Any] = {}
        self.pipelines: Dict[str, Any] = {}

    def load_model(self, model_name: str, **kwargs) -> Any:
        """Load a model by name, caching it for reuse."""
        if model_name in self.models:
            return self.models[model_name]

        config = get_model_config(model_name)
        if not config:
            raise ValueError(f"Unknown model: {model_name}")

        # Dispatch on model type; each loader resolves its own path.
        if config.model_type == "text-generation":
            model = self._load_text_generation_model(model_name, config, **kwargs)
        elif config.model_type == "text-embedding":
            model = self._load_embedding_model(model_name, config, **kwargs)
        else:
            raise ValueError(f"Unsupported model type: {config.model_type}")

        self.models[model_name] = model
        return model

    def _load_text_generation_model(self, model_name: str, config: ModelConfig, **kwargs):
        """Load a text generation model."""
        model_path = get_model_path(model_name)
        try:
            # Try quantized loading if the checkpoint is GPTQ-quantized
            if config.quantize and "gptq" in config.model_id.lower():
                try:
                    from auto_gptq import AutoGPTQForCausalLM
                    model = AutoGPTQForCausalLM.from_quantized(
                        model_path,
                        device_map="auto",
                        trust_remote_code=config.trust_remote_code,
                        use_safetensors=config.use_safetensors,
                        **kwargs
                    )
                except ImportError:
                    # Fall back to regular loading if auto_gptq is not available
                    model = AutoModelForCausalLM.from_pretrained(
                        model_path,
                        device_map="auto" if torch.cuda.is_available() else "cpu",
                        trust_remote_code=config.trust_remote_code,
                        **kwargs
                    )
            else:
                # Load the full-precision model (fp16 on GPU, fp32 on CPU)
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="auto" if torch.cuda.is_available() else "cpu",
                    trust_remote_code=config.trust_remote_code,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    **kwargs
                )
        except Exception as e:
            print(f"Error loading model {model_name}: {e}")
            # Last-resort fallback: CPU loading
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="cpu",
                trust_remote_code=config.trust_remote_code,
                torch_dtype=torch.float32,
                **kwargs
            )

        # Load the tokenizer
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=config.trust_remote_code
            )
        except Exception as e:
            print(f"Error loading tokenizer for {model_name}: {e}")
            # Use a basic tokenizer as a fallback
            tokenizer = AutoTokenizer.from_pretrained("gpt2")

        self.tokenizers[model_name] = tokenizer
        return model

    def _load_embedding_model(self, model_name: str, config: ModelConfig, **kwargs):
        """Load a text embedding model."""
        model_path = get_model_path(model_name)
        # Use sentence-transformers for embedding models
        model = SentenceTransformer(
            model_path,
            device=self.device,
            **kwargs
        )
        return model

    def generate_text(
        self,
        model_name: str,
        prompt: str,
        max_length: int = 512,
        temperature: float = 0.7,
        **generation_kwargs
    ) -> str:
        """Generate text using the specified model.

        Note: the returned string includes the prompt, and ``max_length``
        counts prompt tokens as well as newly generated ones.
        """
        if model_name not in self.models:
            self.load_model(model_name)

        model = self.models[model_name]
        tokenizer = self.tokenizers[model_name]

        # Encode the input and move it to wherever the model actually landed
        # (device_map / the CPU fallback may differ from self.device).
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate text
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                **generation_kwargs
            )

        # Decode and return the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text

    def get_embeddings(
        self,
        model_name: str,
        texts: Union[str, List[str]],
        batch_size: int = 32,
        **kwargs
    ) -> torch.Tensor:
        """Get embeddings for the input texts."""
        if model_name not in self.models:
            self.load_model(model_name)

        model = self.models[model_name]

        # sentence-transformers expects a list; wrap a single string
        if isinstance(texts, str):
            texts = [texts]

        # Get embeddings using sentence-transformers
        embeddings = model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=len(texts) > 1,
            convert_to_tensor=True,
            **kwargs
        )
        return embeddings


# Global model manager instance
model_manager = ModelManager()
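

# --- Usage sketch ---
# A minimal smoke test for the manager above. It assumes the "tiny-llama"
# entry (from the fallback DEFAULT_MODELS here, or an equivalent entry in
# config.model_config) resolves to a checkpoint that can be downloaded and
# loaded in the current environment; the prompt and parameter values below
# are illustrative, not part of any fixed API.
if __name__ == "__main__":
    manager = ModelManager()
    completion = manager.generate_text(
        "tiny-llama",
        prompt="Briefly explain what a tokenizer does.",
        max_length=128,   # counts prompt tokens too; kept small for a quick test
        temperature=0.7,
    )
    print(completion)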