"""
HuggingFace Model Interface for Maya Gradio Demo
Supports multiple models and providers
"""

import os
import logging
from typing import Dict, List, Optional, Any
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
)
from peft import PeftModel
import torch
from huggingface_hub import HfApi

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ModelInterface:
    """
    Interface for managing multiple HuggingFace models
    Supports local models, HF Inference API, and custom fine-tuned models
    """
    
    def __init__(self):
        """Initialize model interface"""
        self.models = {}
        self.current_model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hf_api = HfApi()
        
        # Configure quantization for memory efficiency
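        # (4-bit NF4 stores weights in roughly a quarter of the fp16 footprint;
        # double quantization additionally compresses the quantization constants.)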
        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        ) if torch.cuda.is_available() else None
        
        logger.info(f"Model interface initialized on device: {self.device}")
        
        # Define available models (optimized for HuggingFace Spaces)
        self.available_models = {
            # Maya's fine-tuned LoRA model via inference API (requires Pro account)
            "blakeurmos/maya-7b-lora-v1": {
                "name": "Maya 7B (Fine-tuned)",
                "description": "Maya's personality fine-tuned on Mistral-7B (requires auth)",
                "size": "LoRA (~14MB + base model)",
                "type": "inference_api",
                "requires_auth": True,
                "base_model": "mistralai/Mistral-7B-Instruct-v0.3"  # Original trained base
            },
            # Backup Maya model using non-gated Mistral
            "mistralai/Mistral-7B-Instruct-v0.1": {
                "name": "Maya 7B (Mistral Base)",
                "description": "Mistral 7B with Maya personality via prompting",
                "size": "Large (~7B params)",
                "type": "inference_api",
                "requires_auth": False
            },
            # Latest Mistral instruction model
            "mistralai/Mistral-7B-Instruct-v0.3": {
                "name": "Mistral 7B Instruct v0.3",
                "description": "Mistral's latest instruction-tuned model",
                "size": "Large (~7B params)", 
                "type": "inference_api",
                "requires_auth": True
            },
            # Moonshot AI's latest model
            "moonshotai/Kimi-K2-Instruct": {
                "name": "Kimi K2 Instruct",
                "description": "Moonshot AI's latest instruction model",
                "size": "Large",
                "type": "inference_api",
                "requires_auth": True
            }
        }
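        # Note: load_model() below also supports "local", "custom", and "lora"
        # entries; additional models can be registered here with those types.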
    
    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """Get list of available models with metadata"""
        return self.available_models
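
    def _get_auth_token(self) -> Optional[str]:
        """Resolve an HF auth token from any of the commonly used env vars.

        Shared by the load paths below; returns None when no token is set.
        """
        return (
            os.getenv("HUGGINGFACE_API_TOKEN") or
            os.getenv("HF_TOKEN") or
            os.getenv("HUGGINGFACE_TOKEN") or
            os.getenv("HF_API_TOKEN")
        )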
    
    def _load_as_inference_api(self, model_id: str, use_auth_token: bool = False) -> bool:
        """Fallback loader: build a plain text-generation pipeline.

        Note that transformers.pipeline downloads and runs the model locally;
        despite the "inference_api" label, this path does not call the hosted
        Inference API.
        """
        try:
            logger.info(f"Loading {model_id} via inference API fallback")
            
            auth_token = self._get_auth_token() if use_auth_token else None
            
            pipe = pipeline(
                "text-generation",
                model=model_id,
                token=auth_token,
                device=0 if torch.cuda.is_available() else -1
            )
            
            self.models[model_id] = {
                "pipeline": pipe,
                "type": "inference_api",
                "tokenizer": None
            }
            
            return True
            
        except Exception as e:
            logger.error(f"Inference API fallback also failed for {model_id}: {e}")
            return False
    
    def load_model(self, model_id: str, use_auth_token: bool = False) -> bool:
        """
        Load a model for inference
        
        Args:
            model_id: HuggingFace model identifier
            use_auth_token: Whether to use HF auth token
            
        Returns:
            True if successful, False otherwise
        """
        try:
            if model_id in self.models:
                logger.info(f"Model {model_id} already loaded")
                self.current_model = model_id
                return True
            
            model_config = self.available_models.get(model_id, {})
            model_type = model_config.get("type", "local")
            
            if model_type == "inference_api":
                # Build a text-generation pipeline. Note: transformers.pipeline
                # downloads and runs the model locally; despite the type label,
                # this path does not call the hosted Inference API.
                logger.info(f"Setting up inference API pipeline for {model_id}")
                
                # Use auth token if available (checks several common env vars)
                auth_token = None
                if use_auth_token:
                    auth_token = self._get_auth_token()
                    if auth_token:
                        logger.info("Using HuggingFace authentication token")
                    else:
                        logger.warning("Auth requested but no HF token found in environment")
                
                pipe = pipeline(
                    "text-generation",
                    model=model_id,
                    token=auth_token,
                    device=0 if torch.cuda.is_available() else -1
                )
                
                self.models[model_id] = {
                    "pipeline": pipe,
                    "type": "inference_api",
                    "tokenizer": None
                }
                
            elif model_type in ["local", "custom"]:
                # Load model locally
                logger.info(f"Loading local model {model_id}...")
                
                # Check if model exists (especially for custom models)
                if not model_config.get("exists", True):
                    try:
                        # Verify the model exists on the HF Hub before loading
                        self.hf_api.model_info(model_id)
                        logger.info(f"Found model {model_id} on HuggingFace Hub")
                    except Exception as e:
                        logger.error(f"Model {model_id} not found: {e}")
                        return False
                
                # Load tokenizer
                auth_token = self._get_auth_token() if use_auth_token else None
                tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    token=auth_token,
                    padding_side="left"
                )
                
                # Add pad token if missing
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token
                
                # Load model with quantization if available
                load_kwargs = {
                    "token": auth_token,
                    "torch_dtype": torch.float16,
                    "device_map": "auto" if torch.cuda.is_available() else None
                }
                
                if self.quantization_config and torch.cuda.is_available():
                    load_kwargs["quantization_config"] = self.quantization_config
                
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    **load_kwargs
                )
                
                # Create pipeline. `device` is intentionally omitted here: when
                # the model was loaded with device_map="auto" (or quantized),
                # transformers refuses to move the accelerate-dispatched model
                # to a specific device.
                pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
                )
                
                self.models[model_id] = {
                    "pipeline": pipe,
                    "tokenizer": tokenizer,
                    "model": model,
                    "type": "local"
                }
            
            elif model_type == "lora":
                # Load LoRA adapter with base model
                logger.info(f"Loading LoRA model {model_id}...")
                
                base_model_id = model_config.get("base_model")
                if not base_model_id:
                    logger.error(f"No base model specified for LoRA {model_id}")
                    return False
                
                # Use auth token if available (checks several common env vars)
                auth_token = None
                if use_auth_token:
                    auth_token = self._get_auth_token()
                    if auth_token:
                        logger.info("Using HuggingFace authentication token")
                    else:
                        logger.warning("Auth requested but no HF token found in environment")
                
                # Try loading base model, with fallback for Maya LoRA
                logger.info(f"Loading base model {base_model_id}...")
                
                try:
                    base_model = AutoModelForCausalLM.from_pretrained(
                        base_model_id,
                        token=auth_token,
                        torch_dtype=torch.float16,
                        device_map="auto" if torch.cuda.is_available() else None,
                        low_cpu_mem_usage=True
                    )
                except Exception as base_error:
                    logger.warning(f"Failed to load base model {base_model_id}: {base_error}")
                    
                    # Check if there's a fallback base model
                    fallback_base = model_config.get("fallback_base")
                    if fallback_base and fallback_base != base_model_id:
                        logger.info(f"Trying fallback base model {fallback_base}...")
                        try:
                            base_model = AutoModelForCausalLM.from_pretrained(
                                fallback_base,
                                token=auth_token,
                                torch_dtype=torch.float16,
                                device_map="auto" if torch.cuda.is_available() else None,
                                low_cpu_mem_usage=True
                            )
                            logger.info(f"Successfully loaded fallback base model {fallback_base}")
                        except Exception as fallback_error:
                            logger.error(f"Fallback base model also failed: {fallback_error}")
                            # Convert to inference API mode as last resort
                            logger.info("Converting to inference API mode...")
                            return self._load_as_inference_api(model_id, use_auth_token)
                    else:
                        logger.error(f"No fallback available for base model {base_model_id}")
                        return False
                
                # Load LoRA adapter
                logger.info(f"Loading LoRA adapter {model_id}...")
                model = PeftModel.from_pretrained(base_model, model_id, token=auth_token)
                
                # Load tokenizer (from base model) with fallback
                try:
                    tokenizer = AutoTokenizer.from_pretrained(
                        base_model_id,
                        token=auth_token,
                        padding_side="left",
                        use_fast=True  # Try fast tokenizer first
                    )
                except Exception as tokenizer_error:
                    logger.warning(f"Fast tokenizer failed: {tokenizer_error}, trying slow tokenizer...")
                    try:
                        tokenizer = AutoTokenizer.from_pretrained(
                            base_model_id,
                            token=auth_token,
                            padding_side="left",
                            use_fast=False  # Fallback to slow tokenizer
                        )
                    except Exception as slow_tokenizer_error:
                        logger.error(f"Both tokenizers failed: {slow_tokenizer_error}")
                        return False
                
                # Add pad token if missing
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token
                
                # Create pipeline (again without `device`; the base model was
                # placed by device_map, see the local-loading path above)
                pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
                )
                
                self.models[model_id] = {
                    "pipeline": pipe,
                    "tokenizer": tokenizer,
                    "model": model,
                    "type": "lora",
                    "base_model": base_model_id
                }
            
            else:
                logger.error(f"Unknown model type: {model_type}")
                return False
            
            self.current_model = model_id
            logger.info(f"Successfully loaded model: {model_id}")
            return True
            
        except Exception as e:
            logger.error(f"Failed to load model {model_id}: {e}")
            return False
    
    def generate_response(
        self,
        prompt: str,
        max_length: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        do_sample: bool = True,
        model_id: Optional[str] = None
    ) -> str:
        """
        Generate response using current or specified model
        
        Args:
            prompt: Input prompt
            max_length: Maximum response length
            temperature: Sampling temperature
            top_p: Top-p sampling
            do_sample: Whether to use sampling
            model_id: Specific model to use (optional)
            
        Returns:
            Generated response text
        """
        try:
            # Use specified model or current model
            target_model = model_id or self.current_model
            
            if not target_model or target_model not in self.models:
                return "Error: No model loaded. Please select and load a model first."
            
            model_data = self.models[target_model]
            pipeline_obj = model_data["pipeline"]
            
            # Generate response
            logger.info(f"Generating response with {target_model}")
            
            # Prepare generation parameters. `max_new_tokens` bounds only the
            # generated continuation; with plain `max_length` a long prompt
            # would count against the budget and could truncate the response.
            generation_kwargs = {
                "max_new_tokens": max_length,
                "temperature": temperature,
                "top_p": top_p,
                "do_sample": do_sample,
                "pad_token_id": pipeline_obj.tokenizer.eos_token_id,
                "eos_token_id": pipeline_obj.tokenizer.eos_token_id,
                "return_full_text": False  # Only return generated text
            }
            
            # For local and LoRA models, we might need to format the prompt differently
            if model_data["type"] in ["local", "lora"]:
                # Some models work better with specific formatting
                if "llama" in target_model.lower():
                    formatted_prompt = f"<s>[INST] {prompt} [/INST]"
                elif "mistral" in target_model.lower() or model_data["type"] == "lora":
                    # For LoRA models (especially Maya), use Mistral format since base is Mistral
                    formatted_prompt = f"<s>[INST] {prompt} [/INST]"
                else:
                    formatted_prompt = prompt
            elif target_model in ["blakeurmos/maya-7b-lora-v1", "mistralai/Mistral-7B-Instruct-v0.1"]:
                # Maya models always need Mistral format (even via inference API)
                formatted_prompt = f"<s>[INST] {prompt} [/INST]"
            else:
                formatted_prompt = prompt
            
            # Generate
            results = pipeline_obj(formatted_prompt, **generation_kwargs)
            
            if isinstance(results, list) and len(results) > 0:
                response = results[0].get("generated_text", "")
            else:
                response = str(results)
            
            # Clean up response
            response = response.strip()
            
            # Remove the original prompt if it was included
            if response.startswith(formatted_prompt):
                response = response[len(formatted_prompt):].strip()
            
            logger.info(f"Generated response length: {len(response)}")
            return response
            
        except Exception as e:
            logger.error(f"Failed to generate response: {e}")
            return f"Error generating response: {str(e)}"
    
    def unload_model(self, model_id: Optional[str] = None):
        """Unload a specific model or current model"""
        target_model = model_id or self.current_model
        
        if target_model and target_model in self.models:
            del self.models[target_model]
            if self.current_model == target_model:
                self.current_model = None
            logger.info(f"Unloaded model: {target_model}")
            
            # Clear GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    
    def get_model_info(self, model_id: Optional[str] = None) -> Dict[str, Any]:
        """Get information about current or specified model"""
        target_model = model_id or self.current_model
        
        if not target_model:
            return {"error": "No model specified or loaded"}
        
        model_config = self.available_models.get(target_model, {})
        is_loaded = target_model in self.models
        
        info = {
            "model_id": target_model,
            "name": model_config.get("name", target_model),
            "description": model_config.get("description", ""),
            "size": model_config.get("size", "Unknown"),
            "type": model_config.get("type", "unknown"),
            "is_loaded": is_loaded,
            "is_current": target_model == self.current_model
        }
        
        if is_loaded:
            model_data = self.models[target_model]
            info["device"] = str(next(model_data["pipeline"].model.parameters()).device) if hasattr(model_data["pipeline"], "model") else "unknown"
        
        return info
    
    def list_loaded_models(self) -> List[str]:
        """Get list of currently loaded models"""
        return list(self.models.keys())
    
    def get_memory_usage(self) -> Dict[str, Any]:
        """Get current memory usage information"""
        info = {
            "device": self.device,
            "loaded_models": len(self.models),
            "current_model": self.current_model
        }
        
        if torch.cuda.is_available():
            info["cuda_memory_allocated"] = f"{torch.cuda.memory_allocated() / 1024**3:.2f} GB"
            info["cuda_memory_reserved"] = f"{torch.cuda.memory_reserved() / 1024**3:.2f} GB"
        
        return info
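

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the class API):
    # load the non-gated Mistral entry from available_models and generate
    # one reply. Assumes network access to the HF Hub and enough memory for
    # a ~7B model; substitute a smaller model id on constrained hardware.
    interface = ModelInterface()
    print(interface.get_memory_usage())

    demo_model = "mistralai/Mistral-7B-Instruct-v0.1"
    if interface.load_model(demo_model, use_auth_token=False):
        reply = interface.generate_response(
            "Introduce yourself in one sentence.",
            max_length=128,
            temperature=0.7,
            model_id=demo_model,
        )
        print(reply)
        interface.unload_model(demo_model)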