""" HuggingFace ZeroGPU-optimized client for Felix Framework on HF Spaces. This module provides advanced HuggingFace integration optimized for ZeroGPU acceleration, HF Pro account features, and HF Spaces deployment while maintaining full API compatibility with LMStudioClient. ZeroGPU Features: - Dynamic GPU allocation with @spaces.GPU decorator support - GPU memory management and automatic cleanup - Batch processing for multiple agents with GPU acceleration - Model loading with torch.cuda optimization - Efficient device allocation and deallocation HF Pro Account Features: - Higher rate limits and premium model access - Priority inference queue for Pro accounts - Advanced model configurations and fine-tuning support - Extended quota management Agent-Model Mapping (ZeroGPU Optimized): - ResearchAgent: Fast 7B models (e.g., microsoft/DialoGPT-large, Qwen/Qwen2.5-7B-Instruct) - AnalysisAgent: Reasoning 13B models (e.g., microsoft/DialoGPT-large, meta-llama/Llama-3.1-8B-Instruct) - SynthesisAgent: High-quality 13B models (e.g., meta-llama/Llama-3.1-13B-Instruct) - CriticAgent: Specialized validation models (e.g., microsoft/DialoGPT-medium) LMStudioClient Compatibility: - Drop-in replacement maintaining identical API - Same method signatures and response objects - Existing Felix agent system integration preserved """ import asyncio import logging import time import os import gc from typing import Dict, List, Optional, Any, Union from dataclasses import dataclass from enum import Enum import aiohttp import json from datetime import datetime, timedelta from collections import deque # ZeroGPU and HF integration imports try: import spaces ZEROGPU_AVAILABLE = True except ImportError: ZEROGPU_AVAILABLE = False # Mock decorator for non-ZeroGPU environments class MockSpaces: @staticmethod def GPU(fn): return fn spaces = MockSpaces() try: import torch TORCH_AVAILABLE = True except ImportError: TORCH_AVAILABLE = False from huggingface_hub import HfApi, InferenceClient try: from 
transformers import AutoTokenizer, AutoModelForCausalLM, pipeline TRANSFORMERS_AVAILABLE = True except ImportError: TRANSFORMERS_AVAILABLE = False from .token_budget import TokenBudgetManager, TokenAllocation from .lm_studio_client import RequestPriority, LLMResponse logger = logging.getLogger(__name__) class ModelType(Enum): """Model specialization types for different agent functions.""" RESEARCH = "research" ANALYSIS = "analysis" SYNTHESIS = "synthesis" CRITIC = "critic" GENERAL = "general" class GPUMemoryError(Exception): """Raised when GPU memory allocation fails.""" pass class ZeroGPUError(Exception): """Raised when ZeroGPU operations fail.""" pass class HuggingFaceConnectionError(Exception): """Raised when cannot connect to HuggingFace services.""" pass @dataclass class HFModelConfig: """Configuration for a HuggingFace model with ZeroGPU optimization.""" model_id: str max_tokens: int = 512 temperature: float = 0.7 top_p: float = 0.9 repetition_penalty: float = 1.1 use_cache: bool = True wait_for_model: bool = True # ZeroGPU specific settings use_zerogpu: bool = True gpu_memory_limit: Optional[float] = None # GB, None for auto torch_dtype: str = "float16" # torch dtype for GPU efficiency device_map: str = "auto" batch_size: int = 1 # HF Pro settings priority: str = "normal" # normal, high for Pro accounts use_inference_api: bool = True # Fallback to Inference API local_model_path: Optional[str] = None @dataclass class HFResponse: """Response from HuggingFace inference API with GPU metrics.""" content: str model_used: str tokens_used: int response_time: float success: bool error: Optional[str] = None metadata: Optional[Dict[str, Any]] = None # ZeroGPU specific metrics gpu_memory_used: Optional[float] = None gpu_time: Optional[float] = None batch_processed: Optional[int] = None fallback_used: bool = False class HuggingFaceClient: """ HuggingFace Inference API client for Felix Framework. 
Provides model inference capabilities with token budget management, rate limiting, and agent specialization support. """ # ZeroGPU optimized models for different agent types DEFAULT_MODELS = { ModelType.RESEARCH: HFModelConfig( model_id="microsoft/DialoGPT-large", # Upgraded for ZeroGPU temperature=0.9, max_tokens=384, use_zerogpu=True, batch_size=2, # Can process multiple research queries torch_dtype="float16" ), ModelType.ANALYSIS: HFModelConfig( model_id="meta-llama/Llama-3.1-8B-Instruct", # Better reasoning temperature=0.5, max_tokens=512, use_zerogpu=True, batch_size=1, torch_dtype="float16", priority="high" # Pro account priority for analysis ), ModelType.SYNTHESIS: HFModelConfig( model_id="Qwen/Qwen2.5-7B-Instruct", # ZeroGPU-compatible synthesis (fits in 24GB) temperature=0.1, max_tokens=768, use_zerogpu=True, batch_size=1, torch_dtype="float16", gpu_memory_limit=8.0, # 7B model fits comfortably priority="high" ), ModelType.CRITIC: HFModelConfig( model_id="microsoft/DialoGPT-large", temperature=0.3, max_tokens=384, use_zerogpu=True, batch_size=2, torch_dtype="float16" ), ModelType.GENERAL: HFModelConfig( model_id="Qwen/Qwen2.5-7B-Instruct", # Good general purpose ZeroGPU model temperature=0.7, max_tokens=512, use_zerogpu=True, batch_size=1, torch_dtype="float16" ) } def __init__(self, hf_token: Optional[str] = None, model_configs: Optional[Dict[ModelType, HFModelConfig]] = None, token_budget_manager: Optional[TokenBudgetManager] = None, max_concurrent_requests: int = 10, request_timeout: float = 30.0, # ZeroGPU specific parameters enable_zerogpu: bool = True, gpu_memory_threshold: float = 0.9, # Trigger cleanup at 90% memory batch_timeout: float = 5.0, # Max wait time for batching # LMStudioClient compatibility base_url: Optional[str] = None, # For API compatibility timeout: Optional[float] = None, # Alternative name for request_timeout debug_mode: bool = False): """ Initialize HuggingFace ZeroGPU-optimized client. 
Args: hf_token: HuggingFace API token (uses HF_TOKEN env var if None) model_configs: Custom model configurations by agent type token_budget_manager: Token budget manager for rate limiting max_concurrent_requests: Maximum concurrent API requests request_timeout: Request timeout in seconds enable_zerogpu: Enable ZeroGPU acceleration when available gpu_memory_threshold: GPU memory usage threshold for cleanup batch_timeout: Maximum wait time for request batching base_url: API base URL (for LMStudioClient compatibility) timeout: Request timeout (alternative parameter name) debug_mode: Enable verbose debug output """ # API compatibility with LMStudioClient self.hf_token = hf_token or os.getenv('HF_TOKEN') self.base_url = base_url # For compatibility (not used) self.timeout = timeout or request_timeout self.request_timeout = self.timeout self.debug_mode = debug_mode # Core configuration self.model_configs = model_configs or self.DEFAULT_MODELS self.token_budget_manager = token_budget_manager or TokenBudgetManager() self.max_concurrent_requests = max_concurrent_requests # ZeroGPU configuration self.enable_zerogpu = enable_zerogpu and ZEROGPU_AVAILABLE self.gpu_memory_threshold = gpu_memory_threshold self.batch_timeout = batch_timeout # Initialize HF clients self.hf_api = HfApi(token=self.hf_token) self.inference_clients = {} self._init_inference_clients() # ZeroGPU model management self.loaded_models = {} # Cache for loaded GPU models self.model_pipelines = {} # Transformers pipelines self.gpu_memory_usage = 0.0 # Rate limiting and performance tracking self.semaphore = asyncio.Semaphore(max_concurrent_requests) self.request_counts = {} self.response_times = [] self.error_counts = {} # Batch processing for ZeroGPU efficiency self.batch_queue = deque() self.batch_processor_task = None # LMStudioClient compatibility tracking self.total_tokens = 0 self.total_requests = 0 self.total_response_time = 0.0 self.concurrent_requests = 0 self._connection_verified = False # Session 
management self.session: Optional[aiohttp.ClientSession] = None # Initialize if ZeroGPU available if self.enable_zerogpu: self._initialize_zerogpu() def _init_inference_clients(self): """Initialize inference clients for each model type.""" for model_type, config in self.model_configs.items(): try: client = InferenceClient( model=config.model_id, token=self.hf_token ) self.inference_clients[model_type] = client logger.info(f"Initialized inference client for {model_type.value}: {config.model_id}") except Exception as e: logger.error(f"Failed to initialize client for {model_type.value}: {e}") # Fall back to general model if model_type != ModelType.GENERAL: self.inference_clients[model_type] = self.inference_clients.get(ModelType.GENERAL) def _initialize_zerogpu(self): """Initialize ZeroGPU environment and check availability.""" if not ZEROGPU_AVAILABLE: logger.warning("ZeroGPU not available, falling back to Inference API") return if TORCH_AVAILABLE and torch.cuda.is_available(): logger.info(f"ZeroGPU initialized with {torch.cuda.device_count()} GPUs") for i in range(torch.cuda.device_count()): logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}") else: logger.warning("CUDA not available, ZeroGPU features disabled") self.enable_zerogpu = False # LMStudioClient Compatibility Methods def test_connection(self) -> bool: """ Test connection to HuggingFace services. Returns: True if connection successful, False otherwise """ try: # Test with a simple API call models = self.hf_api.list_models(limit=1) self._connection_verified = True return True except Exception as e: logger.warning(f"HuggingFace connection test failed: {e}") self._connection_verified = False return False def ensure_connection(self) -> None: """Ensure connection to HuggingFace or raise exception.""" if not self._connection_verified and not self.test_connection(): raise HuggingFaceConnectionError( "Cannot connect to HuggingFace services. " "Check your internet connection and HF_TOKEN." 
) def complete(self, agent_id: str, system_prompt: str, user_prompt: str, temperature: float = 0.7, max_tokens: Optional[int] = 500, model: str = "local-model") -> LLMResponse: """ Synchronous completion request (LMStudioClient compatibility). Args: agent_id: Identifier for the requesting agent system_prompt: System/context prompt user_prompt: User query/task temperature: Sampling temperature (0.0-1.0) max_tokens: Maximum tokens in response model: Model identifier (mapped to agent type) Returns: LLMResponse with content and metadata Raises: HuggingFaceConnectionError: If cannot connect to HuggingFace """ # Run async method synchronously (check for existing loop) try: loop = asyncio.get_event_loop() except RuntimeError: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: # Map model to agent type agent_type = self._map_model_to_agent_type(model, agent_id) # Create combined prompt combined_prompt = f"System: {system_prompt}\n\nUser: {user_prompt}" result = loop.run_until_complete( self.generate_text( prompt=combined_prompt, agent_type=agent_type, temperature=temperature, max_tokens=max_tokens ) ) # Convert HFResponse to LLMResponse for compatibility return LLMResponse( content=result.content, tokens_used=result.tokens_used, response_time=result.response_time, model=result.model_used, temperature=temperature, agent_id=agent_id, timestamp=time.time() ) finally: loop.close() async def complete_async(self, agent_id: str, system_prompt: str, user_prompt: str, temperature: float = 0.7, max_tokens: Optional[int] = None, model: str = "local-model", priority: RequestPriority = RequestPriority.NORMAL) -> LLMResponse: """ Asynchronous completion request (LMStudioClient compatibility). 
Args: agent_id: Identifier for the requesting agent system_prompt: System/context prompt user_prompt: User query/task temperature: Sampling temperature (0.0-1.0) max_tokens: Maximum tokens in response model: Model identifier priority: Request priority level Returns: LLMResponse with content and metadata """ # Map model to agent type agent_type = self._map_model_to_agent_type(model, agent_id) # Create combined prompt combined_prompt = f"System: {system_prompt}\n\nUser: {user_prompt}" result = await self.generate_text( prompt=combined_prompt, agent_type=agent_type, temperature=temperature, max_tokens=max_tokens ) # Convert HFResponse to LLMResponse for compatibility return LLMResponse( content=result.content, tokens_used=result.tokens_used, response_time=result.response_time, model=result.model_used, temperature=temperature, agent_id=agent_id, timestamp=time.time() ) def _map_model_to_agent_type(self, model: str, agent_id: str) -> ModelType: """Map model identifier to agent type for compatibility.""" # Try to infer from agent_id first agent_id_lower = agent_id.lower() if "research" in agent_id_lower: return ModelType.RESEARCH elif "analysis" in agent_id_lower or "analyze" in agent_id_lower: return ModelType.ANALYSIS elif "synthesis" in agent_id_lower or "synthesize" in agent_id_lower: return ModelType.SYNTHESIS elif "critic" in agent_id_lower or "critique" in agent_id_lower: return ModelType.CRITIC # Try to infer from model name model_lower = model.lower() if "research" in model_lower: return ModelType.RESEARCH elif "analysis" in model_lower or "thinking" in model_lower: return ModelType.ANALYSIS elif "synthesis" in model_lower or "quality" in model_lower: return ModelType.SYNTHESIS elif "critic" in model_lower: return ModelType.CRITIC return ModelType.GENERAL def get_usage_stats(self) -> Dict[str, Any]: """ Get client usage statistics (LMStudioClient compatibility). 
        Returns:
            Dictionary with usage metrics
        """
        avg_response_time = (self.total_response_time / self.total_requests
                             if self.total_requests > 0 else 0.0)

        return {
            "total_requests": self.total_requests,
            "total_tokens": self.total_tokens,
            "total_response_time": self.total_response_time,
            "average_response_time": avg_response_time,
            "average_tokens_per_request": (self.total_tokens / self.total_requests
                                           if self.total_requests > 0 else 0.0),
            "connection_verified": self._connection_verified,
            "max_concurrent_requests": self.max_concurrent_requests,
            "current_concurrent_requests": self.concurrent_requests,
            "queue_size": len(self.batch_queue),
            # ZeroGPU specific stats
            "zerogpu_enabled": self.enable_zerogpu,
            "gpu_memory_usage": self.gpu_memory_usage,
            "loaded_models": list(self.loaded_models.keys())
        }

    def reset_stats(self) -> None:
        """Reset usage statistics (LMStudioClient compatibility)."""
        # Clears both the compatibility counters and the per-type
        # performance tracking (via reset_performance_stats).
        self.total_tokens = 0
        self.total_requests = 0
        self.total_response_time = 0.0
        self.reset_performance_stats()

    def create_agent_system_prompt(self,
                                   agent_type: str,
                                   position_info: Dict[str, float],
                                   task_context: str = "") -> str:
        """
        Create system prompt for Felix agent based on position and type
        (LMStudioClient compatibility).

        Args:
            agent_type: Type of agent (research, analysis, synthesis, critic)
            position_info: Agent's position on helix (x, y, z, radius, depth_ratio)
            task_context: Additional context about the current task

        Returns:
            Formatted system prompt
        """
        # Use the same implementation as LMStudioClient but optimized for ZeroGPU models
        # Missing keys default to 0.0, i.e. "top of helix / full breadth".
        depth_ratio = position_info.get("depth_ratio", 0.0)
        radius = position_info.get("radius", 0.0)

        base_prompt = f"""🚨 IMPORTANT: You are a {agent_type} agent in the Felix multi-agent system optimized for ZeroGPU inference.

⚡ ZeroGPU OPTIMIZATION: This response will be processed on GPU-accelerated infrastructure for optimal performance.

Current Position:
- Depth: {depth_ratio:.2f} (0.0 = top/start, 1.0 = bottom/end)
- Radius: {radius:.2f} (decreasing as you progress)
- Processing Stage: {"Early/Broad" if depth_ratio < 0.3 else "Middle/Focused" if depth_ratio < 0.7 else "Final/Precise"}

Your Role Based on Position:
"""

        # Role-specific instructions; only research agents vary by depth.
        if agent_type == "research":
            if depth_ratio < 0.3:
                base_prompt += "- MAXIMUM 5 bullet points with key facts ONLY\n"
                base_prompt += "- NO explanations, NO introductions, NO conclusions\n"
                base_prompt += "- Raw findings only - be direct\n"
            else:
                base_prompt += "- MAXIMUM 3 specific facts with numbers/dates/quotes\n"
                base_prompt += "- NO background context or elaboration\n"
                base_prompt += "- Prepare key points for analysis (concise)\n"
        elif agent_type == "analysis":
            base_prompt += "- MAXIMUM 2 numbered insights/patterns ONLY\n"
            base_prompt += "- NO background explanation or context\n"
            base_prompt += "- Direct analytical findings only\n"
        elif agent_type == "synthesis":
            base_prompt += "- FINAL output ONLY - NO process description\n"
            base_prompt += "- MAXIMUM 3 short paragraphs\n"
            base_prompt += "- Direct, actionable content without fluff\n"
        elif agent_type == "critic":
            base_prompt += "- MAXIMUM 3 specific issues/fixes ONLY\n"
            base_prompt += "- NO praise, NO general comments\n"
            base_prompt += "- Direct problems and solutions only\n"

        if task_context:
            base_prompt += f"\nTask Context: {task_context}\n"

        base_prompt += "\n⚡ ZeroGPU REMINDER: Response optimized for GPU acceleration. "
        base_prompt += "Early positions focus on breadth, later positions focus on depth and precision. BE CONCISE!"

        return base_prompt

    # ZeroGPU-specific methods

    async def _zerogpu_inference(self,
                                 model_id: str,
                                 prompt: str,
                                 generation_params: Dict[str, Any]) -> Dict[str, Any]:
        """
        ZeroGPU-accelerated inference using direct model loading.
        Args:
            model_id: HuggingFace model identifier
            prompt: Input text prompt
            generation_params: Generation parameters

        Returns:
            Generation result with GPU metrics
        """
        if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
            raise ZeroGPUError("PyTorch and Transformers required for ZeroGPU inference")

        gpu_start_time = time.time()
        # Snapshot allocation so the delta below reflects only this request.
        initial_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0

        try:
            # Load or get cached model
            if model_id not in self.loaded_models:
                await self._load_model_to_gpu(model_id, generation_params)

            model, tokenizer = self.loaded_models[model_id]

            # Tokenize input (truncated to 2048 tokens; longer prompts are cut)
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=2048
            ).to(model.device)

            # Generate with GPU acceleration
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=generation_params.get("max_new_tokens", 512),
                    temperature=generation_params.get("temperature", 0.7),
                    top_p=generation_params.get("top_p", 0.9),
                    do_sample=generation_params.get("do_sample", True),
                    pad_token_id=tokenizer.eos_token_id,
                    repetition_penalty=generation_params.get("repetition_penalty", 1.1)
                )

            # Decode only the newly generated portion (skip the prompt tokens)
            input_length = inputs['input_ids'].shape[1]
            generated_tokens = outputs[0][input_length:]
            response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            # Calculate metrics
            gpu_end_time = time.time()
            final_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0

            return {
                "generated_text": response_text,
                "gpu_time": gpu_end_time - gpu_start_time,
                "gpu_memory_used": (final_memory - initial_memory) / 1024**3,  # GB
                "tokens_generated": len(generated_tokens)
            }

        except Exception as e:
            logger.error(f"ZeroGPU inference failed for {model_id}: {e}")
            # Cleanup on error: unload everything so a fallback path starts clean.
            await self._cleanup_gpu_memory()
            raise ZeroGPUError(f"GPU inference failed: {e}")

    async def _load_model_to_gpu(self, model_id: str, generation_params: Dict[str, Any]):
        """Load model to GPU with memory management.

        Evicts all cached models first if tracked usage exceeds the
        configured threshold fraction of total device memory.
        """
        if not torch.cuda.is_available():
            raise ZeroGPUError("CUDA not available for model loading")

        try:
            # Check available memory (bytes; gpu_memory_usage is tracked in bytes too)
            available_memory = torch.cuda.get_device_properties(0).total_memory
            if self.gpu_memory_usage > self.gpu_memory_threshold * available_memory:
                await self._cleanup_gpu_memory()

            # Load tokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            if tokenizer.pad_token is None:
                # Many causal LMs ship without a pad token; reuse EOS.
                tokenizer.pad_token = tokenizer.eos_token

            # Load model with optimal settings
            # NOTE(review): trust_remote_code=True executes code from the model
            # repo — only use with trusted model_ids.
            torch_dtype = getattr(torch, generation_params.get("torch_dtype", "float16"))
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch_dtype,
                device_map="auto",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )

            # Cache the loaded model
            self.loaded_models[model_id] = (model, tokenizer)

            # Update memory usage tracking
            current_memory = torch.cuda.memory_allocated()
            self.gpu_memory_usage = current_memory

            logger.info(f"Loaded {model_id} to GPU, memory usage: {current_memory / 1024**3:.2f} GB")

        except Exception as e:
            logger.error(f"Failed to load {model_id} to GPU: {e}")
            raise ZeroGPUError(f"Model loading failed: {e}")

    async def _cleanup_gpu_memory(self):
        """Clean up GPU memory by unloading models."""
        if not torch.cuda.is_available():
            return

        # Clear model cache (drop our references, then force collection so
        # torch can actually release the CUDA allocations)
        for model_id in list(self.loaded_models.keys()):
            model, tokenizer = self.loaded_models.pop(model_id)
            del model, tokenizer

        # Force garbage collection
        gc.collect()
        torch.cuda.empty_cache()

        self.gpu_memory_usage = 0.0
        logger.info("GPU memory cleaned up")

    async def __aenter__(self):
        """Async context manager entry."""
        self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.request_timeout))
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()

        # Cleanup GPU resources
        if self.enable_zerogpu:
            await self._cleanup_gpu_memory()

    async def close_async(self) -> None:
        """Close async client and cleanup resources (LMStudioClient compatibility)."""
        if self.session:
            await self.session.close()
self.session.close() if self.enable_zerogpu: await self._cleanup_gpu_memory() if self.batch_processor_task and not self.batch_processor_task.done(): self.batch_processor_task.cancel() try: await self.batch_processor_task except asyncio.CancelledError: pass async def generate_text(self, prompt: str, agent_type: ModelType = ModelType.GENERAL, temperature: Optional[float] = None, max_tokens: Optional[int] = None, use_zerogpu: Optional[bool] = None, priority: RequestPriority = RequestPriority.NORMAL, **kwargs) -> HFResponse: """ Generate text using HuggingFace inference with ZeroGPU optimization. Args: prompt: Input prompt for text generation agent_type: Type of agent requesting generation temperature: Override temperature for this request max_tokens: Override max tokens for this request use_zerogpu: Force ZeroGPU usage (None for auto-detect) priority: Request priority for processing order **kwargs: Additional generation parameters Returns: HFResponse with generated text and metadata """ async with self.semaphore: start_time = time.time() self.concurrent_requests += 1 try: # Get model configuration config = self.model_configs.get(agent_type, self.model_configs[ModelType.GENERAL]) client = self.inference_clients.get(agent_type, self.inference_clients[ModelType.GENERAL]) # Determine if we should use ZeroGPU should_use_zerogpu = ( use_zerogpu if use_zerogpu is not None else (self.enable_zerogpu and config.use_zerogpu) ) if not client and not should_use_zerogpu: return HFResponse( content="", model_used=config.model_id, tokens_used=0, response_time=0.0, success=False, error=f"No inference client available for {agent_type.value}", fallback_used=False ) # Check token budget estimated_tokens = max_tokens or config.max_tokens if hasattr(self.token_budget_manager, 'can_allocate') and not self.token_budget_manager.can_allocate(estimated_tokens): return HFResponse( content="", model_used=config.model_id, tokens_used=0, response_time=time.time() - start_time, success=False, 
error="Insufficient token budget", fallback_used=False ) # Prepare generation parameters generation_params = { "max_new_tokens": max_tokens or config.max_tokens, "temperature": temperature or config.temperature, "top_p": config.top_p, "repetition_penalty": config.repetition_penalty, "do_sample": True, "return_full_text": False, "torch_dtype": config.torch_dtype, **kwargs } response_data = None gpu_metrics = {} fallback_used = False # Try ZeroGPU first if enabled and available if should_use_zerogpu: try: if self.debug_mode: logger.info(f"Using ZeroGPU inference for {agent_type.value} with {config.model_id}") gpu_result = await self._zerogpu_inference( config.model_id, prompt, generation_params ) response_data = [{ "generated_text": gpu_result["generated_text"] }] gpu_metrics = { "gpu_time": gpu_result["gpu_time"], "gpu_memory_used": gpu_result["gpu_memory_used"], "tokens_generated": gpu_result["tokens_generated"] } except (ZeroGPUError, GPUMemoryError) as e: logger.warning(f"ZeroGPU failed, falling back to Inference API: {e}") fallback_used = True should_use_zerogpu = False # Fallback to Inference API if ZeroGPU failed or not enabled if not response_data: if not client: raise Exception("No inference method available") response_data = await self._make_inference_request( client=client, prompt=prompt, parameters=generation_params ) fallback_used = not should_use_zerogpu # Process response if response_data and isinstance(response_data, list) and len(response_data) > 0: generated_text = response_data[0].get("generated_text", "") tokens_used = self._estimate_tokens(prompt + generated_text) # Allocate tokens if budget manager supports it allocation = None if hasattr(self.token_budget_manager, 'allocate_tokens'): allocation = self.token_budget_manager.allocate_tokens(tokens_used) # Track performance response_time = time.time() - start_time self._track_performance(agent_type, response_time, success=True) # Update compatibility stats self.total_tokens += tokens_used 
self.total_requests += 1 self.total_response_time += response_time if self.debug_mode: method = "ZeroGPU" if (should_use_zerogpu and not fallback_used) else "Inference API" logger.info(f"✅ {method} response for {agent_type.value}: {len(generated_text)} chars, {tokens_used} tokens, {response_time:.2f}s") return HFResponse( content=generated_text, model_used=config.model_id, tokens_used=tokens_used, response_time=response_time, success=True, metadata={ "agent_type": agent_type.value, "allocation_id": allocation.allocation_id if allocation else None, "parameters": generation_params, "method": "ZeroGPU" if (should_use_zerogpu and not fallback_used) else "Inference API" }, gpu_memory_used=gpu_metrics.get("gpu_memory_used"), gpu_time=gpu_metrics.get("gpu_time"), fallback_used=fallback_used ) else: return HFResponse( content="", model_used=config.model_id, tokens_used=0, response_time=time.time() - start_time, success=False, error="Empty or invalid response from API", fallback_used=fallback_used ) except Exception as e: self._track_performance(agent_type, time.time() - start_time, success=False) logger.error(f"HF API request failed for {agent_type.value}: {e}") return HFResponse( content="", model_used=config.model_id, tokens_used=0, response_time=time.time() - start_time, success=False, error=str(e), fallback_used=False ) finally: self.concurrent_requests -= 1 async def _make_inference_request(self, client: InferenceClient, prompt: str, parameters: Dict[str, Any]): """Make inference request with proper error handling and Pro account optimizations.""" try: # Remove ZeroGPU-specific parameters for Inference API api_params = parameters.copy() api_params.pop('torch_dtype', None) # Use text generation task with Pro account optimizations response = await asyncio.wait_for( asyncio.create_task( client.text_generation( prompt=prompt, **api_params ) ), timeout=self.request_timeout ) return [{"generated_text": response}] if isinstance(response, str) else response except 
asyncio.TimeoutError: raise Exception(f"Request timeout after {self.request_timeout}s") except Exception as e: raise Exception(f"Inference request failed: {e}") def _estimate_tokens(self, text: str) -> int: """Estimate token count for text (rough approximation).""" # Simple approximation: ~4 characters per token on average return max(1, len(text) // 4) def _track_performance(self, agent_type: ModelType, response_time: float, success: bool): """Track performance metrics for monitoring.""" # Track request counts self.request_counts[agent_type] = self.request_counts.get(agent_type, 0) + 1 # Track response times self.response_times.append(response_time) if len(self.response_times) > 1000: # Keep last 1000 responses self.response_times = self.response_times[-1000:] # Track errors if not success: self.error_counts[agent_type] = self.error_counts.get(agent_type, 0) + 1 def get_performance_stats(self) -> Dict[str, Any]: """Get performance statistics with ZeroGPU metrics.""" avg_response_time = sum(self.response_times) / len(self.response_times) if self.response_times else 0 total_requests = sum(self.request_counts.values()) total_errors = sum(self.error_counts.values()) error_rate = (total_errors / total_requests) if total_requests > 0 else 0 # ZeroGPU specific stats zerogpu_stats = {} if self.enable_zerogpu and torch.cuda.is_available(): zerogpu_stats = { "gpu_available": True, "gpu_count": torch.cuda.device_count(), "gpu_memory_allocated": torch.cuda.memory_allocated() / 1024**3, # GB "gpu_memory_cached": torch.cuda.memory_reserved() / 1024**3, # GB "loaded_models": list(self.loaded_models.keys()), "current_gpu_memory_usage": self.gpu_memory_usage / 1024**3 if self.gpu_memory_usage else 0.0 } base_stats = { "total_requests": total_requests, "total_errors": total_errors, "error_rate": error_rate, "avg_response_time": avg_response_time, "requests_by_type": dict(self.request_counts), "errors_by_type": dict(self.error_counts), "zerogpu_enabled": self.enable_zerogpu, 
"zerogpu_available": ZEROGPU_AVAILABLE, } if hasattr(self.token_budget_manager, 'get_status'): base_stats["token_budget_status"] = self.token_budget_manager.get_status() base_stats.update(zerogpu_stats) return base_stats def reset_performance_stats(self): """Reset performance tracking.""" self.request_counts.clear() self.response_times.clear() self.error_counts.clear() async def health_check(self) -> Dict[str, bool]: """Check health of all configured models.""" health_status = {} for model_type, config in self.model_configs.items(): try: # Simple test request response = await self.generate_text( prompt="Hello", agent_type=model_type, max_tokens=10 ) health_status[model_type.value] = response.success except Exception as e: logger.error(f"Health check failed for {model_type.value}: {e}") health_status[model_type.value] = False return health_status def get_available_models(self) -> Dict[str, str]: """Get list of available models by type.""" return { model_type.value: config.model_id for model_type, config in self.model_configs.items() } async def batch_generate(self, prompts: List[str], agent_types: List[ModelType], use_zerogpu_batching: bool = True, **kwargs) -> List[HFResponse]: """ Generate text for multiple prompts with ZeroGPU batching optimization. 
        Args:
            prompts: List of input prompts
            agent_types: List of agent types (must match prompts length)
            use_zerogpu_batching: Enable ZeroGPU batch processing
            **kwargs: Additional generation parameters

        Returns:
            List of HFResponse objects
        """
        if len(prompts) != len(agent_types):
            raise ValueError("Prompts and agent_types lists must have same length")

        # Use ZeroGPU batching for same model types if enabled
        if use_zerogpu_batching and self.enable_zerogpu:
            try:
                return await self._zerogpu_batch_process(prompts, agent_types, **kwargs)
            except Exception as e:
                # Fall through to per-prompt concurrent requests below.
                logger.warning(f"ZeroGPU batching failed, falling back to individual requests: {e}")

        # Create tasks for concurrent execution
        tasks = [
            self.generate_text(prompt=prompt, agent_type=agent_type, **kwargs)
            for prompt, agent_type in zip(prompts, agent_types)
        ]

        # Execute concurrently with semaphore limiting
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Convert exceptions to error responses so the caller always gets
        # one HFResponse per prompt, in order.
        processed_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                processed_results.append(HFResponse(
                    content="",
                    model_used=self.model_configs[agent_types[i]].model_id,
                    tokens_used=0,
                    response_time=0.0,
                    success=False,
                    error=str(result)
                ))
            else:
                processed_results.append(result)

        return processed_results

    async def _zerogpu_batch_process(self,
                                     prompts: List[str],
                                     agent_types: List[ModelType],
                                     **kwargs) -> List[HFResponse]:
        """
        Process multiple prompts using ZeroGPU batching for efficiency.

        Args:
            prompts: List of input prompts
            agent_types: List of agent types
            **kwargs: Additional parameters

        Returns:
            List of HFResponse objects
        """
        # Group by model type for efficient batching: prompts that resolve to
        # the same model_id share one GPU session.
        model_groups = {}
        for i, (prompt, agent_type) in enumerate(zip(prompts, agent_types)):
            config = self.model_configs.get(agent_type, self.model_configs[ModelType.GENERAL])
            model_id = config.model_id
            if model_id not in model_groups:
                model_groups[model_id] = []
            model_groups[model_id].append((i, prompt, agent_type, config))

        # Results are placed back at the prompt's original index.
        results = [None] * len(prompts)
        start_time = time.time()

        for model_id, group_items in model_groups.items():
            batch_prompts = [item[1] for item in group_items]
            batch_configs = [item[3] for item in group_items]

            try:
                # Use first config as representative
                # NOTE(review): assumes configs within a model group agree on
                # generation settings — differing temperatures are collapsed.
                base_config = batch_configs[0]
                generation_params = {
                    "max_new_tokens": kwargs.get('max_tokens', base_config.max_tokens),
                    "temperature": kwargs.get('temperature', base_config.temperature),
                    "top_p": base_config.top_p,
                    "repetition_penalty": base_config.repetition_penalty,
                    "do_sample": True,
                    "torch_dtype": base_config.torch_dtype,
                }

                # Process batch on GPU
                batch_results = await self._zerogpu_batch_inference(
                    model_id, batch_prompts, generation_params
                )

                # Map results back to original positions
                for (orig_idx, prompt, agent_type, config), batch_result in zip(group_items, batch_results):
                    tokens_used = self._estimate_tokens(prompt + batch_result["generated_text"])
                    response_time = batch_result.get("response_time", time.time() - start_time)

                    results[orig_idx] = HFResponse(
                        content=batch_result["generated_text"],
                        model_used=model_id,
                        tokens_used=tokens_used,
                        response_time=response_time,
                        success=True,
                        metadata={
                            "agent_type": agent_type.value,
                            "method": "ZeroGPU-Batch",
                            "batch_size": len(batch_prompts)
                        },
                        gpu_memory_used=batch_result.get("gpu_memory_used"),
                        gpu_time=batch_result.get("gpu_time"),
                        batch_processed=len(batch_prompts),
                        fallback_used=False
                    )

            except Exception as e:
                # Fall back to individual processing for this model group
                logger.warning(f"Batch processing failed for {model_id}, using individual requests: {e}")

                for orig_idx, prompt, agent_type, config in group_items:
                    try:
                        individual_result = await self.generate_text(
                            prompt=prompt,
                            agent_type=agent_type,
                            use_zerogpu=False,  # Force Inference API fallback
                            **kwargs
                        )
                        results[orig_idx] = individual_result
                    except Exception as individual_e:
                        results[orig_idx] = HFResponse(
                            content="",
                            model_used=config.model_id,
                            tokens_used=0,
                            response_time=0.0,
                            success=False,
                            error=str(individual_e),
                            fallback_used=True
                        )

        return results

    async def _zerogpu_batch_inference(self,
                                       model_id: str,
                                       prompts: List[str],
                                       generation_params: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process multiple prompts in a single ZeroGPU session for efficiency.

        Args:
            model_id: HuggingFace model identifier
            prompts: List of input prompts
            generation_params: Generation parameters

        Returns:
            List of generation results
        """
        if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
            raise ZeroGPUError("PyTorch and Transformers required for batch ZeroGPU inference")

        gpu_start_time = time.time()
        initial_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0

        try:
            # Load or get cached model
            if model_id not in self.loaded_models:
                await self._load_model_to_gpu(model_id, generation_params)

            model, tokenizer = self.loaded_models[model_id]
            results = []

            # Process prompts individually but in the same GPU session
            # (avoids re-loading the model per prompt; not true tensor batching)
            for i, prompt in enumerate(prompts):
                prompt_start = time.time()

                # Tokenize input
                inputs = tokenizer(
                    prompt,
                    return_tensors="pt",
                    truncation=True,
                    max_length=2048
                ).to(model.device)

                # Generate with GPU acceleration
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=generation_params.get("max_new_tokens", 512),
                        temperature=generation_params.get("temperature", 0.7),
                        top_p=generation_params.get("top_p", 0.9),
                        do_sample=generation_params.get("do_sample", True),
                        pad_token_id=tokenizer.eos_token_id,
repetition_penalty=generation_params.get("repetition_penalty", 1.1) ) # Decode response input_length = inputs['input_ids'].shape[1] generated_tokens = outputs[0][input_length:] response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) prompt_end = time.time() results.append({ "generated_text": response_text, "response_time": prompt_end - prompt_start, "tokens_generated": len(generated_tokens) }) # Calculate overall GPU metrics gpu_end_time = time.time() final_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0 # Add GPU metrics to all results total_gpu_time = gpu_end_time - gpu_start_time gpu_memory_used = (final_memory - initial_memory) / 1024**3 # GB for result in results: result["gpu_time"] = total_gpu_time / len(results) # Distribute GPU time result["gpu_memory_used"] = gpu_memory_used / len(results) # Distribute memory usage return results except Exception as e: logger.error(f"ZeroGPU batch inference failed for {model_id}: {e}") # Cleanup on error await self._cleanup_gpu_memory() raise ZeroGPUError(f"GPU batch inference failed: {e}") # Utility functions for Felix Framework integration def create_felix_hf_client(token_budget: int = 50000, concurrent_requests: int = 5, enable_zerogpu: bool = True, debug_mode: bool = False) -> HuggingFaceClient: """ Create ZeroGPU-optimized HuggingFace client for Felix Framework deployment on HF Spaces. 
Args: token_budget: Total token budget for session concurrent_requests: Maximum concurrent requests enable_zerogpu: Enable ZeroGPU acceleration debug_mode: Enable debug logging Returns: Configured HuggingFaceClient instance optimized for ZeroGPU and HF Pro """ # Create token manager with Felix-specific settings token_manager = TokenBudgetManager( base_budget=token_budget // 4, # Distribute among typical 4 agent types strict_mode=True # Enable for ZeroGPU efficiency ) # ZeroGPU and HF Pro optimized model configurations felix_models = { ModelType.RESEARCH: HFModelConfig( model_id="microsoft/DialoGPT-large", # Upgraded for better performance temperature=0.9, max_tokens=256, top_p=0.95, use_zerogpu=True, batch_size=2, # Efficient batching for research queries torch_dtype="float16", priority="normal" ), ModelType.ANALYSIS: HFModelConfig( model_id="meta-llama/Llama-3.1-8B-Instruct", # Better reasoning capability temperature=0.5, max_tokens=384, top_p=0.9, use_zerogpu=True, batch_size=1, torch_dtype="float16", priority="high" # Pro account priority ), ModelType.SYNTHESIS: HFModelConfig( model_id="Qwen/Qwen2.5-7B-Instruct", # ZeroGPU-compatible synthesis (fits in 24GB) temperature=0.1, max_tokens=512, top_p=0.85, use_zerogpu=True, batch_size=1, torch_dtype="float16", gpu_memory_limit=8.0, # 7B model fits comfortably priority="high" ), ModelType.CRITIC: HFModelConfig( model_id="microsoft/DialoGPT-large", temperature=0.3, max_tokens=256, top_p=0.9, use_zerogpu=True, batch_size=2, torch_dtype="float16", priority="normal" ) } return HuggingFaceClient( model_configs=felix_models, token_budget_manager=token_manager, max_concurrent_requests=concurrent_requests, request_timeout=45.0, # Longer timeout for ZeroGPU model loading enable_zerogpu=enable_zerogpu, gpu_memory_threshold=0.85, # Conservative memory management batch_timeout=3.0, # Shorter batching timeout for responsiveness debug_mode=debug_mode ) def create_default_client(max_concurrent_requests: int = 4, enable_zerogpu: 
bool = True) -> HuggingFaceClient: """Create ZeroGPU HuggingFace client with default settings (LMStudioClient compatibility).""" return create_felix_hf_client( concurrent_requests=max_concurrent_requests, enable_zerogpu=enable_zerogpu ) # Pro account specific optimizations def get_pro_account_models() -> Dict[ModelType, HFModelConfig]: """ Get model configurations optimized for HF Pro accounts with access to premium models. Returns: Dictionary of premium model configurations """ return { ModelType.RESEARCH: HFModelConfig( model_id="meta-llama/Llama-3.1-8B-Instruct", # Premium access temperature=0.9, max_tokens=384, use_zerogpu=True, batch_size=3, priority="high" ), ModelType.ANALYSIS: HFModelConfig( model_id="meta-llama/Llama-3.1-8B-Instruct", # ZeroGPU-compatible analysis (fits in 24GB) temperature=0.5, max_tokens=512, use_zerogpu=True, batch_size=1, gpu_memory_limit=10.0, # 8B model fits in ZeroGPU priority="high" ), ModelType.SYNTHESIS: HFModelConfig( model_id="Qwen/Qwen2.5-7B-Instruct", # ZeroGPU-compatible synthesis (fits in 24GB) temperature=0.1, max_tokens=768, use_zerogpu=True, batch_size=1, gpu_memory_limit=8.0, # 7B model fits in ZeroGPU priority="high" ), ModelType.CRITIC: HFModelConfig( model_id="meta-llama/Llama-3.1-8B-Instruct", temperature=0.3, max_tokens=384, use_zerogpu=True, batch_size=2, priority="high" ) } # ZeroGPU deployment helpers def estimate_gpu_requirements(model_configs: Dict[ModelType, HFModelConfig]) -> Dict[str, float]: """ Estimate GPU memory requirements for given model configurations. 
Args: model_configs: Model configurations to analyze Returns: Dictionary with memory estimates in GB """ # Rough model size estimates (in GB) model_sizes = { "microsoft/DialoGPT-medium": 1.5, "microsoft/DialoGPT-large": 3.0, "meta-llama/Llama-3.1-8B-Instruct": 16.0, "meta-llama/Llama-3.1-13B-Instruct": 26.0, "meta-llama/Llama-3.1-70B-Instruct": 140.0, "Qwen/Qwen2.5-7B-Instruct": 14.0 } requirements = {} total_memory = 0.0 max_single_model = 0.0 for agent_type, config in model_configs.items(): model_memory = model_sizes.get(config.model_id, 8.0) # Default 8GB requirements[f"{agent_type.value}_memory"] = model_memory total_memory += model_memory max_single_model = max(max_single_model, model_memory) requirements.update({ "total_memory_if_all_loaded": total_memory, "max_single_model_memory": max_single_model, "recommended_gpu_memory": max_single_model * 1.5, # 50% buffer "minimum_gpu_memory": max_single_model * 1.2 # 20% buffer }) return requirements # Export main classes and functions __all__ = [ 'HuggingFaceClient', 'HFResponse', 'HFModelConfig', 'ModelType', 'GPUMemoryError', 'ZeroGPUError', 'HuggingFaceConnectionError', 'create_felix_hf_client', 'create_default_client', 'get_pro_account_models', 'estimate_gpu_requirements', 'ZEROGPU_AVAILABLE', 'TORCH_AVAILABLE', 'TRANSFORMERS_AVAILABLE' ]