# felix-framework / src/llm/huggingface_client.py
# Commit 8d60b1e (jkbennitt): FIX: Resolve 7 critical HF Spaces deployment
# issues for production readiness.
"""
HuggingFace ZeroGPU-optimized client for Felix Framework on HF Spaces.
This module provides advanced HuggingFace integration optimized for ZeroGPU acceleration,
HF Pro account features, and HF Spaces deployment while maintaining full API compatibility
with LMStudioClient.
ZeroGPU Features:
- Dynamic GPU allocation with @spaces.GPU decorator support
- GPU memory management and automatic cleanup
- Batch processing for multiple agents with GPU acceleration
- Model loading with torch.cuda optimization
- Efficient device allocation and deallocation
HF Pro Account Features:
- Higher rate limits and premium model access
- Priority inference queue for Pro accounts
- Advanced model configurations and fine-tuning support
- Extended quota management
Agent-Model Mapping (ZeroGPU Optimized):
- ResearchAgent: Fast 7B models (e.g., microsoft/DialoGPT-large, Qwen/Qwen2.5-7B-Instruct)
- AnalysisAgent: Reasoning 13B models (e.g., microsoft/DialoGPT-large, meta-llama/Llama-3.1-8B-Instruct)
- SynthesisAgent: High-quality 13B models (e.g., meta-llama/Llama-3.1-13B-Instruct)
- CriticAgent: Specialized validation models (e.g., microsoft/DialoGPT-medium)
LMStudioClient Compatibility:
- Drop-in replacement maintaining identical API
- Same method signatures and response objects
- Existing Felix agent system integration preserved
"""
import asyncio
import logging
import time
import os
import gc
from typing import Dict, List, Optional, Any, Union
from dataclasses import dataclass
from enum import Enum
import aiohttp
import json
from datetime import datetime, timedelta
from collections import deque
# ZeroGPU and HF integration imports.
# Each optional dependency is guarded so the module stays importable in
# environments that lack GPU/Spaces support; feature flags record what is
# actually available at runtime.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    # Mock decorator for non-ZeroGPU environments: code can keep using
    # @spaces.GPU unchanged and the decorator becomes a no-op.
    class MockSpaces:
        @staticmethod
        def GPU(fn):
            return fn
    spaces = MockSpaces()

try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False

# huggingface_hub is a hard dependency (no guard): the client cannot work
# at all without it.
from huggingface_hub import HfApi, InferenceClient

try:
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False

# Project-local imports: token budgeting and the LMStudio-compatible
# response/priority types this client mirrors.
from .token_budget import TokenBudgetManager, TokenAllocation
from .lm_studio_client import RequestPriority, LLMResponse

# Module-level logger, per project convention.
logger = logging.getLogger(__name__)
class ModelType(Enum):
    """Agent-function specializations supported by the client.

    Each member selects the model configuration used for that agent role.
    """

    RESEARCH = "research"    # fast, exploratory generation
    ANALYSIS = "analysis"    # reasoning-oriented models
    SYNTHESIS = "synthesis"  # high-quality final-output models
    CRITIC = "critic"        # validation / critique models
    GENERAL = "general"      # default fallback for unknown roles
class GPUMemoryError(Exception):
    """Signals that a GPU memory allocation could not be satisfied."""
class ZeroGPUError(Exception):
    """Signals a failure in a ZeroGPU-accelerated operation."""
class HuggingFaceConnectionError(Exception):
    """Signals that HuggingFace services could not be reached."""
@dataclass
class HFModelConfig:
    """Configuration for a HuggingFace model with ZeroGPU optimization.

    Groups sampling parameters, ZeroGPU loading options and HF Pro account
    settings for one model. Field order is part of the public interface
    (positional construction) and must not change.
    """
    model_id: str                       # HF Hub model identifier (required)
    max_tokens: int = 512               # default generation length cap
    temperature: float = 0.7            # sampling temperature
    top_p: float = 0.9                  # nucleus-sampling cutoff
    repetition_penalty: float = 1.1     # >1.0 discourages repetition
    use_cache: bool = True              # reuse attention KV cache
    wait_for_model: bool = True         # block until model is loaded server-side
    # ZeroGPU specific settings
    use_zerogpu: bool = True            # attempt local GPU inference first
    gpu_memory_limit: Optional[float] = None  # GB, None for auto
    torch_dtype: str = "float16"        # torch dtype for GPU efficiency
    device_map: str = "auto"            # transformers device placement strategy
    batch_size: int = 1                 # prompts per GPU batch
    # HF Pro settings
    priority: str = "normal"            # normal, high for Pro accounts
    use_inference_api: bool = True      # Fallback to Inference API
    local_model_path: Optional[str] = None  # optional pre-downloaded weights
@dataclass
class HFResponse:
    """Response from HuggingFace inference API with GPU metrics.

    `success=False` responses carry an `error` message and empty `content`;
    GPU fields are populated only when ZeroGPU inference was used.
    """
    content: str                        # generated text ("" on failure)
    model_used: str                     # model id that served the request
    tokens_used: int                    # estimated prompt+completion tokens
    response_time: float                # wall-clock seconds for the request
    success: bool                       # True when generation succeeded
    error: Optional[str] = None         # failure description, if any
    metadata: Optional[Dict[str, Any]] = None  # method, params, allocation id
    # ZeroGPU specific metrics
    gpu_memory_used: Optional[float] = None  # GB allocated during inference
    gpu_time: Optional[float] = None         # seconds spent on GPU
    batch_processed: Optional[int] = None    # batch size if batched
    fallback_used: bool = False              # ZeroGPU failed -> Inference API
class HuggingFaceClient:
"""
HuggingFace Inference API client for Felix Framework.
Provides model inference capabilities with token budget management,
rate limiting, and agent specialization support.
"""
# ZeroGPU optimized models for different agent types
DEFAULT_MODELS = {
ModelType.RESEARCH: HFModelConfig(
model_id="microsoft/DialoGPT-large", # Upgraded for ZeroGPU
temperature=0.9,
max_tokens=384,
use_zerogpu=True,
batch_size=2, # Can process multiple research queries
torch_dtype="float16"
),
ModelType.ANALYSIS: HFModelConfig(
model_id="meta-llama/Llama-3.1-8B-Instruct", # Better reasoning
temperature=0.5,
max_tokens=512,
use_zerogpu=True,
batch_size=1,
torch_dtype="float16",
priority="high" # Pro account priority for analysis
),
ModelType.SYNTHESIS: HFModelConfig(
model_id="Qwen/Qwen2.5-7B-Instruct", # ZeroGPU-compatible synthesis (fits in 24GB)
temperature=0.1,
max_tokens=768,
use_zerogpu=True,
batch_size=1,
torch_dtype="float16",
gpu_memory_limit=8.0, # 7B model fits comfortably
priority="high"
),
ModelType.CRITIC: HFModelConfig(
model_id="microsoft/DialoGPT-large",
temperature=0.3,
max_tokens=384,
use_zerogpu=True,
batch_size=2,
torch_dtype="float16"
),
ModelType.GENERAL: HFModelConfig(
model_id="Qwen/Qwen2.5-7B-Instruct", # Good general purpose ZeroGPU model
temperature=0.7,
max_tokens=512,
use_zerogpu=True,
batch_size=1,
torch_dtype="float16"
)
}
    def __init__(self,
                 hf_token: Optional[str] = None,
                 model_configs: Optional[Dict[ModelType, HFModelConfig]] = None,
                 token_budget_manager: Optional[TokenBudgetManager] = None,
                 max_concurrent_requests: int = 10,
                 request_timeout: float = 30.0,
                 # ZeroGPU specific parameters
                 enable_zerogpu: bool = True,
                 gpu_memory_threshold: float = 0.9,  # Trigger cleanup at 90% memory
                 batch_timeout: float = 5.0,  # Max wait time for batching
                 # LMStudioClient compatibility
                 base_url: Optional[str] = None,  # For API compatibility
                 timeout: Optional[float] = None,  # Alternative name for request_timeout
                 debug_mode: bool = False):
        """
        Initialize HuggingFace ZeroGPU-optimized client.

        Args:
            hf_token: HuggingFace API token (uses HF_TOKEN env var if None)
            model_configs: Custom model configurations by agent type
            token_budget_manager: Token budget manager for rate limiting
            max_concurrent_requests: Maximum concurrent API requests
            request_timeout: Request timeout in seconds
            enable_zerogpu: Enable ZeroGPU acceleration when available
            gpu_memory_threshold: GPU memory usage threshold for cleanup
            batch_timeout: Maximum wait time for request batching
            base_url: API base URL (for LMStudioClient compatibility)
            timeout: Request timeout (alternative parameter name; takes
                precedence over request_timeout when given)
            debug_mode: Enable verbose debug output
        """
        # API compatibility with LMStudioClient
        self.hf_token = hf_token or os.getenv('HF_TOKEN')
        self.base_url = base_url  # For compatibility (not used)
        # `timeout` (LMStudio-style name) wins over `request_timeout`.
        self.timeout = timeout or request_timeout
        self.request_timeout = self.timeout
        self.debug_mode = debug_mode
        # Core configuration
        self.model_configs = model_configs or self.DEFAULT_MODELS
        self.token_budget_manager = token_budget_manager or TokenBudgetManager()
        self.max_concurrent_requests = max_concurrent_requests
        # ZeroGPU configuration: only enabled when the `spaces` package
        # imported successfully at module load.
        self.enable_zerogpu = enable_zerogpu and ZEROGPU_AVAILABLE
        self.gpu_memory_threshold = gpu_memory_threshold
        self.batch_timeout = batch_timeout
        # Initialize HF clients (one InferenceClient per model type).
        self.hf_api = HfApi(token=self.hf_token)
        self.inference_clients = {}
        self._init_inference_clients()
        # ZeroGPU model management
        self.loaded_models = {}  # Cache for loaded GPU models: id -> (model, tokenizer)
        self.model_pipelines = {}  # Transformers pipelines
        self.gpu_memory_usage = 0.0  # bytes allocated, tracked after loads
        # Rate limiting and performance tracking.
        # NOTE(review): the Semaphore is created outside any running event
        # loop; on older Python versions it binds to the loop lazily —
        # confirm the target runtime before reusing across loops.
        self.semaphore = asyncio.Semaphore(max_concurrent_requests)
        self.request_counts = {}
        self.response_times = []
        self.error_counts = {}
        # Batch processing for ZeroGPU efficiency
        self.batch_queue = deque()
        self.batch_processor_task = None
        # LMStudioClient compatibility tracking
        self.total_tokens = 0
        self.total_requests = 0
        self.total_response_time = 0.0
        self.concurrent_requests = 0
        self._connection_verified = False
        # Session management (opened lazily by the async context manager)
        self.session: Optional[aiohttp.ClientSession] = None
        # Initialize if ZeroGPU available
        if self.enable_zerogpu:
            self._initialize_zerogpu()
def _init_inference_clients(self):
"""Initialize inference clients for each model type."""
for model_type, config in self.model_configs.items():
try:
client = InferenceClient(
model=config.model_id,
token=self.hf_token
)
self.inference_clients[model_type] = client
logger.info(f"Initialized inference client for {model_type.value}: {config.model_id}")
except Exception as e:
logger.error(f"Failed to initialize client for {model_type.value}: {e}")
# Fall back to general model
if model_type != ModelType.GENERAL:
self.inference_clients[model_type] = self.inference_clients.get(ModelType.GENERAL)
    def _initialize_zerogpu(self):
        """Initialize ZeroGPU environment and check availability.

        Logs the visible GPUs when CUDA is usable; otherwise disables
        ZeroGPU on this instance so later code paths skip GPU work.
        """
        if not ZEROGPU_AVAILABLE:
            logger.warning("ZeroGPU not available, falling back to Inference API")
            return
        if TORCH_AVAILABLE and torch.cuda.is_available():
            # Record the GPU inventory for diagnostics.
            logger.info(f"ZeroGPU initialized with {torch.cuda.device_count()} GPUs")
            for i in range(torch.cuda.device_count()):
                logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        else:
            # spaces imported but no usable CUDA: turn the feature off.
            logger.warning("CUDA not available, ZeroGPU features disabled")
            self.enable_zerogpu = False
# LMStudioClient Compatibility Methods
def test_connection(self) -> bool:
"""
Test connection to HuggingFace services.
Returns:
True if connection successful, False otherwise
"""
try:
# Test with a simple API call
models = self.hf_api.list_models(limit=1)
self._connection_verified = True
return True
except Exception as e:
logger.warning(f"HuggingFace connection test failed: {e}")
self._connection_verified = False
return False
def ensure_connection(self) -> None:
"""Ensure connection to HuggingFace or raise exception."""
if not self._connection_verified and not self.test_connection():
raise HuggingFaceConnectionError(
"Cannot connect to HuggingFace services. "
"Check your internet connection and HF_TOKEN."
)
def complete(self, agent_id: str, system_prompt: str, user_prompt: str,
temperature: float = 0.7, max_tokens: Optional[int] = 500,
model: str = "local-model") -> LLMResponse:
"""
Synchronous completion request (LMStudioClient compatibility).
Args:
agent_id: Identifier for the requesting agent
system_prompt: System/context prompt
user_prompt: User query/task
temperature: Sampling temperature (0.0-1.0)
max_tokens: Maximum tokens in response
model: Model identifier (mapped to agent type)
Returns:
LLMResponse with content and metadata
Raises:
HuggingFaceConnectionError: If cannot connect to HuggingFace
"""
# Run async method synchronously (check for existing loop)
try:
loop = asyncio.get_event_loop()
except RuntimeError:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# Map model to agent type
agent_type = self._map_model_to_agent_type(model, agent_id)
# Create combined prompt
combined_prompt = f"System: {system_prompt}\n\nUser: {user_prompt}"
result = loop.run_until_complete(
self.generate_text(
prompt=combined_prompt,
agent_type=agent_type,
temperature=temperature,
max_tokens=max_tokens
)
)
# Convert HFResponse to LLMResponse for compatibility
return LLMResponse(
content=result.content,
tokens_used=result.tokens_used,
response_time=result.response_time,
model=result.model_used,
temperature=temperature,
agent_id=agent_id,
timestamp=time.time()
)
finally:
loop.close()
async def complete_async(self, agent_id: str, system_prompt: str,
user_prompt: str, temperature: float = 0.7,
max_tokens: Optional[int] = None,
model: str = "local-model",
priority: RequestPriority = RequestPriority.NORMAL) -> LLMResponse:
"""
Asynchronous completion request (LMStudioClient compatibility).
Args:
agent_id: Identifier for the requesting agent
system_prompt: System/context prompt
user_prompt: User query/task
temperature: Sampling temperature (0.0-1.0)
max_tokens: Maximum tokens in response
model: Model identifier
priority: Request priority level
Returns:
LLMResponse with content and metadata
"""
# Map model to agent type
agent_type = self._map_model_to_agent_type(model, agent_id)
# Create combined prompt
combined_prompt = f"System: {system_prompt}\n\nUser: {user_prompt}"
result = await self.generate_text(
prompt=combined_prompt,
agent_type=agent_type,
temperature=temperature,
max_tokens=max_tokens
)
# Convert HFResponse to LLMResponse for compatibility
return LLMResponse(
content=result.content,
tokens_used=result.tokens_used,
response_time=result.response_time,
model=result.model_used,
temperature=temperature,
agent_id=agent_id,
timestamp=time.time()
)
def _map_model_to_agent_type(self, model: str, agent_id: str) -> ModelType:
"""Map model identifier to agent type for compatibility."""
# Try to infer from agent_id first
agent_id_lower = agent_id.lower()
if "research" in agent_id_lower:
return ModelType.RESEARCH
elif "analysis" in agent_id_lower or "analyze" in agent_id_lower:
return ModelType.ANALYSIS
elif "synthesis" in agent_id_lower or "synthesize" in agent_id_lower:
return ModelType.SYNTHESIS
elif "critic" in agent_id_lower or "critique" in agent_id_lower:
return ModelType.CRITIC
# Try to infer from model name
model_lower = model.lower()
if "research" in model_lower:
return ModelType.RESEARCH
elif "analysis" in model_lower or "thinking" in model_lower:
return ModelType.ANALYSIS
elif "synthesis" in model_lower or "quality" in model_lower:
return ModelType.SYNTHESIS
elif "critic" in model_lower:
return ModelType.CRITIC
return ModelType.GENERAL
def get_usage_stats(self) -> Dict[str, Any]:
"""
Get client usage statistics (LMStudioClient compatibility).
Returns:
Dictionary with usage metrics
"""
avg_response_time = (self.total_response_time / self.total_requests
if self.total_requests > 0 else 0.0)
return {
"total_requests": self.total_requests,
"total_tokens": self.total_tokens,
"total_response_time": self.total_response_time,
"average_response_time": avg_response_time,
"average_tokens_per_request": (self.total_tokens / self.total_requests
if self.total_requests > 0 else 0.0),
"connection_verified": self._connection_verified,
"max_concurrent_requests": self.max_concurrent_requests,
"current_concurrent_requests": self.concurrent_requests,
"queue_size": len(self.batch_queue),
# ZeroGPU specific stats
"zerogpu_enabled": self.enable_zerogpu,
"gpu_memory_usage": self.gpu_memory_usage,
"loaded_models": list(self.loaded_models.keys())
}
def reset_stats(self) -> None:
"""Reset usage statistics (LMStudioClient compatibility)."""
self.total_tokens = 0
self.total_requests = 0
self.total_response_time = 0.0
self.reset_performance_stats()
    def create_agent_system_prompt(self, agent_type: str, position_info: Dict[str, float],
                                   task_context: str = "") -> str:
        """
        Create system prompt for Felix agent based on position and type (LMStudioClient compatibility).

        Args:
            agent_type: Type of agent (research, analysis, synthesis, critic)
            position_info: Agent's position on helix (x, y, z, radius, depth_ratio)
            task_context: Additional context about the current task

        Returns:
            Formatted system prompt
        """
        # Use the same implementation as LMStudioClient but optimized for ZeroGPU models.
        # Missing keys default to 0.0 (start of the helix).
        depth_ratio = position_info.get("depth_ratio", 0.0)
        radius = position_info.get("radius", 0.0)
        base_prompt = f"""🚨 IMPORTANT: You are a {agent_type} agent in the Felix multi-agent system optimized for ZeroGPU inference.
⚡ ZeroGPU OPTIMIZATION: This response will be processed on GPU-accelerated infrastructure for optimal performance.
Current Position:
- Depth: {depth_ratio:.2f} (0.0 = top/start, 1.0 = bottom/end)
- Radius: {radius:.2f} (decreasing as you progress)
- Processing Stage: {"Early/Broad" if depth_ratio < 0.3 else "Middle/Focused" if depth_ratio < 0.7 else "Final/Precise"}
Your Role Based on Position:
"""
        # Role-specific instructions; research agents additionally vary by
        # helix depth (broad early, specific later).
        if agent_type == "research":
            if depth_ratio < 0.3:
                base_prompt += "- MAXIMUM 5 bullet points with key facts ONLY\n"
                base_prompt += "- NO explanations, NO introductions, NO conclusions\n"
                base_prompt += "- Raw findings only - be direct\n"
            else:
                base_prompt += "- MAXIMUM 3 specific facts with numbers/dates/quotes\n"
                base_prompt += "- NO background context or elaboration\n"
                base_prompt += "- Prepare key points for analysis (concise)\n"
        elif agent_type == "analysis":
            base_prompt += "- MAXIMUM 2 numbered insights/patterns ONLY\n"
            base_prompt += "- NO background explanation or context\n"
            base_prompt += "- Direct analytical findings only\n"
        elif agent_type == "synthesis":
            base_prompt += "- FINAL output ONLY - NO process description\n"
            base_prompt += "- MAXIMUM 3 short paragraphs\n"
            base_prompt += "- Direct, actionable content without fluff\n"
        elif agent_type == "critic":
            base_prompt += "- MAXIMUM 3 specific issues/fixes ONLY\n"
            base_prompt += "- NO praise, NO general comments\n"
            base_prompt += "- Direct problems and solutions only\n"
        if task_context:
            base_prompt += f"\nTask Context: {task_context}\n"
        base_prompt += "\n⚡ ZeroGPU REMINDER: Response optimized for GPU acceleration. "
        base_prompt += "Early positions focus on breadth, later positions focus on depth and precision. BE CONCISE!"
        return base_prompt
# ZeroGPU-specific methods
    async def _zerogpu_inference(self, model_id: str, prompt: str,
                                 generation_params: Dict[str, Any]) -> Dict[str, Any]:
        """
        ZeroGPU-accelerated inference using direct model loading.

        Args:
            model_id: HuggingFace model identifier
            prompt: Input text prompt
            generation_params: Generation parameters

        Returns:
            Generation result with GPU metrics

        Raises:
            ZeroGPUError: If torch/transformers are missing or generation fails.
        """
        if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
            raise ZeroGPUError("PyTorch and Transformers required for ZeroGPU inference")
        gpu_start_time = time.time()
        # Snapshot allocated memory so we can report the delta after generation.
        initial_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
        try:
            # Load or get cached model
            if model_id not in self.loaded_models:
                await self._load_model_to_gpu(model_id, generation_params)
            model, tokenizer = self.loaded_models[model_id]
            # Tokenize input (truncated to 2048 tokens to bound memory).
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=2048
            ).to(model.device)
            # Generate with GPU acceleration; no_grad avoids building autograd state.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=generation_params.get("max_new_tokens", 512),
                    temperature=generation_params.get("temperature", 0.7),
                    top_p=generation_params.get("top_p", 0.9),
                    do_sample=generation_params.get("do_sample", True),
                    pad_token_id=tokenizer.eos_token_id,
                    repetition_penalty=generation_params.get("repetition_penalty", 1.1)
                )
            # Decode only the newly generated tokens (strip the prompt echo).
            input_length = inputs['input_ids'].shape[1]
            generated_tokens = outputs[0][input_length:]
            response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
            # Calculate metrics
            gpu_end_time = time.time()
            final_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
            return {
                "generated_text": response_text,
                "gpu_time": gpu_end_time - gpu_start_time,
                "gpu_memory_used": (final_memory - initial_memory) / 1024**3,  # GB
                "tokens_generated": len(generated_tokens)
            }
        except Exception as e:
            logger.error(f"ZeroGPU inference failed for {model_id}: {e}")
            # Cleanup on error so a failed load/generate does not leak VRAM.
            await self._cleanup_gpu_memory()
            raise ZeroGPUError(f"GPU inference failed: {e}")
    async def _load_model_to_gpu(self, model_id: str, generation_params: Dict[str, Any]):
        """Load model to GPU with memory management.

        Caches the (model, tokenizer) pair in self.loaded_models and
        triggers a cleanup first when tracked usage exceeds the configured
        threshold fraction of total device memory.

        Raises:
            ZeroGPUError: If CUDA is unavailable or loading fails.
        """
        if not torch.cuda.is_available():
            raise ZeroGPUError("CUDA not available for model loading")
        try:
            # Check available memory (total capacity of device 0) and free
            # cached models if we are over the cleanup threshold.
            available_memory = torch.cuda.get_device_properties(0).total_memory
            if self.gpu_memory_usage > self.gpu_memory_threshold * available_memory:
                await self._cleanup_gpu_memory()
            # Load tokenizer; ensure a pad token exists for generation.
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            # Load model with optimal settings (dtype from generation params,
            # defaulting to float16 for VRAM efficiency).
            torch_dtype = getattr(torch, generation_params.get("torch_dtype", "float16"))
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch_dtype,
                device_map="auto",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            # Cache the loaded model
            self.loaded_models[model_id] = (model, tokenizer)
            # Update memory usage tracking (bytes allocated on the device).
            current_memory = torch.cuda.memory_allocated()
            self.gpu_memory_usage = current_memory
            logger.info(f"Loaded {model_id} to GPU, memory usage: {current_memory / 1024**3:.2f} GB")
        except Exception as e:
            logger.error(f"Failed to load {model_id} to GPU: {e}")
            raise ZeroGPUError(f"Model loading failed: {e}")
async def _cleanup_gpu_memory(self):
"""Clean up GPU memory by unloading models."""
if not torch.cuda.is_available():
return
# Clear model cache
for model_id in list(self.loaded_models.keys()):
model, tokenizer = self.loaded_models.pop(model_id)
del model, tokenizer
# Force garbage collection
gc.collect()
torch.cuda.empty_cache()
self.gpu_memory_usage = 0.0
logger.info("GPU memory cleaned up")
async def __aenter__(self):
"""Async context manager entry."""
self.session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=self.request_timeout))
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self.session:
await self.session.close()
# Cleanup GPU resources
if self.enable_zerogpu:
await self._cleanup_gpu_memory()
async def close_async(self) -> None:
"""Close async client and cleanup resources (LMStudioClient compatibility)."""
if self.session:
await self.session.close()
if self.enable_zerogpu:
await self._cleanup_gpu_memory()
if self.batch_processor_task and not self.batch_processor_task.done():
self.batch_processor_task.cancel()
try:
await self.batch_processor_task
except asyncio.CancelledError:
pass
async def generate_text(self,
prompt: str,
agent_type: ModelType = ModelType.GENERAL,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
use_zerogpu: Optional[bool] = None,
priority: RequestPriority = RequestPriority.NORMAL,
**kwargs) -> HFResponse:
"""
Generate text using HuggingFace inference with ZeroGPU optimization.
Args:
prompt: Input prompt for text generation
agent_type: Type of agent requesting generation
temperature: Override temperature for this request
max_tokens: Override max tokens for this request
use_zerogpu: Force ZeroGPU usage (None for auto-detect)
priority: Request priority for processing order
**kwargs: Additional generation parameters
Returns:
HFResponse with generated text and metadata
"""
async with self.semaphore:
start_time = time.time()
self.concurrent_requests += 1
try:
# Get model configuration
config = self.model_configs.get(agent_type, self.model_configs[ModelType.GENERAL])
client = self.inference_clients.get(agent_type, self.inference_clients[ModelType.GENERAL])
# Determine if we should use ZeroGPU
should_use_zerogpu = (
use_zerogpu if use_zerogpu is not None
else (self.enable_zerogpu and config.use_zerogpu)
)
if not client and not should_use_zerogpu:
return HFResponse(
content="",
model_used=config.model_id,
tokens_used=0,
response_time=0.0,
success=False,
error=f"No inference client available for {agent_type.value}",
fallback_used=False
)
# Check token budget
estimated_tokens = max_tokens or config.max_tokens
if hasattr(self.token_budget_manager, 'can_allocate') and not self.token_budget_manager.can_allocate(estimated_tokens):
return HFResponse(
content="",
model_used=config.model_id,
tokens_used=0,
response_time=time.time() - start_time,
success=False,
error="Insufficient token budget",
fallback_used=False
)
# Prepare generation parameters
generation_params = {
"max_new_tokens": max_tokens or config.max_tokens,
"temperature": temperature or config.temperature,
"top_p": config.top_p,
"repetition_penalty": config.repetition_penalty,
"do_sample": True,
"return_full_text": False,
"torch_dtype": config.torch_dtype,
**kwargs
}
response_data = None
gpu_metrics = {}
fallback_used = False
# Try ZeroGPU first if enabled and available
if should_use_zerogpu:
try:
if self.debug_mode:
logger.info(f"Using ZeroGPU inference for {agent_type.value} with {config.model_id}")
gpu_result = await self._zerogpu_inference(
config.model_id, prompt, generation_params
)
response_data = [{
"generated_text": gpu_result["generated_text"]
}]
gpu_metrics = {
"gpu_time": gpu_result["gpu_time"],
"gpu_memory_used": gpu_result["gpu_memory_used"],
"tokens_generated": gpu_result["tokens_generated"]
}
except (ZeroGPUError, GPUMemoryError) as e:
logger.warning(f"ZeroGPU failed, falling back to Inference API: {e}")
fallback_used = True
should_use_zerogpu = False
# Fallback to Inference API if ZeroGPU failed or not enabled
if not response_data:
if not client:
raise Exception("No inference method available")
response_data = await self._make_inference_request(
client=client,
prompt=prompt,
parameters=generation_params
)
fallback_used = not should_use_zerogpu
# Process response
if response_data and isinstance(response_data, list) and len(response_data) > 0:
generated_text = response_data[0].get("generated_text", "")
tokens_used = self._estimate_tokens(prompt + generated_text)
# Allocate tokens if budget manager supports it
allocation = None
if hasattr(self.token_budget_manager, 'allocate_tokens'):
allocation = self.token_budget_manager.allocate_tokens(tokens_used)
# Track performance
response_time = time.time() - start_time
self._track_performance(agent_type, response_time, success=True)
# Update compatibility stats
self.total_tokens += tokens_used
self.total_requests += 1
self.total_response_time += response_time
if self.debug_mode:
method = "ZeroGPU" if (should_use_zerogpu and not fallback_used) else "Inference API"
logger.info(f"✅ {method} response for {agent_type.value}: {len(generated_text)} chars, {tokens_used} tokens, {response_time:.2f}s")
return HFResponse(
content=generated_text,
model_used=config.model_id,
tokens_used=tokens_used,
response_time=response_time,
success=True,
metadata={
"agent_type": agent_type.value,
"allocation_id": allocation.allocation_id if allocation else None,
"parameters": generation_params,
"method": "ZeroGPU" if (should_use_zerogpu and not fallback_used) else "Inference API"
},
gpu_memory_used=gpu_metrics.get("gpu_memory_used"),
gpu_time=gpu_metrics.get("gpu_time"),
fallback_used=fallback_used
)
else:
return HFResponse(
content="",
model_used=config.model_id,
tokens_used=0,
response_time=time.time() - start_time,
success=False,
error="Empty or invalid response from API",
fallback_used=fallback_used
)
except Exception as e:
self._track_performance(agent_type, time.time() - start_time, success=False)
logger.error(f"HF API request failed for {agent_type.value}: {e}")
return HFResponse(
content="",
model_used=config.model_id,
tokens_used=0,
response_time=time.time() - start_time,
success=False,
error=str(e),
fallback_used=False
)
finally:
self.concurrent_requests -= 1
async def _make_inference_request(self, client: InferenceClient, prompt: str, parameters: Dict[str, Any]):
"""Make inference request with proper error handling and Pro account optimizations."""
try:
# Remove ZeroGPU-specific parameters for Inference API
api_params = parameters.copy()
api_params.pop('torch_dtype', None)
# Use text generation task with Pro account optimizations
response = await asyncio.wait_for(
asyncio.create_task(
client.text_generation(
prompt=prompt,
**api_params
)
),
timeout=self.request_timeout
)
return [{"generated_text": response}] if isinstance(response, str) else response
except asyncio.TimeoutError:
raise Exception(f"Request timeout after {self.request_timeout}s")
except Exception as e:
raise Exception(f"Inference request failed: {e}")
def _estimate_tokens(self, text: str) -> int:
"""Estimate token count for text (rough approximation)."""
# Simple approximation: ~4 characters per token on average
return max(1, len(text) // 4)
def _track_performance(self, agent_type: ModelType, response_time: float, success: bool):
"""Track performance metrics for monitoring."""
# Track request counts
self.request_counts[agent_type] = self.request_counts.get(agent_type, 0) + 1
# Track response times
self.response_times.append(response_time)
if len(self.response_times) > 1000: # Keep last 1000 responses
self.response_times = self.response_times[-1000:]
# Track errors
if not success:
self.error_counts[agent_type] = self.error_counts.get(agent_type, 0) + 1
def get_performance_stats(self) -> Dict[str, Any]:
"""Get performance statistics with ZeroGPU metrics."""
avg_response_time = sum(self.response_times) / len(self.response_times) if self.response_times else 0
total_requests = sum(self.request_counts.values())
total_errors = sum(self.error_counts.values())
error_rate = (total_errors / total_requests) if total_requests > 0 else 0
# ZeroGPU specific stats
zerogpu_stats = {}
if self.enable_zerogpu and torch.cuda.is_available():
zerogpu_stats = {
"gpu_available": True,
"gpu_count": torch.cuda.device_count(),
"gpu_memory_allocated": torch.cuda.memory_allocated() / 1024**3, # GB
"gpu_memory_cached": torch.cuda.memory_reserved() / 1024**3, # GB
"loaded_models": list(self.loaded_models.keys()),
"current_gpu_memory_usage": self.gpu_memory_usage / 1024**3 if self.gpu_memory_usage else 0.0
}
base_stats = {
"total_requests": total_requests,
"total_errors": total_errors,
"error_rate": error_rate,
"avg_response_time": avg_response_time,
"requests_by_type": dict(self.request_counts),
"errors_by_type": dict(self.error_counts),
"zerogpu_enabled": self.enable_zerogpu,
"zerogpu_available": ZEROGPU_AVAILABLE,
}
if hasattr(self.token_budget_manager, 'get_status'):
base_stats["token_budget_status"] = self.token_budget_manager.get_status()
base_stats.update(zerogpu_stats)
return base_stats
def reset_performance_stats(self):
"""Reset performance tracking."""
self.request_counts.clear()
self.response_times.clear()
self.error_counts.clear()
async def health_check(self) -> Dict[str, bool]:
"""Check health of all configured models."""
health_status = {}
for model_type, config in self.model_configs.items():
try:
# Simple test request
response = await self.generate_text(
prompt="Hello",
agent_type=model_type,
max_tokens=10
)
health_status[model_type.value] = response.success
except Exception as e:
logger.error(f"Health check failed for {model_type.value}: {e}")
health_status[model_type.value] = False
return health_status
def get_available_models(self) -> Dict[str, str]:
"""Get list of available models by type."""
return {
model_type.value: config.model_id
for model_type, config in self.model_configs.items()
}
    async def batch_generate(self,
                             prompts: List[str],
                             agent_types: List[ModelType],
                             use_zerogpu_batching: bool = True,
                             **kwargs) -> List[HFResponse]:
        """
        Generate text for multiple prompts with ZeroGPU batching optimization.

        Args:
            prompts: List of input prompts
            agent_types: List of agent types (must match prompts length)
            use_zerogpu_batching: Enable ZeroGPU batch processing
            **kwargs: Additional generation parameters

        Returns:
            List of HFResponse objects, in the same order as `prompts`

        Raises:
            ValueError: If the two input lists differ in length.
        """
        if len(prompts) != len(agent_types):
            raise ValueError("Prompts and agent_types lists must have same length")
        # Use ZeroGPU batching for same model types if enabled; any failure
        # falls through to per-prompt concurrent requests below.
        if use_zerogpu_batching and self.enable_zerogpu:
            try:
                return await self._zerogpu_batch_process(prompts, agent_types, **kwargs)
            except Exception as e:
                logger.warning(f"ZeroGPU batching failed, falling back to individual requests: {e}")
        # Create tasks for concurrent execution (one generate_text per prompt).
        tasks = [
            self.generate_text(prompt=prompt, agent_type=agent_type, **kwargs)
            for prompt, agent_type in zip(prompts, agent_types)
        ]
        # Execute concurrently; the shared semaphore still limits parallelism.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Convert exceptions to error responses so callers get one
        # HFResponse per prompt regardless of individual failures.
        processed_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                processed_results.append(HFResponse(
                    content="",
                    model_used=self.model_configs[agent_types[i]].model_id,
                    tokens_used=0,
                    response_time=0.0,
                    success=False,
                    error=str(result)
                ))
            else:
                processed_results.append(result)
        return processed_results
    async def _zerogpu_batch_process(self, prompts: List[str], agent_types: List[ModelType], **kwargs) -> List[HFResponse]:
        """
        Process multiple prompts using ZeroGPU batching for efficiency.

        Prompts are grouped by resolved model ID so each model only needs one
        GPU session per call. If batch inference fails for a group, that
        group's prompts are retried one-by-one via generate_text with the
        Inference API fallback forced.

        Args:
            prompts: List of input prompts
            agent_types: List of agent types (parallel to prompts)
            **kwargs: Additional parameters; 'max_tokens' and 'temperature'
                override the per-agent model config defaults

        Returns:
            List of HFResponse objects, aligned with the input prompt order
        """
        # Group by model type for efficient batching.
        # Unknown agent types fall back to the GENERAL model configuration.
        model_groups = {}
        for i, (prompt, agent_type) in enumerate(zip(prompts, agent_types)):
            config = self.model_configs.get(agent_type, self.model_configs[ModelType.GENERAL])
            model_id = config.model_id
            if model_id not in model_groups:
                model_groups[model_id] = []
            # Carry the original index so results can be re-ordered afterwards.
            model_groups[model_id].append((i, prompt, agent_type, config))
        # Process each model group with GPU batching.
        # Pre-size the result list so positional assignment below works.
        results = [None] * len(prompts)
        start_time = time.time()
        for model_id, group_items in model_groups.items():
            batch_prompts = [item[1] for item in group_items]
            batch_configs = [item[3] for item in group_items]
            try:
                # Use first config as representative for the whole group
                # (all items in a group share the same model_id, but other
                # per-agent settings may differ — the first one wins).
                base_config = batch_configs[0]
                generation_params = {
                    "max_new_tokens": kwargs.get('max_tokens', base_config.max_tokens),
                    "temperature": kwargs.get('temperature', base_config.temperature),
                    "top_p": base_config.top_p,
                    "repetition_penalty": base_config.repetition_penalty,
                    "do_sample": True,
                    "torch_dtype": base_config.torch_dtype,
                }
                # Process the whole group in one GPU session
                batch_results = await self._zerogpu_batch_inference(
                    model_id, batch_prompts, generation_params
                )
                # Map results back to original positions
                for (orig_idx, prompt, agent_type, config), batch_result in zip(group_items, batch_results):
                    tokens_used = self._estimate_tokens(prompt + batch_result["generated_text"])
                    # Per-prompt timing if the inference reported it; otherwise
                    # fall back to elapsed time since this call started.
                    response_time = batch_result.get("response_time", time.time() - start_time)
                    results[orig_idx] = HFResponse(
                        content=batch_result["generated_text"],
                        model_used=model_id,
                        tokens_used=tokens_used,
                        response_time=response_time,
                        success=True,
                        metadata={
                            "agent_type": agent_type.value,
                            "method": "ZeroGPU-Batch",
                            "batch_size": len(batch_prompts)
                        },
                        gpu_memory_used=batch_result.get("gpu_memory_used"),
                        gpu_time=batch_result.get("gpu_time"),
                        batch_processed=len(batch_prompts),
                        fallback_used=False
                    )
            except Exception as e:
                # Fall back to individual processing for this model group only;
                # other groups are unaffected.
                logger.warning(f"Batch processing failed for {model_id}, using individual requests: {e}")
                for orig_idx, prompt, agent_type, config in group_items:
                    try:
                        individual_result = await self.generate_text(
                            prompt=prompt,
                            agent_type=agent_type,
                            use_zerogpu=False,  # Force Inference API fallback
                            **kwargs
                        )
                        results[orig_idx] = individual_result
                    except Exception as individual_e:
                        # Even the fallback failed: record an error response so
                        # the output list stays aligned with the input prompts.
                        results[orig_idx] = HFResponse(
                            content="",
                            model_used=config.model_id,
                            tokens_used=0,
                            response_time=0.0,
                            success=False,
                            error=str(individual_e),
                            fallback_used=True
                        )
        return results
async def _zerogpu_batch_inference(self, model_id: str, prompts: List[str], generation_params: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Process multiple prompts in a single ZeroGPU session for efficiency.
Args:
model_id: HuggingFace model identifier
prompts: List of input prompts
generation_params: Generation parameters
Returns:
List of generation results
"""
if not (TORCH_AVAILABLE and TRANSFORMERS_AVAILABLE):
raise ZeroGPUError("PyTorch and Transformers required for batch ZeroGPU inference")
gpu_start_time = time.time()
initial_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
try:
# Load or get cached model
if model_id not in self.loaded_models:
await self._load_model_to_gpu(model_id, generation_params)
model, tokenizer = self.loaded_models[model_id]
results = []
# Process prompts individually but in the same GPU session
for i, prompt in enumerate(prompts):
prompt_start = time.time()
# Tokenize input
inputs = tokenizer(
prompt,
return_tensors="pt",
truncation=True,
max_length=2048
).to(model.device)
# Generate with GPU acceleration
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=generation_params.get("max_new_tokens", 512),
temperature=generation_params.get("temperature", 0.7),
top_p=generation_params.get("top_p", 0.9),
do_sample=generation_params.get("do_sample", True),
pad_token_id=tokenizer.eos_token_id,
repetition_penalty=generation_params.get("repetition_penalty", 1.1)
)
# Decode response
input_length = inputs['input_ids'].shape[1]
generated_tokens = outputs[0][input_length:]
response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
prompt_end = time.time()
results.append({
"generated_text": response_text,
"response_time": prompt_end - prompt_start,
"tokens_generated": len(generated_tokens)
})
# Calculate overall GPU metrics
gpu_end_time = time.time()
final_memory = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
# Add GPU metrics to all results
total_gpu_time = gpu_end_time - gpu_start_time
gpu_memory_used = (final_memory - initial_memory) / 1024**3 # GB
for result in results:
result["gpu_time"] = total_gpu_time / len(results) # Distribute GPU time
result["gpu_memory_used"] = gpu_memory_used / len(results) # Distribute memory usage
return results
except Exception as e:
logger.error(f"ZeroGPU batch inference failed for {model_id}: {e}")
# Cleanup on error
await self._cleanup_gpu_memory()
raise ZeroGPUError(f"GPU batch inference failed: {e}")
# Utility functions for Felix Framework integration
def create_felix_hf_client(token_budget: int = 50000,
                           concurrent_requests: int = 5,
                           enable_zerogpu: bool = True,
                           debug_mode: bool = False) -> HuggingFaceClient:
    """
    Build the ZeroGPU-optimized HuggingFace client for Felix Framework on HF Spaces.

    Args:
        token_budget: Total token budget for the session
        concurrent_requests: Maximum number of in-flight requests
        enable_zerogpu: Enable ZeroGPU acceleration
        debug_mode: Enable debug logging

    Returns:
        HuggingFaceClient instance configured for ZeroGPU and HF Pro usage
    """
    # Distribute the overall budget evenly across the four typical agent roles.
    budget_manager = TokenBudgetManager(
        base_budget=token_budget // 4,
        strict_mode=True  # Strict accounting keeps ZeroGPU sessions efficient
    )

    # Per-agent model choices tuned for the ZeroGPU memory envelope.
    research_cfg = HFModelConfig(
        model_id="microsoft/DialoGPT-large",  # Upgraded for better performance
        temperature=0.9,
        max_tokens=256,
        top_p=0.95,
        use_zerogpu=True,
        batch_size=2,  # Research queries batch efficiently
        torch_dtype="float16",
        priority="normal"
    )
    analysis_cfg = HFModelConfig(
        model_id="meta-llama/Llama-3.1-8B-Instruct",  # Better reasoning capability
        temperature=0.5,
        max_tokens=384,
        top_p=0.9,
        use_zerogpu=True,
        batch_size=1,
        torch_dtype="float16",
        priority="high"  # Pro account priority
    )
    synthesis_cfg = HFModelConfig(
        model_id="Qwen/Qwen2.5-7B-Instruct",  # ZeroGPU-compatible synthesis (fits in 24GB)
        temperature=0.1,
        max_tokens=512,
        top_p=0.85,
        use_zerogpu=True,
        batch_size=1,
        torch_dtype="float16",
        gpu_memory_limit=8.0,  # 7B model fits comfortably
        priority="high"
    )
    critic_cfg = HFModelConfig(
        model_id="microsoft/DialoGPT-large",
        temperature=0.3,
        max_tokens=256,
        top_p=0.9,
        use_zerogpu=True,
        batch_size=2,
        torch_dtype="float16",
        priority="normal"
    )

    return HuggingFaceClient(
        model_configs={
            ModelType.RESEARCH: research_cfg,
            ModelType.ANALYSIS: analysis_cfg,
            ModelType.SYNTHESIS: synthesis_cfg,
            ModelType.CRITIC: critic_cfg,
        },
        token_budget_manager=budget_manager,
        max_concurrent_requests=concurrent_requests,
        request_timeout=45.0,  # Longer timeout for ZeroGPU model loading
        enable_zerogpu=enable_zerogpu,
        gpu_memory_threshold=0.85,  # Conservative memory management
        batch_timeout=3.0,  # Short batching window keeps things responsive
        debug_mode=debug_mode
    )
def create_default_client(max_concurrent_requests: int = 4,
                          enable_zerogpu: bool = True) -> HuggingFaceClient:
    """Create ZeroGPU HuggingFace client with default settings (LMStudioClient compatibility)."""
    # Thin convenience wrapper over the fully-configured Felix factory.
    options = {
        "concurrent_requests": max_concurrent_requests,
        "enable_zerogpu": enable_zerogpu,
    }
    return create_felix_hf_client(**options)
# Pro account specific optimizations
def get_pro_account_models() -> Dict[ModelType, HFModelConfig]:
    """
    Get model configurations optimized for HF Pro accounts with access to premium models.

    All entries run with "high" priority and use ZeroGPU acceleration.

    Returns:
        Dictionary of premium model configurations keyed by agent type
    """
    configs: Dict[ModelType, HFModelConfig] = {}
    configs[ModelType.RESEARCH] = HFModelConfig(
        model_id="meta-llama/Llama-3.1-8B-Instruct",  # Premium access
        temperature=0.9,
        max_tokens=384,
        use_zerogpu=True,
        batch_size=3,
        priority="high"
    )
    configs[ModelType.ANALYSIS] = HFModelConfig(
        model_id="meta-llama/Llama-3.1-8B-Instruct",  # ZeroGPU-compatible analysis (fits in 24GB)
        temperature=0.5,
        max_tokens=512,
        use_zerogpu=True,
        batch_size=1,
        gpu_memory_limit=10.0,  # 8B model fits in ZeroGPU
        priority="high"
    )
    configs[ModelType.SYNTHESIS] = HFModelConfig(
        model_id="Qwen/Qwen2.5-7B-Instruct",  # ZeroGPU-compatible synthesis (fits in 24GB)
        temperature=0.1,
        max_tokens=768,
        use_zerogpu=True,
        batch_size=1,
        gpu_memory_limit=8.0,  # 7B model fits in ZeroGPU
        priority="high"
    )
    configs[ModelType.CRITIC] = HFModelConfig(
        model_id="meta-llama/Llama-3.1-8B-Instruct",
        temperature=0.3,
        max_tokens=384,
        use_zerogpu=True,
        batch_size=2,
        priority="high"
    )
    return configs
# ZeroGPU deployment helpers
def estimate_gpu_requirements(model_configs: Dict[ModelType, HFModelConfig]) -> Dict[str, float]:
    """
    Estimate GPU memory requirements for given model configurations.

    Args:
        model_configs: Model configurations to analyze

    Returns:
        Dictionary with per-agent memory estimates plus aggregate figures,
        all in GB
    """
    # Approximate on-GPU footprints (GB) for known model IDs.
    known_sizes = {
        "microsoft/DialoGPT-medium": 1.5,
        "microsoft/DialoGPT-large": 3.0,
        "meta-llama/Llama-3.1-8B-Instruct": 16.0,
        "meta-llama/Llama-3.1-13B-Instruct": 26.0,
        "meta-llama/Llama-3.1-70B-Instruct": 140.0,
        "Qwen/Qwen2.5-7B-Instruct": 14.0
    }
    # Resolve each configured model's footprint; unknown models default to 8GB.
    per_model = [
        (agent_type, known_sizes.get(config.model_id, 8.0))
        for agent_type, config in model_configs.items()
    ]
    estimates: Dict[str, float] = {
        f"{agent_type.value}_memory": mem for agent_type, mem in per_model
    }
    footprints = [mem for _, mem in per_model]
    largest = max(footprints, default=0.0)
    estimates.update({
        "total_memory_if_all_loaded": sum(footprints),
        "max_single_model_memory": largest,
        "recommended_gpu_memory": largest * 1.5,  # 50% buffer
        "minimum_gpu_memory": largest * 1.2  # 20% buffer
    })
    return estimates
# Export main classes and functions.
# The trailing *_AVAILABLE flags advertise which optional dependencies
# (spaces / torch / transformers) were detected at import time.
__all__ = [
    'HuggingFaceClient',
    'HFResponse',
    'HFModelConfig',
    'ModelType',
    'GPUMemoryError',
    'ZeroGPUError',
    'HuggingFaceConnectionError',
    'create_felix_hf_client',
    'create_default_client',
    'get_pro_account_models',
    'estimate_gpu_requirements',
    'ZEROGPU_AVAILABLE',
    'TORCH_AVAILABLE',
    'TRANSFORMERS_AVAILABLE'
]