""" HuggingFace Pro Account Optimization Configuration for Felix Framework This module provides comprehensive optimization strategies for leveraging HF Pro account features, ZeroGPU capabilities, and cost-effective deployment while maximizing performance. Key Features: - Premium model access with intelligent model selection - ZeroGPU optimization for cost efficiency - Advanced caching strategies for reduced compute costs - Performance monitoring with cost analytics - Scalable architecture for increased user loads - Automated resource allocation and optimization HF Pro Benefits Leveraged: - Higher concurrent user limits - Priority access to premium models - Enhanced ZeroGPU allocation and priority - Advanced analytics and usage monitoring - Priority support and faster deployment queues """ import os import json import time import asyncio import logging from typing import Dict, List, Optional, Any, Tuple from dataclasses import dataclass, field from enum import Enum from collections import defaultdict, OrderedDict import numpy as np from datetime import datetime, timedelta logger = logging.getLogger(__name__) class ModelTier(Enum): """Model tiers based on HF Pro access and performance.""" PREMIUM_80B = "premium_80b" # Qwen3-Next-80B-A3B series EFFICIENT_30B = "efficient_30b" # Specialized models FAST_7B = "fast_7b" # Quick response models EDGE_1B = "edge_1b" # Ultra-fast edge models class ResourceUsageLevel(Enum): """Resource usage levels for cost optimization.""" MINIMAL = "minimal" # <10% GPU usage MODERATE = "moderate" # 10-30% GPU usage STANDARD = "standard" # 30-60% GPU usage INTENSIVE = "intensive" # 60-80% GPU usage MAXIMUM = "maximum" # 80%+ GPU usage @dataclass class ModelConfig: """Configuration for a premium model.""" model_id: str tier: ModelTier max_tokens: int = 1024 temperature: float = 0.7 cost_per_token: float = 0.0001 avg_response_time: float = 2.0 quality_score: float = 0.85 supports_zerogpu: bool = True concurrent_limit: int = 5 @dataclass class UsageMetrics: """Usage and cost metrics tracking.""" total_requests: int = 0 total_tokens: int = 0 total_cost: float = 0.0 avg_response_time: float = 0.0 success_rate: float = 1.0 gpu_utilization: float = 0.0 cache_hit_rate: float = 0.0 concurrent_users: int = 0 peak_concurrent: int = 0 last_reset: datetime = field(default_factory=datetime.now) class HFProOptimizer: """ HuggingFace Pro account optimizer for Felix Framework. Provides intelligent model selection, cost optimization, and performance monitoring specifically designed for HF Pro account features. """ # Premium model configurations optimized for Felix Framework PREMIUM_MODELS = { ModelTier.PREMIUM_80B: [ ModelConfig( model_id="Qwen/Qwen3-Next-80B-A3B-Instruct", tier=ModelTier.PREMIUM_80B, max_tokens=2048, temperature=0.1, cost_per_token=0.0002, avg_response_time=4.5, quality_score=0.95, concurrent_limit=3 ), ModelConfig( model_id="Qwen/Qwen3-Next-80B-A3B-Thinking", tier=ModelTier.PREMIUM_80B, max_tokens=1536, temperature=0.3, cost_per_token=0.00018, avg_response_time=3.8, quality_score=0.93, concurrent_limit=3 ) ], ModelTier.EFFICIENT_30B: [ ModelConfig( model_id="Alibaba-NLP/Tongyi-DeepResearch-30B-A3B", tier=ModelTier.EFFICIENT_30B, max_tokens=1024, temperature=0.5, cost_per_token=0.00012, avg_response_time=2.5, quality_score=0.88, concurrent_limit=5 ), ModelConfig( model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct", tier=ModelTier.EFFICIENT_30B, max_tokens=1024, temperature=0.2, cost_per_token=0.0001, avg_response_time=2.2, quality_score=0.86, concurrent_limit=6 ) ], ModelTier.FAST_7B: [ ModelConfig( model_id="LLM360/K2-Think", tier=ModelTier.FAST_7B, max_tokens=512, temperature=0.7, cost_per_token=0.00005, avg_response_time=1.2, quality_score=0.82, concurrent_limit=10 ) ], ModelTier.EDGE_1B: [ ModelConfig( model_id="facebook/MobileLLM-R1-950M", tier=ModelTier.EDGE_1B, max_tokens=256, temperature=0.8, cost_per_token=0.00002, avg_response_time=0.5, quality_score=0.75, concurrent_limit=20 ) ] } # Felix agent type to model tier mapping AGENT_MODEL_MAPPING = { "synthesis": ModelTier.PREMIUM_80B, # Highest quality output "analysis": ModelTier.EFFICIENT_30B, # Balanced performance "research": ModelTier.FAST_7B, # Quick exploration "critic": ModelTier.EFFICIENT_30B, # Thorough evaluation "general": ModelTier.FAST_7B # Default fast processing } def __init__(self, hf_token: Optional[str] = None, monthly_budget: float = 100.0, target_cost_per_request: float = 0.05, enable_advanced_caching: bool = True, enable_cost_alerts: bool = True): """ Initialize HF Pro optimizer. Args: hf_token: HuggingFace API token with Pro access monthly_budget: Monthly budget in USD target_cost_per_request: Target cost per Felix request enable_advanced_caching: Enable intelligent caching enable_cost_alerts: Enable cost monitoring alerts """ self.hf_token = hf_token or os.getenv("HF_TOKEN") self.monthly_budget = monthly_budget self.target_cost_per_request = target_cost_per_request self.enable_advanced_caching = enable_advanced_caching self.enable_cost_alerts = enable_cost_alerts # Initialize metrics tracking self.metrics = UsageMetrics() self.hourly_metrics: Dict[str, UsageMetrics] = defaultdict(UsageMetrics) self.model_performance: Dict[str, Dict] = defaultdict(dict) # Advanced caching system self.cache = OrderedDict() if enable_advanced_caching else None self.cache_stats = {"hits": 0, "misses": 0, "size": 0} # Resource monitoring self.resource_usage = ResourceUsageLevel.MINIMAL self.concurrent_requests = 0 self.request_queue = asyncio.Queue() logger.info(f"HF Pro Optimizer initialized - Budget: ${monthly_budget}/month") def select_optimal_model(self, agent_type: str, task_complexity: str, current_load: int = 0, budget_remaining: float = 1.0) -> ModelConfig: """ Select optimal model based on agent type, complexity, and constraints. Args: agent_type: Type of Felix agent requesting model task_complexity: Complexity level (demo/simple/medium/complex/research) current_load: Current system load (0-100) budget_remaining: Remaining budget percentage (0.0-1.0) Returns: Optimal ModelConfig for the request """ # Get base tier for agent type base_tier = self.AGENT_MODEL_MAPPING.get(agent_type, ModelTier.FAST_7B) # Adjust tier based on complexity and constraints if task_complexity in ["research", "complex"] and budget_remaining > 0.3: # Use premium models for complex tasks if budget allows if base_tier in [ModelTier.EFFICIENT_30B, ModelTier.PREMIUM_80B]: target_tier = ModelTier.PREMIUM_80B else: target_tier = ModelTier.EFFICIENT_30B elif current_load > 70 or budget_remaining < 0.2: # Use efficient models under high load or low budget if base_tier == ModelTier.PREMIUM_80B: target_tier = ModelTier.EFFICIENT_30B elif base_tier == ModelTier.EFFICIENT_30B: target_tier = ModelTier.FAST_7B else: target_tier = ModelTier.EDGE_1B else: target_tier = base_tier # Select best model from tier available_models = self.PREMIUM_MODELS.get(target_tier, []) if not available_models: # Fallback to fast tier available_models = self.PREMIUM_MODELS[ModelTier.FAST_7B] # Select model with best performance/cost ratio for current load best_model = min(available_models, key=lambda m: self._calculate_selection_score(m, current_load)) logger.info(f"Selected {best_model.model_id} for {agent_type} agent (complexity: {task_complexity})") return best_model def _calculate_selection_score(self, model: ModelConfig, current_load: int) -> float: """Calculate model selection score (lower is better).""" # Base score from cost per token score = model.cost_per_token * 1000 # Adjust for current load (prefer faster models under high load) if current_load > 50: score += model.avg_response_time * 0.5 # Prefer models with higher quality score -= model.quality_score * 0.2 # Prefer models with higher concurrent limits under load if current_load > 30: score -= (model.concurrent_limit / 20) * 0.1 return score @staticmethod def create_zerogpu_decorator(): """Create ZeroGPU decorator for cost-efficient GPU usage.""" try: import spaces return spaces.GPU(duration=120) # 2-minute GPU allocation except ImportError: logger.warning("ZeroGPU not available - running without GPU optimization") return lambda x: x def estimate_request_cost(self, agent_count: int, complexity: str, estimated_tokens_per_agent: int = 300) -> Dict[str, Any]: """ Estimate cost for a Felix Framework request. Args: agent_count: Number of agents in the request complexity: Task complexity level estimated_tokens_per_agent: Estimated tokens per agent Returns: Cost estimation with breakdown """ total_cost = 0.0 model_breakdown = {} # Estimate cost for each agent type agent_types = ["research", "analysis", "synthesis", "critic"] agents_per_type = agent_count // len(agent_types) for agent_type in agent_types: model = self.select_optimal_model( agent_type=agent_type, task_complexity=complexity, budget_remaining=1.0 # Full budget for estimation ) type_cost = (agents_per_type * estimated_tokens_per_agent * model.cost_per_token) total_cost += type_cost model_breakdown[agent_type] = { "model_id": model.model_id, "agents": agents_per_type, "estimated_tokens": agents_per_type * estimated_tokens_per_agent, "cost": type_cost } return { "total_estimated_cost": total_cost, "cost_per_agent": total_cost / agent_count, "model_breakdown": model_breakdown, "within_target": total_cost <= self.target_cost_per_request, "budget_utilization": total_cost / self.target_cost_per_request } def get_cache_key(self, task_input: str, agent_type: str, complexity: str) -> str: """Generate cache key for task input.""" import hashlib content = f"{task_input}_{agent_type}_{complexity}" return hashlib.md5(content.encode()).hexdigest() def get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]: """Get cached result if available.""" if not self.cache: return None if cache_key in self.cache: # Move to end (LRU) result = self.cache.pop(cache_key) self.cache[cache_key] = result self.cache_stats["hits"] += 1 return result self.cache_stats["misses"] += 1 return None def cache_result(self, cache_key: str, result: Dict[str, Any], max_cache_size: int = 1000): """Cache a result.""" if not self.cache: return # Remove oldest if at capacity if len(self.cache) >= max_cache_size and cache_key not in self.cache: self.cache.popitem(last=False) self.cache[cache_key] = result self.cache_stats["size"] = len(self.cache) def update_metrics(self, model_id: str, tokens_used: int, response_time: float, success: bool, cost: float): """Update usage metrics.""" # Update global metrics self.metrics.total_requests += 1 self.metrics.total_tokens += tokens_used self.metrics.total_cost += cost # Update running averages self.metrics.avg_response_time = ( (self.metrics.avg_response_time * (self.metrics.total_requests - 1) + response_time) / self.metrics.total_requests ) if success: success_count = self.metrics.total_requests * self.metrics.success_rate self.metrics.success_rate = (success_count + 1) / self.metrics.total_requests else: success_count = self.metrics.total_requests * self.metrics.success_rate self.metrics.success_rate = success_count / self.metrics.total_requests # Update hourly metrics hour_key = datetime.now().strftime("%Y-%m-%d-%H") hourly = self.hourly_metrics[hour_key] hourly.total_requests += 1 hourly.total_tokens += tokens_used hourly.total_cost += cost # Update model performance tracking if model_id not in self.model_performance: self.model_performance[model_id] = { "requests": 0, "avg_response_time": 0.0, "success_rate": 1.0, "total_cost": 0.0 } model_stats = self.model_performance[model_id] model_stats["requests"] += 1 model_stats["avg_response_time"] = ( (model_stats["avg_response_time"] * (model_stats["requests"] - 1) + response_time) / model_stats["requests"] ) model_stats["total_cost"] += cost # Check for cost alerts if self.enable_cost_alerts: self._check_cost_alerts() def _check_cost_alerts(self): """Check for cost threshold alerts.""" daily_budget = self.monthly_budget / 30 current_daily_cost = sum( metrics.total_cost for hour, metrics in self.hourly_metrics.items() if hour.startswith(datetime.now().strftime("%Y-%m-%d")) ) if current_daily_cost > daily_budget * 0.8: logger.warning(f"Daily cost approaching limit: ${current_daily_cost:.2f} / ${daily_budget:.2f}") if current_daily_cost > daily_budget: logger.error(f"Daily budget exceeded: ${current_daily_cost:.2f} / ${daily_budget:.2f}") def get_performance_dashboard(self) -> Dict[str, Any]: """Get comprehensive performance dashboard data.""" cache_hit_rate = ( self.cache_stats["hits"] / (self.cache_stats["hits"] + self.cache_stats["misses"]) if (self.cache_stats["hits"] + self.cache_stats["misses"]) > 0 else 0 ) return { "overview": { "total_requests": self.metrics.total_requests, "total_cost": self.metrics.total_cost, "avg_cost_per_request": ( self.metrics.total_cost / self.metrics.total_requests if self.metrics.total_requests > 0 else 0 ), "success_rate": self.metrics.success_rate, "avg_response_time": self.metrics.avg_response_time }, "budget": { "monthly_budget": self.monthly_budget, "spent_this_month": self.metrics.total_cost, "remaining_budget": self.monthly_budget - self.metrics.total_cost, "burn_rate": self.metrics.total_cost / max(1, (datetime.now().day)), "projected_monthly": self.metrics.total_cost / max(1, (datetime.now().day)) * 30 }, "performance": { "cache_hit_rate": cache_hit_rate, "cache_size": self.cache_stats["size"], "concurrent_users": self.metrics.concurrent_users, "peak_concurrent": self.metrics.peak_concurrent }, "models": { model_id: { "requests": stats["requests"], "avg_response_time": stats["avg_response_time"], "total_cost": stats["total_cost"], "cost_per_request": stats["total_cost"] / max(1, stats["requests"]) } for model_id, stats in self.model_performance.items() }, "optimization_suggestions": self._get_optimization_suggestions() } def _get_optimization_suggestions(self) -> List[str]: """Generate optimization suggestions based on usage patterns.""" suggestions = [] # Cache efficiency cache_hit_rate = ( self.cache_stats["hits"] / (self.cache_stats["hits"] + self.cache_stats["misses"]) if (self.cache_stats["hits"] + self.cache_stats["misses"]) > 0 else 0 ) if cache_hit_rate < 0.3: suggestions.append("Consider increasing cache size or improving cache key strategy") # Cost efficiency avg_cost = ( self.metrics.total_cost / self.metrics.total_requests if self.metrics.total_requests > 0 else 0 ) if avg_cost > self.target_cost_per_request * 1.2: suggestions.append("Consider using more efficient models for routine tasks") # Performance optimization if self.metrics.avg_response_time > 5.0: suggestions.append("Consider using faster models or reducing complexity for real-time tasks") # Budget management if self.metrics.total_cost > self.monthly_budget * 0.8: suggestions.append("Approaching monthly budget limit - consider cost controls") return suggestions async def optimize_request_flow(self, task_requests: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """ Optimize a batch of Felix Framework requests for cost and performance. Args: task_requests: List of task request dictionaries Returns: Optimized request configurations """ optimized_requests = [] # Sort requests by priority and complexity sorted_requests = sorted(task_requests, key=lambda x: (x.get("priority", 5), x.get("complexity", "medium"))) current_load = len(sorted_requests) budget_remaining = ( (self.monthly_budget - self.metrics.total_cost) / self.monthly_budget ) for i, request in enumerate(sorted_requests): # Adjust remaining budget based on position in queue adjusted_budget = budget_remaining * (1 - i / len(sorted_requests)) # Select optimal model configuration optimal_model = self.select_optimal_model( agent_type=request.get("agent_type", "general"), task_complexity=request.get("complexity", "medium"), current_load=current_load, budget_remaining=adjusted_budget ) # Check cache first cache_key = self.get_cache_key( request.get("task_input", ""), request.get("agent_type", "general"), request.get("complexity", "medium") ) cached_result = self.get_cached_result(cache_key) optimized_request = { **request, "model_config": optimal_model, "cache_key": cache_key, "cached_result": cached_result, "estimated_cost": self.estimate_request_cost( agent_count=request.get("agent_count", 8), complexity=request.get("complexity", "medium") ), "optimization_applied": True } optimized_requests.append(optimized_request) return optimized_requests # Factory function for easy integration def create_hf_pro_optimizer(monthly_budget: float = 100.0) -> HFProOptimizer: """ Create HF Pro optimizer with recommended settings. Args: monthly_budget: Monthly budget in USD Returns: Configured HFProOptimizer instance """ return HFProOptimizer( monthly_budget=monthly_budget, target_cost_per_request=0.05, # 5 cents per Felix request enable_advanced_caching=True, enable_cost_alerts=True ) # Export main classes __all__ = [ 'HFProOptimizer', 'ModelTier', 'ModelConfig', 'ResourceUsageLevel', 'UsageMetrics', 'create_hf_pro_optimizer' ]