# felix-framework / config / hf_pro_optimization.py
# Author: jkbennitt
# Commit fb867c3: Clean hf-space branch and prepare for HuggingFace Spaces deployment
"""
HuggingFace Pro Account Optimization Configuration for Felix Framework
This module provides comprehensive optimization strategies for leveraging HF Pro account
features, ZeroGPU capabilities, and cost-effective deployment while maximizing performance.
Key Features:
- Premium model access with intelligent model selection
- ZeroGPU optimization for cost efficiency
- Advanced caching strategies for reduced compute costs
- Performance monitoring with cost analytics
- Scalable architecture for increased user loads
- Automated resource allocation and optimization
HF Pro Benefits Leveraged:
- Higher concurrent user limits
- Priority access to premium models
- Enhanced ZeroGPU allocation and priority
- Advanced analytics and usage monitoring
- Priority support and faster deployment queues
"""
import os
import json
import time
import asyncio
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
from enum import Enum
from collections import defaultdict, OrderedDict
import numpy as np
from datetime import datetime, timedelta
# Module-level logger, one per module per stdlib logging convention.
logger = logging.getLogger(__name__)
class ModelTier(Enum):
    """Model tiers based on HF Pro access and performance.

    Tiers trade quality for cost/latency; HFProOptimizer maps Felix agent
    types to a base tier and promotes/demotes based on load and budget.
    """
    PREMIUM_80B = "premium_80b"      # Qwen3-Next-80B-A3B series (highest quality, slowest)
    EFFICIENT_30B = "efficient_30b"  # Specialized 30B models (balanced)
    FAST_7B = "fast_7b"              # Quick response models
    EDGE_1B = "edge_1b"              # Ultra-fast edge models (cheapest)
class ResourceUsageLevel(Enum):
    """Resource usage levels for cost optimization.

    Coarse buckets of GPU utilization; HFProOptimizer initializes its
    `resource_usage` attribute to MINIMAL.
    """
    MINIMAL = "minimal"      # <10% GPU usage
    MODERATE = "moderate"    # 10-30% GPU usage
    STANDARD = "standard"    # 30-60% GPU usage
    INTENSIVE = "intensive"  # 60-80% GPU usage
    MAXIMUM = "maximum"      # 80%+ GPU usage
@dataclass
class ModelConfig:
    """Configuration for a premium model.

    Carries the routing metadata (cost, latency, quality, concurrency)
    that HFProOptimizer uses to score and select models per request.
    """
    model_id: str                    # HF Hub repo id, e.g. "Qwen/Qwen3-Next-80B-A3B-Instruct"
    tier: ModelTier                  # capability/cost tier this model belongs to
    max_tokens: int = 1024           # generation cap passed to the model
    temperature: float = 0.7         # default sampling temperature
    cost_per_token: float = 0.0001   # estimated USD per token, used for cost estimates
    avg_response_time: float = 2.0   # expected latency in seconds (used in scoring)
    quality_score: float = 0.85      # relative output quality in [0, 1] (used in scoring)
    supports_zerogpu: bool = True    # whether the model can run under ZeroGPU allocation
    concurrent_limit: int = 5        # max simultaneous requests this model should serve
@dataclass
class UsageMetrics:
    """Usage and cost metrics tracking.

    One instance tracks global totals; HFProOptimizer also keeps one
    per hour in `hourly_metrics` (only request/token/cost fields are
    updated for hourly buckets).
    """
    total_requests: int = 0          # count of completed requests
    total_tokens: int = 0            # cumulative tokens consumed
    total_cost: float = 0.0          # cumulative cost in USD
    avg_response_time: float = 0.0   # running mean response time (seconds)
    success_rate: float = 1.0        # fraction of successful requests in [0, 1]
    gpu_utilization: float = 0.0     # last observed GPU utilization fraction
    cache_hit_rate: float = 0.0      # last observed cache hit fraction
    concurrent_users: int = 0        # current concurrent users
    peak_concurrent: int = 0         # high-water mark of concurrent users
    last_reset: datetime = field(default_factory=datetime.now)  # when counters were last reset
class HFProOptimizer:
    """
    HuggingFace Pro account optimizer for Felix Framework.

    Provides intelligent model selection, cost optimization, and performance
    monitoring specifically designed for HF Pro account features.
    """
    # Premium model configurations optimized for Felix Framework.
    # Keyed by tier; selection within a tier is done per-request by
    # _calculate_selection_score().
    PREMIUM_MODELS = {
        ModelTier.PREMIUM_80B: [
            ModelConfig(
                model_id="Qwen/Qwen3-Next-80B-A3B-Instruct",
                tier=ModelTier.PREMIUM_80B,
                max_tokens=2048,
                temperature=0.1,
                cost_per_token=0.0002,
                avg_response_time=4.5,
                quality_score=0.95,
                concurrent_limit=3
            ),
            ModelConfig(
                model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
                tier=ModelTier.PREMIUM_80B,
                max_tokens=1536,
                temperature=0.3,
                cost_per_token=0.00018,
                avg_response_time=3.8,
                quality_score=0.93,
                concurrent_limit=3
            )
        ],
        ModelTier.EFFICIENT_30B: [
            ModelConfig(
                model_id="Alibaba-NLP/Tongyi-DeepResearch-30B-A3B",
                tier=ModelTier.EFFICIENT_30B,
                max_tokens=1024,
                temperature=0.5,
                cost_per_token=0.00012,
                avg_response_time=2.5,
                quality_score=0.88,
                concurrent_limit=5
            ),
            ModelConfig(
                model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
                tier=ModelTier.EFFICIENT_30B,
                max_tokens=1024,
                temperature=0.2,
                cost_per_token=0.0001,
                avg_response_time=2.2,
                quality_score=0.86,
                concurrent_limit=6
            )
        ],
        ModelTier.FAST_7B: [
            ModelConfig(
                model_id="LLM360/K2-Think",
                tier=ModelTier.FAST_7B,
                max_tokens=512,
                temperature=0.7,
                cost_per_token=0.00005,
                avg_response_time=1.2,
                quality_score=0.82,
                concurrent_limit=10
            )
        ],
        ModelTier.EDGE_1B: [
            ModelConfig(
                model_id="facebook/MobileLLM-R1-950M",
                tier=ModelTier.EDGE_1B,
                max_tokens=256,
                temperature=0.8,
                cost_per_token=0.00002,
                avg_response_time=0.5,
                quality_score=0.75,
                concurrent_limit=20
            )
        ]
    }
    # Felix agent type to base model tier mapping; select_optimal_model()
    # may promote/demote the tier based on complexity, load and budget.
    AGENT_MODEL_MAPPING = {
        "synthesis": ModelTier.PREMIUM_80B,   # Highest quality output
        "analysis": ModelTier.EFFICIENT_30B,  # Balanced performance
        "research": ModelTier.FAST_7B,        # Quick exploration
        "critic": ModelTier.EFFICIENT_30B,    # Thorough evaluation
        "general": ModelTier.FAST_7B          # Default fast processing
    }

    def __init__(self,
                 hf_token: Optional[str] = None,
                 monthly_budget: float = 100.0,
                 target_cost_per_request: float = 0.05,
                 enable_advanced_caching: bool = True,
                 enable_cost_alerts: bool = True):
        """
        Initialize HF Pro optimizer.

        Args:
            hf_token: HuggingFace API token with Pro access (falls back to
                the HF_TOKEN environment variable).
            monthly_budget: Monthly budget in USD.
            target_cost_per_request: Target cost per Felix request in USD.
            enable_advanced_caching: Enable intelligent LRU caching.
            enable_cost_alerts: Enable cost monitoring alerts.
        """
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self.monthly_budget = monthly_budget
        self.target_cost_per_request = target_cost_per_request
        self.enable_advanced_caching = enable_advanced_caching
        self.enable_cost_alerts = enable_cost_alerts
        # Metrics tracking: global totals plus per-hour buckets keyed by
        # "%Y-%m-%d-%H" strings, and per-model aggregates.
        self.metrics = UsageMetrics()
        self.hourly_metrics: Dict[str, UsageMetrics] = defaultdict(UsageMetrics)
        self.model_performance: Dict[str, Dict] = defaultdict(dict)
        # LRU cache. None means caching is disabled; an *empty* OrderedDict
        # means enabled-but-empty. The two must never be conflated (see the
        # explicit `is None` checks in get_cached_result/cache_result).
        self.cache: Optional[OrderedDict] = OrderedDict() if enable_advanced_caching else None
        self.cache_stats = {"hits": 0, "misses": 0, "size": 0}
        # Resource monitoring
        self.resource_usage = ResourceUsageLevel.MINIMAL
        self.concurrent_requests = 0
        self.request_queue = asyncio.Queue()
        logger.info(f"HF Pro Optimizer initialized - Budget: ${monthly_budget}/month")

    def select_optimal_model(self,
                             agent_type: str,
                             task_complexity: str,
                             current_load: int = 0,
                             budget_remaining: float = 1.0) -> ModelConfig:
        """
        Select optimal model based on agent type, complexity, and constraints.

        Args:
            agent_type: Type of Felix agent requesting model
            task_complexity: Complexity level (demo/simple/medium/complex/research)
            current_load: Current system load (0-100)
            budget_remaining: Remaining budget fraction (0.0-1.0)

        Returns:
            Optimal ModelConfig for the request
        """
        # Unknown agent types fall back to the fast tier.
        base_tier = self.AGENT_MODEL_MAPPING.get(agent_type, ModelTier.FAST_7B)
        # Promote for complex work when budget allows; demote under high
        # load or low budget; otherwise keep the agent's base tier.
        if task_complexity in ["research", "complex"] and budget_remaining > 0.3:
            if base_tier in [ModelTier.EFFICIENT_30B, ModelTier.PREMIUM_80B]:
                target_tier = ModelTier.PREMIUM_80B
            else:
                target_tier = ModelTier.EFFICIENT_30B
        elif current_load > 70 or budget_remaining < 0.2:
            if base_tier == ModelTier.PREMIUM_80B:
                target_tier = ModelTier.EFFICIENT_30B
            elif base_tier == ModelTier.EFFICIENT_30B:
                target_tier = ModelTier.FAST_7B
            else:
                target_tier = ModelTier.EDGE_1B
        else:
            target_tier = base_tier
        # Pick the best-scoring model in the tier (fast tier as fallback).
        available_models = self.PREMIUM_MODELS.get(target_tier, [])
        if not available_models:
            available_models = self.PREMIUM_MODELS[ModelTier.FAST_7B]
        best_model = min(available_models,
                         key=lambda m: self._calculate_selection_score(m, current_load))
        logger.info(f"Selected {best_model.model_id} for {agent_type} agent (complexity: {task_complexity})")
        return best_model

    def _calculate_selection_score(self, model: ModelConfig, current_load: int) -> float:
        """Calculate model selection score (lower is better).

        Combines cost per token (dominant term), latency under load,
        quality, and concurrency headroom under load.
        """
        # Base score from cost per token (scaled so it dominates).
        score = model.cost_per_token * 1000
        # Prefer faster models under high load.
        if current_load > 50:
            score += model.avg_response_time * 0.5
        # Prefer higher quality (subtracts from score).
        if True:  # quality always counts
            score -= model.quality_score * 0.2
        # Prefer higher concurrent limits under moderate-or-higher load.
        if current_load > 30:
            score -= (model.concurrent_limit / 20) * 0.1
        return score

    @staticmethod
    def create_zerogpu_decorator():
        """Create ZeroGPU decorator for cost-efficient GPU usage.

        Returns the `spaces.GPU` decorator (2-minute allocation) when the
        `spaces` package is available, otherwise an identity decorator so
        callers work unchanged outside HF Spaces.
        """
        try:
            import spaces
            return spaces.GPU(duration=120)  # 2-minute GPU allocation
        except ImportError:
            logger.warning("ZeroGPU not available - running without GPU optimization")
            return lambda x: x

    def estimate_request_cost(self,
                              agent_count: int,
                              complexity: str,
                              estimated_tokens_per_agent: int = 300) -> Dict[str, Any]:
        """
        Estimate cost for a Felix Framework request.

        Args:
            agent_count: Number of agents in the request (must be >= 1)
            complexity: Task complexity level
            estimated_tokens_per_agent: Estimated tokens per agent

        Returns:
            Cost estimation with breakdown

        Raises:
            ValueError: If agent_count is not positive.
        """
        if agent_count <= 0:
            raise ValueError(f"agent_count must be positive, got {agent_count}")
        total_cost = 0.0
        model_breakdown = {}
        agent_types = ["research", "analysis", "synthesis", "critic"]
        # Distribute agents across types; remainder agents go one each to
        # the leading types so all agent_count agents are costed (the old
        # floor division silently dropped agent_count % 4 agents).
        base, extra = divmod(agent_count, len(agent_types))
        for idx, agent_type in enumerate(agent_types):
            agents_for_type = base + (1 if idx < extra else 0)
            model = self.select_optimal_model(
                agent_type=agent_type,
                task_complexity=complexity,
                budget_remaining=1.0  # Full budget for estimation
            )
            type_cost = (agents_for_type * estimated_tokens_per_agent *
                         model.cost_per_token)
            total_cost += type_cost
            model_breakdown[agent_type] = {
                "model_id": model.model_id,
                "agents": agents_for_type,
                "estimated_tokens": agents_for_type * estimated_tokens_per_agent,
                "cost": type_cost
            }
        return {
            "total_estimated_cost": total_cost,
            "cost_per_agent": total_cost / agent_count,
            "model_breakdown": model_breakdown,
            "within_target": total_cost <= self.target_cost_per_request,
            "budget_utilization": total_cost / self.target_cost_per_request
        }

    def get_cache_key(self, task_input: str, agent_type: str, complexity: str) -> str:
        """Generate cache key for task input (md5 is fine: keying only,
        not security)."""
        import hashlib
        content = f"{task_input}_{agent_type}_{complexity}"
        return hashlib.md5(content.encode()).hexdigest()

    def get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached result if available, updating LRU order and stats."""
        # BUGFIX: was `if not self.cache`, which is also True for an
        # *empty* (but enabled) cache and skipped miss accounting.
        if self.cache is None:
            return None
        if cache_key in self.cache:
            # Refresh LRU position (most recently used at the end).
            self.cache.move_to_end(cache_key)
            self.cache_stats["hits"] += 1
            return self.cache[cache_key]
        self.cache_stats["misses"] += 1
        return None

    def cache_result(self, cache_key: str, result: Dict[str, Any], max_cache_size: int = 1000):
        """Cache a result, evicting the least-recently-used entry at capacity."""
        # BUGFIX: was `if not self.cache`, which is True for an empty
        # OrderedDict — so the very first result was never stored and the
        # cache stayed permanently empty. `is None` is the disabled check.
        if self.cache is None:
            return
        if len(self.cache) >= max_cache_size and cache_key not in self.cache:
            self.cache.popitem(last=False)  # evict oldest (LRU)
        self.cache[cache_key] = result
        self.cache_stats["size"] = len(self.cache)

    def update_metrics(self,
                       model_id: str,
                       tokens_used: int,
                       response_time: float,
                       success: bool,
                       cost: float):
        """Update global, hourly, and per-model usage metrics.

        Args:
            model_id: Identifier of the model that served the request.
            tokens_used: Tokens consumed by the request.
            response_time: Wall-clock response time in seconds.
            success: Whether the request succeeded.
            cost: Cost of the request in USD.
        """
        # Global totals
        self.metrics.total_requests += 1
        self.metrics.total_tokens += tokens_used
        self.metrics.total_cost += cost
        # Incremental running mean of response time.
        self.metrics.avg_response_time = (
            (self.metrics.avg_response_time * (self.metrics.total_requests - 1) + response_time) /
            self.metrics.total_requests
        )
        # BUGFIX: the previous success count must be derived from the
        # *pre-increment* request count. The old code multiplied the new
        # total by the old rate, inflating the count (first success gave
        # success_rate == 2.0).
        prev_requests = self.metrics.total_requests - 1
        prev_successes = prev_requests * self.metrics.success_rate
        self.metrics.success_rate = (
            (prev_successes + (1 if success else 0)) / self.metrics.total_requests
        )
        # Hourly bucket (only counters are tracked per hour).
        hour_key = datetime.now().strftime("%Y-%m-%d-%H")
        hourly = self.hourly_metrics[hour_key]
        hourly.total_requests += 1
        hourly.total_tokens += tokens_used
        hourly.total_cost += cost
        # Per-model aggregates.
        if model_id not in self.model_performance:
            self.model_performance[model_id] = {
                "requests": 0,
                "avg_response_time": 0.0,
                "success_rate": 1.0,
                "total_cost": 0.0
            }
        model_stats = self.model_performance[model_id]
        model_stats["requests"] += 1
        model_stats["avg_response_time"] = (
            (model_stats["avg_response_time"] * (model_stats["requests"] - 1) + response_time) /
            model_stats["requests"]
        )
        model_stats["total_cost"] += cost
        if self.enable_cost_alerts:
            self._check_cost_alerts()

    def _check_cost_alerts(self):
        """Log warnings/errors when today's spend nears/exceeds the daily budget
        (monthly budget spread evenly over 30 days)."""
        daily_budget = self.monthly_budget / 30
        today_prefix = datetime.now().strftime("%Y-%m-%d")
        current_daily_cost = sum(
            metrics.total_cost for hour, metrics in self.hourly_metrics.items()
            if hour.startswith(today_prefix)
        )
        if current_daily_cost > daily_budget * 0.8:
            logger.warning(f"Daily cost approaching limit: ${current_daily_cost:.2f} / ${daily_budget:.2f}")
        if current_daily_cost > daily_budget:
            logger.error(f"Daily budget exceeded: ${current_daily_cost:.2f} / ${daily_budget:.2f}")

    def _cache_hit_rate(self) -> float:
        """Return the cache hit fraction, or 0 when no lookups have occurred.

        Shared by the dashboard and the optimization suggestions (was
        duplicated inline in both).
        """
        lookups = self.cache_stats["hits"] + self.cache_stats["misses"]
        return self.cache_stats["hits"] / lookups if lookups > 0 else 0

    def get_performance_dashboard(self) -> Dict[str, Any]:
        """Get comprehensive performance dashboard data.

        Returns a dict with "overview", "budget", "performance", "models",
        and "optimization_suggestions" sections.
        """
        return {
            "overview": {
                "total_requests": self.metrics.total_requests,
                "total_cost": self.metrics.total_cost,
                "avg_cost_per_request": (
                    self.metrics.total_cost / self.metrics.total_requests
                    if self.metrics.total_requests > 0 else 0
                ),
                "success_rate": self.metrics.success_rate,
                "avg_response_time": self.metrics.avg_response_time
            },
            "budget": {
                "monthly_budget": self.monthly_budget,
                "spent_this_month": self.metrics.total_cost,
                "remaining_budget": self.monthly_budget - self.metrics.total_cost,
                # Spend per elapsed day of the current month (max(1, ...)
                # guards the first day).
                "burn_rate": self.metrics.total_cost / max(1, (datetime.now().day)),
                "projected_monthly": self.metrics.total_cost / max(1, (datetime.now().day)) * 30
            },
            "performance": {
                "cache_hit_rate": self._cache_hit_rate(),
                "cache_size": self.cache_stats["size"],
                "concurrent_users": self.metrics.concurrent_users,
                "peak_concurrent": self.metrics.peak_concurrent
            },
            "models": {
                model_id: {
                    "requests": stats["requests"],
                    "avg_response_time": stats["avg_response_time"],
                    "total_cost": stats["total_cost"],
                    "cost_per_request": stats["total_cost"] / max(1, stats["requests"])
                }
                for model_id, stats in self.model_performance.items()
            },
            "optimization_suggestions": self._get_optimization_suggestions()
        }

    def _get_optimization_suggestions(self) -> List[str]:
        """Generate optimization suggestions based on usage patterns."""
        suggestions = []
        # Cache efficiency
        if self._cache_hit_rate() < 0.3:
            suggestions.append("Consider increasing cache size or improving cache key strategy")
        # Cost efficiency
        avg_cost = (
            self.metrics.total_cost / self.metrics.total_requests
            if self.metrics.total_requests > 0 else 0
        )
        if avg_cost > self.target_cost_per_request * 1.2:
            suggestions.append("Consider using more efficient models for routine tasks")
        # Performance
        if self.metrics.avg_response_time > 5.0:
            suggestions.append("Consider using faster models or reducing complexity for real-time tasks")
        # Budget management
        if self.metrics.total_cost > self.monthly_budget * 0.8:
            suggestions.append("Approaching monthly budget limit - consider cost controls")
        return suggestions

    async def optimize_request_flow(self,
                                    task_requests: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Optimize a batch of Felix Framework requests for cost and performance.

        Args:
            task_requests: List of task request dictionaries

        Returns:
            Optimized request configurations (original dicts augmented with
            model_config, cache_key, cached_result, estimated_cost, and
            optimization_applied)
        """
        optimized_requests = []
        # Sort by priority then complexity string (stable, ascending).
        sorted_requests = sorted(task_requests,
                                 key=lambda x: (x.get("priority", 5), x.get("complexity", "medium")))
        current_load = len(sorted_requests)
        # Clamp to [0, 1]: spend may exceed the budget, and a zero budget
        # must not divide (both produced nonsense/errors before).
        if self.monthly_budget > 0:
            budget_remaining = max(
                0.0, (self.monthly_budget - self.metrics.total_cost) / self.monthly_budget
            )
        else:
            budget_remaining = 0.0
        for i, request in enumerate(sorted_requests):
            # Later requests in the queue see a smaller effective budget.
            adjusted_budget = budget_remaining * (1 - i / len(sorted_requests))
            optimal_model = self.select_optimal_model(
                agent_type=request.get("agent_type", "general"),
                task_complexity=request.get("complexity", "medium"),
                current_load=current_load,
                budget_remaining=adjusted_budget
            )
            # Check cache before committing to compute.
            cache_key = self.get_cache_key(
                request.get("task_input", ""),
                request.get("agent_type", "general"),
                request.get("complexity", "medium")
            )
            cached_result = self.get_cached_result(cache_key)
            optimized_request = {
                **request,
                "model_config": optimal_model,
                "cache_key": cache_key,
                "cached_result": cached_result,
                "estimated_cost": self.estimate_request_cost(
                    agent_count=request.get("agent_count", 8),
                    complexity=request.get("complexity", "medium")
                ),
                "optimization_applied": True
            }
            optimized_requests.append(optimized_request)
        return optimized_requests
# Factory function for easy integration
def create_hf_pro_optimizer(monthly_budget: float = 100.0) -> HFProOptimizer:
    """
    Create HF Pro optimizer with recommended settings.

    Args:
        monthly_budget: Monthly budget in USD

    Returns:
        Configured HFProOptimizer instance
    """
    recommended_settings = {
        "target_cost_per_request": 0.05,  # 5 cents per Felix request
        "enable_advanced_caching": True,
        "enable_cost_alerts": True,
    }
    return HFProOptimizer(monthly_budget=monthly_budget, **recommended_settings)
# Public API of this module (controls `from ... import *` and documents
# the supported entry points).
__all__ = [
    'HFProOptimizer',
    'ModelTier',
    'ModelConfig',
    'ResourceUsageLevel',
    'UsageMetrics',
    'create_hf_pro_optimizer'
]