# felix-framework / config / hf_pro_optimization.py
# Author: jkbennitt
# Commit fb867c3: Clean hf-space branch and prepare for HuggingFace Spaces deployment
"""
HuggingFace Pro Account Optimization Configuration for Felix Framework
This module provides comprehensive optimization strategies for leveraging HF Pro account
features, ZeroGPU capabilities, and cost-effective deployment while maximizing performance.
Key Features:
- Premium model access with intelligent model selection
- ZeroGPU optimization for cost efficiency
- Advanced caching strategies for reduced compute costs
- Performance monitoring with cost analytics
- Scalable architecture for increased user loads
- Automated resource allocation and optimization
HF Pro Benefits Leveraged:
- Higher concurrent user limits
- Priority access to premium models
- Enhanced ZeroGPU allocation and priority
- Advanced analytics and usage monitoring
- Priority support and faster deployment queues
"""
import os
import json
import time
import asyncio
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, field
from enum import Enum
from collections import defaultdict, OrderedDict
import numpy as np
from datetime import datetime, timedelta
# Module-level logger, one per module per stdlib logging convention.
logger = logging.getLogger(__name__)
class ModelTier(Enum):
    """Model tiers based on HF Pro access and performance.

    Tiers trade quality for cost/latency; HFProOptimizer maps Felix agent
    types to a base tier and promotes/demotes based on load and budget.
    """
    PREMIUM_80B = "premium_80b"      # Qwen3-Next-80B-A3B series (highest quality, slowest)
    EFFICIENT_30B = "efficient_30b"  # Specialized 30B models (balanced)
    FAST_7B = "fast_7b"              # Quick response models
    EDGE_1B = "edge_1b"              # Ultra-fast edge models (cheapest)
class ResourceUsageLevel(Enum):
    """Resource usage levels for cost optimization.

    Coarse buckets of GPU utilization; HFProOptimizer initializes its
    `resource_usage` attribute to MINIMAL.
    """
    MINIMAL = "minimal"      # <10% GPU usage
    MODERATE = "moderate"    # 10-30% GPU usage
    STANDARD = "standard"    # 30-60% GPU usage
    INTENSIVE = "intensive"  # 60-80% GPU usage
    MAXIMUM = "maximum"      # 80%+ GPU usage
@dataclass
class ModelConfig:
    """Configuration for a premium model.

    Carries the routing metadata (cost, latency, quality, concurrency)
    that HFProOptimizer uses to score and select models per request.
    """
    model_id: str                    # HF Hub repo id, e.g. "Qwen/Qwen3-Next-80B-A3B-Instruct"
    tier: ModelTier                  # capability/cost tier this model belongs to
    max_tokens: int = 1024           # generation cap passed to the model
    temperature: float = 0.7         # default sampling temperature
    cost_per_token: float = 0.0001   # estimated USD per token, used for cost estimates
    avg_response_time: float = 2.0   # expected latency in seconds (used in scoring)
    quality_score: float = 0.85      # relative output quality in [0, 1] (used in scoring)
    supports_zerogpu: bool = True    # whether the model can run under ZeroGPU allocation
    concurrent_limit: int = 5        # max simultaneous requests this model should serve
@dataclass
class UsageMetrics:
    """Usage and cost metrics tracking.

    One instance tracks global totals; HFProOptimizer also keeps one
    per hour in `hourly_metrics` (only request/token/cost fields are
    updated for hourly buckets).
    """
    total_requests: int = 0          # count of completed requests
    total_tokens: int = 0            # cumulative tokens consumed
    total_cost: float = 0.0          # cumulative cost in USD
    avg_response_time: float = 0.0   # running mean response time (seconds)
    success_rate: float = 1.0        # fraction of successful requests in [0, 1]
    gpu_utilization: float = 0.0     # last observed GPU utilization fraction
    cache_hit_rate: float = 0.0      # last observed cache hit fraction
    concurrent_users: int = 0        # current concurrent users
    peak_concurrent: int = 0         # high-water mark of concurrent users
    last_reset: datetime = field(default_factory=datetime.now)  # when counters were last reset
class HFProOptimizer:
    """
    HuggingFace Pro account optimizer for Felix Framework.

    Provides intelligent model selection, cost optimization, and performance
    monitoring specifically designed for HF Pro account features.
    """
    # Premium model configurations optimized for Felix Framework.
    # Keyed by tier; selection within a tier is done per-request by
    # _calculate_selection_score().
    PREMIUM_MODELS = {
        ModelTier.PREMIUM_80B: [
            ModelConfig(
                model_id="Qwen/Qwen3-Next-80B-A3B-Instruct",
                tier=ModelTier.PREMIUM_80B,
                max_tokens=2048,
                temperature=0.1,
                cost_per_token=0.0002,
                avg_response_time=4.5,
                quality_score=0.95,
                concurrent_limit=3
            ),
            ModelConfig(
                model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
                tier=ModelTier.PREMIUM_80B,
                max_tokens=1536,
                temperature=0.3,
                cost_per_token=0.00018,
                avg_response_time=3.8,
                quality_score=0.93,
                concurrent_limit=3
            )
        ],
        ModelTier.EFFICIENT_30B: [
            ModelConfig(
                model_id="Alibaba-NLP/Tongyi-DeepResearch-30B-A3B",
                tier=ModelTier.EFFICIENT_30B,
                max_tokens=1024,
                temperature=0.5,
                cost_per_token=0.00012,
                avg_response_time=2.5,
                quality_score=0.88,
                concurrent_limit=5
            ),
            ModelConfig(
                model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
                tier=ModelTier.EFFICIENT_30B,
                max_tokens=1024,
                temperature=0.2,
                cost_per_token=0.0001,
                avg_response_time=2.2,
                quality_score=0.86,
                concurrent_limit=6
            )
        ],
        ModelTier.FAST_7B: [
            ModelConfig(
                model_id="LLM360/K2-Think",
                tier=ModelTier.FAST_7B,
                max_tokens=512,
                temperature=0.7,
                cost_per_token=0.00005,
                avg_response_time=1.2,
                quality_score=0.82,
                concurrent_limit=10
            )
        ],
        ModelTier.EDGE_1B: [
            ModelConfig(
                model_id="facebook/MobileLLM-R1-950M",
                tier=ModelTier.EDGE_1B,
                max_tokens=256,
                temperature=0.8,
                cost_per_token=0.00002,
                avg_response_time=0.5,
                quality_score=0.75,
                concurrent_limit=20
            )
        ]
    }
    # Felix agent type to base model tier mapping; select_optimal_model()
    # may promote/demote the tier based on complexity, load and budget.
    AGENT_MODEL_MAPPING = {
        "synthesis": ModelTier.PREMIUM_80B,   # Highest quality output
        "analysis": ModelTier.EFFICIENT_30B,  # Balanced performance
        "research": ModelTier.FAST_7B,        # Quick exploration
        "critic": ModelTier.EFFICIENT_30B,    # Thorough evaluation
        "general": ModelTier.FAST_7B          # Default fast processing
    }

    def __init__(self,
                 hf_token: Optional[str] = None,
                 monthly_budget: float = 100.0,
                 target_cost_per_request: float = 0.05,
                 enable_advanced_caching: bool = True,
                 enable_cost_alerts: bool = True):
        """
        Initialize HF Pro optimizer.

        Args:
            hf_token: HuggingFace API token with Pro access (falls back to
                the HF_TOKEN environment variable).
            monthly_budget: Monthly budget in USD.
            target_cost_per_request: Target cost per Felix request in USD.
            enable_advanced_caching: Enable intelligent LRU caching.
            enable_cost_alerts: Enable cost monitoring alerts.
        """
        self.hf_token = hf_token or os.getenv("HF_TOKEN")
        self.monthly_budget = monthly_budget
        self.target_cost_per_request = target_cost_per_request
        self.enable_advanced_caching = enable_advanced_caching
        self.enable_cost_alerts = enable_cost_alerts
        # Metrics tracking: global totals plus per-hour buckets keyed by
        # "%Y-%m-%d-%H" strings, and per-model aggregates.
        self.metrics = UsageMetrics()
        self.hourly_metrics: Dict[str, UsageMetrics] = defaultdict(UsageMetrics)
        self.model_performance: Dict[str, Dict] = defaultdict(dict)
        # LRU cache. None means caching is disabled; an *empty* OrderedDict
        # means enabled-but-empty. The two must never be conflated (see the
        # explicit `is None` checks in get_cached_result/cache_result).
        self.cache: Optional[OrderedDict] = OrderedDict() if enable_advanced_caching else None
        self.cache_stats = {"hits": 0, "misses": 0, "size": 0}
        # Resource monitoring
        self.resource_usage = ResourceUsageLevel.MINIMAL
        self.concurrent_requests = 0
        self.request_queue = asyncio.Queue()
        logger.info(f"HF Pro Optimizer initialized - Budget: ${monthly_budget}/month")

    def select_optimal_model(self,
                             agent_type: str,
                             task_complexity: str,
                             current_load: int = 0,
                             budget_remaining: float = 1.0) -> ModelConfig:
        """
        Select optimal model based on agent type, complexity, and constraints.

        Args:
            agent_type: Type of Felix agent requesting model
            task_complexity: Complexity level (demo/simple/medium/complex/research)
            current_load: Current system load (0-100)
            budget_remaining: Remaining budget fraction (0.0-1.0)

        Returns:
            Optimal ModelConfig for the request
        """
        # Unknown agent types fall back to the fast tier.
        base_tier = self.AGENT_MODEL_MAPPING.get(agent_type, ModelTier.FAST_7B)
        # Promote for complex work when budget allows; demote under high
        # load or low budget; otherwise keep the agent's base tier.
        if task_complexity in ["research", "complex"] and budget_remaining > 0.3:
            if base_tier in [ModelTier.EFFICIENT_30B, ModelTier.PREMIUM_80B]:
                target_tier = ModelTier.PREMIUM_80B
            else:
                target_tier = ModelTier.EFFICIENT_30B
        elif current_load > 70 or budget_remaining < 0.2:
            if base_tier == ModelTier.PREMIUM_80B:
                target_tier = ModelTier.EFFICIENT_30B
            elif base_tier == ModelTier.EFFICIENT_30B:
                target_tier = ModelTier.FAST_7B
            else:
                target_tier = ModelTier.EDGE_1B
        else:
            target_tier = base_tier
        # Pick the best-scoring model in the tier (fast tier as fallback).
        available_models = self.PREMIUM_MODELS.get(target_tier, [])
        if not available_models:
            available_models = self.PREMIUM_MODELS[ModelTier.FAST_7B]
        best_model = min(available_models,
                         key=lambda m: self._calculate_selection_score(m, current_load))
        logger.info(f"Selected {best_model.model_id} for {agent_type} agent (complexity: {task_complexity})")
        return best_model

    def _calculate_selection_score(self, model: ModelConfig, current_load: int) -> float:
        """Calculate model selection score (lower is better).

        Combines cost per token (dominant term), latency under load,
        quality, and concurrency headroom under load.
        """
        # Base score from cost per token (scaled so it dominates).
        score = model.cost_per_token * 1000
        # Prefer faster models under high load.
        if current_load > 50:
            score += model.avg_response_time * 0.5
        # Prefer higher quality (subtracts from score).
        if True:  # quality always counts
            score -= model.quality_score * 0.2
        # Prefer higher concurrent limits under moderate-or-higher load.
        if current_load > 30:
            score -= (model.concurrent_limit / 20) * 0.1
        return score

    @staticmethod
    def create_zerogpu_decorator():
        """Create ZeroGPU decorator for cost-efficient GPU usage.

        Returns the `spaces.GPU` decorator (2-minute allocation) when the
        `spaces` package is available, otherwise an identity decorator so
        callers work unchanged outside HF Spaces.
        """
        try:
            import spaces
            return spaces.GPU(duration=120)  # 2-minute GPU allocation
        except ImportError:
            logger.warning("ZeroGPU not available - running without GPU optimization")
            return lambda x: x

    def estimate_request_cost(self,
                              agent_count: int,
                              complexity: str,
                              estimated_tokens_per_agent: int = 300) -> Dict[str, Any]:
        """
        Estimate cost for a Felix Framework request.

        Args:
            agent_count: Number of agents in the request (must be >= 1)
            complexity: Task complexity level
            estimated_tokens_per_agent: Estimated tokens per agent

        Returns:
            Cost estimation with breakdown

        Raises:
            ValueError: If agent_count is not positive.
        """
        if agent_count <= 0:
            raise ValueError(f"agent_count must be positive, got {agent_count}")
        total_cost = 0.0
        model_breakdown = {}
        agent_types = ["research", "analysis", "synthesis", "critic"]
        # Distribute agents across types; remainder agents go one each to
        # the leading types so all agent_count agents are costed (the old
        # floor division silently dropped agent_count % 4 agents).
        base, extra = divmod(agent_count, len(agent_types))
        for idx, agent_type in enumerate(agent_types):
            agents_for_type = base + (1 if idx < extra else 0)
            model = self.select_optimal_model(
                agent_type=agent_type,
                task_complexity=complexity,
                budget_remaining=1.0  # Full budget for estimation
            )
            type_cost = (agents_for_type * estimated_tokens_per_agent *
                         model.cost_per_token)
            total_cost += type_cost
            model_breakdown[agent_type] = {
                "model_id": model.model_id,
                "agents": agents_for_type,
                "estimated_tokens": agents_for_type * estimated_tokens_per_agent,
                "cost": type_cost
            }
        return {
            "total_estimated_cost": total_cost,
            "cost_per_agent": total_cost / agent_count,
            "model_breakdown": model_breakdown,
            "within_target": total_cost <= self.target_cost_per_request,
            "budget_utilization": total_cost / self.target_cost_per_request
        }

    def get_cache_key(self, task_input: str, agent_type: str, complexity: str) -> str:
        """Generate cache key for task input (md5 is fine: keying only,
        not security)."""
        import hashlib
        content = f"{task_input}_{agent_type}_{complexity}"
        return hashlib.md5(content.encode()).hexdigest()

    def get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
        """Get cached result if available, updating LRU order and stats."""
        # BUGFIX: was `if not self.cache`, which is also True for an
        # *empty* (but enabled) cache and skipped miss accounting.
        if self.cache is None:
            return None
        if cache_key in self.cache:
            # Refresh LRU position (most recently used at the end).
            self.cache.move_to_end(cache_key)
            self.cache_stats["hits"] += 1
            return self.cache[cache_key]
        self.cache_stats["misses"] += 1
        return None

    def cache_result(self, cache_key: str, result: Dict[str, Any], max_cache_size: int = 1000):
        """Cache a result, evicting the least-recently-used entry at capacity."""
        # BUGFIX: was `if not self.cache`, which is True for an empty
        # OrderedDict — so the very first result was never stored and the
        # cache stayed permanently empty. `is None` is the disabled check.
        if self.cache is None:
            return
        if len(self.cache) >= max_cache_size and cache_key not in self.cache:
            self.cache.popitem(last=False)  # evict oldest (LRU)
        self.cache[cache_key] = result
        self.cache_stats["size"] = len(self.cache)

    def update_metrics(self,
                       model_id: str,
                       tokens_used: int,
                       response_time: float,
                       success: bool,
                       cost: float):
        """Update global, hourly, and per-model usage metrics.

        Args:
            model_id: Identifier of the model that served the request.
            tokens_used: Tokens consumed by the request.
            response_time: Wall-clock response time in seconds.
            success: Whether the request succeeded.
            cost: Cost of the request in USD.
        """
        # Global totals
        self.metrics.total_requests += 1
        self.metrics.total_tokens += tokens_used
        self.metrics.total_cost += cost
        # Incremental running mean of response time.
        self.metrics.avg_response_time = (
            (self.metrics.avg_response_time * (self.metrics.total_requests - 1) + response_time) /
            self.metrics.total_requests
        )
        # BUGFIX: the previous success count must be derived from the
        # *pre-increment* request count. The old code multiplied the new
        # total by the old rate, inflating the count (first success gave
        # success_rate == 2.0).
        prev_requests = self.metrics.total_requests - 1
        prev_successes = prev_requests * self.metrics.success_rate
        self.metrics.success_rate = (
            (prev_successes + (1 if success else 0)) / self.metrics.total_requests
        )
        # Hourly bucket (only counters are tracked per hour).
        hour_key = datetime.now().strftime("%Y-%m-%d-%H")
        hourly = self.hourly_metrics[hour_key]
        hourly.total_requests += 1
        hourly.total_tokens += tokens_used
        hourly.total_cost += cost
        # Per-model aggregates.
        if model_id not in self.model_performance:
            self.model_performance[model_id] = {
                "requests": 0,
                "avg_response_time": 0.0,
                "success_rate": 1.0,
                "total_cost": 0.0
            }
        model_stats = self.model_performance[model_id]
        model_stats["requests"] += 1
        model_stats["avg_response_time"] = (
            (model_stats["avg_response_time"] * (model_stats["requests"] - 1) + response_time) /
            model_stats["requests"]
        )
        model_stats["total_cost"] += cost
        if self.enable_cost_alerts:
            self._check_cost_alerts()

    def _check_cost_alerts(self):
        """Log warnings/errors when today's spend nears/exceeds the daily budget
        (monthly budget spread evenly over 30 days)."""
        daily_budget = self.monthly_budget / 30
        today_prefix = datetime.now().strftime("%Y-%m-%d")
        current_daily_cost = sum(
            metrics.total_cost for hour, metrics in self.hourly_metrics.items()
            if hour.startswith(today_prefix)
        )
        if current_daily_cost > daily_budget * 0.8:
            logger.warning(f"Daily cost approaching limit: ${current_daily_cost:.2f} / ${daily_budget:.2f}")
        if current_daily_cost > daily_budget:
            logger.error(f"Daily budget exceeded: ${current_daily_cost:.2f} / ${daily_budget:.2f}")

    def _cache_hit_rate(self) -> float:
        """Return the cache hit fraction, or 0 when no lookups have occurred.

        Shared by the dashboard and the optimization suggestions (was
        duplicated inline in both).
        """
        lookups = self.cache_stats["hits"] + self.cache_stats["misses"]
        return self.cache_stats["hits"] / lookups if lookups > 0 else 0

    def get_performance_dashboard(self) -> Dict[str, Any]:
        """Get comprehensive performance dashboard data.

        Returns a dict with "overview", "budget", "performance", "models",
        and "optimization_suggestions" sections.
        """
        return {
            "overview": {
                "total_requests": self.metrics.total_requests,
                "total_cost": self.metrics.total_cost,
                "avg_cost_per_request": (
                    self.metrics.total_cost / self.metrics.total_requests
                    if self.metrics.total_requests > 0 else 0
                ),
                "success_rate": self.metrics.success_rate,
                "avg_response_time": self.metrics.avg_response_time
            },
            "budget": {
                "monthly_budget": self.monthly_budget,
                "spent_this_month": self.metrics.total_cost,
                "remaining_budget": self.monthly_budget - self.metrics.total_cost,
                # Spend per elapsed day of the current month (max(1, ...)
                # guards the first day).
                "burn_rate": self.metrics.total_cost / max(1, (datetime.now().day)),
                "projected_monthly": self.metrics.total_cost / max(1, (datetime.now().day)) * 30
            },
            "performance": {
                "cache_hit_rate": self._cache_hit_rate(),
                "cache_size": self.cache_stats["size"],
                "concurrent_users": self.metrics.concurrent_users,
                "peak_concurrent": self.metrics.peak_concurrent
            },
            "models": {
                model_id: {
                    "requests": stats["requests"],
                    "avg_response_time": stats["avg_response_time"],
                    "total_cost": stats["total_cost"],
                    "cost_per_request": stats["total_cost"] / max(1, stats["requests"])
                }
                for model_id, stats in self.model_performance.items()
            },
            "optimization_suggestions": self._get_optimization_suggestions()
        }

    def _get_optimization_suggestions(self) -> List[str]:
        """Generate optimization suggestions based on usage patterns."""
        suggestions = []
        # Cache efficiency
        if self._cache_hit_rate() < 0.3:
            suggestions.append("Consider increasing cache size or improving cache key strategy")
        # Cost efficiency
        avg_cost = (
            self.metrics.total_cost / self.metrics.total_requests
            if self.metrics.total_requests > 0 else 0
        )
        if avg_cost > self.target_cost_per_request * 1.2:
            suggestions.append("Consider using more efficient models for routine tasks")
        # Performance
        if self.metrics.avg_response_time > 5.0:
            suggestions.append("Consider using faster models or reducing complexity for real-time tasks")
        # Budget management
        if self.metrics.total_cost > self.monthly_budget * 0.8:
            suggestions.append("Approaching monthly budget limit - consider cost controls")
        return suggestions

    async def optimize_request_flow(self,
                                    task_requests: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Optimize a batch of Felix Framework requests for cost and performance.

        Args:
            task_requests: List of task request dictionaries

        Returns:
            Optimized request configurations (original dicts augmented with
            model_config, cache_key, cached_result, estimated_cost, and
            optimization_applied)
        """
        optimized_requests = []
        # Sort by priority then complexity string (stable, ascending).
        sorted_requests = sorted(task_requests,
                                 key=lambda x: (x.get("priority", 5), x.get("complexity", "medium")))
        current_load = len(sorted_requests)
        # Clamp to [0, 1]: spend may exceed the budget, and a zero budget
        # must not divide (both produced nonsense/errors before).
        if self.monthly_budget > 0:
            budget_remaining = max(
                0.0, (self.monthly_budget - self.metrics.total_cost) / self.monthly_budget
            )
        else:
            budget_remaining = 0.0
        for i, request in enumerate(sorted_requests):
            # Later requests in the queue see a smaller effective budget.
            adjusted_budget = budget_remaining * (1 - i / len(sorted_requests))
            optimal_model = self.select_optimal_model(
                agent_type=request.get("agent_type", "general"),
                task_complexity=request.get("complexity", "medium"),
                current_load=current_load,
                budget_remaining=adjusted_budget
            )
            # Check cache before committing to compute.
            cache_key = self.get_cache_key(
                request.get("task_input", ""),
                request.get("agent_type", "general"),
                request.get("complexity", "medium")
            )
            cached_result = self.get_cached_result(cache_key)
            optimized_request = {
                **request,
                "model_config": optimal_model,
                "cache_key": cache_key,
                "cached_result": cached_result,
                "estimated_cost": self.estimate_request_cost(
                    agent_count=request.get("agent_count", 8),
                    complexity=request.get("complexity", "medium")
                ),
                "optimization_applied": True
            }
            optimized_requests.append(optimized_request)
        return optimized_requests
# Factory function for easy integration
def create_hf_pro_optimizer(monthly_budget: float = 100.0) -> HFProOptimizer:
    """
    Create HF Pro optimizer with recommended settings.

    Args:
        monthly_budget: Monthly budget in USD

    Returns:
        Configured HFProOptimizer instance
    """
    recommended_settings = {
        "target_cost_per_request": 0.05,  # 5 cents per Felix request
        "enable_advanced_caching": True,
        "enable_cost_alerts": True,
    }
    return HFProOptimizer(monthly_budget=monthly_budget, **recommended_settings)
# Public API of this module (controls `from ... import *` and documents
# the supported entry points).
__all__ = [
    'HFProOptimizer',
    'ModelTier',
    'ModelConfig',
    'ResourceUsageLevel',
    'UsageMetrics',
    'create_hf_pro_optimizer'
]