# felix-framework/config/premium_model_config.py
# Author: jkbennitt
# Commit: Clean hf-space branch and prepare for HuggingFace Spaces deployment (fb867c3)
"""
Premium Model Configuration for Felix Framework HF Pro Deployment
This module provides intelligent model selection and configuration optimized for
HuggingFace Pro accounts, ZeroGPU capabilities, and cost-effective deployment.
Features:
- Premium model access with Pro account benefits
- Intelligent model routing based on task complexity
- Cost optimization with performance balancing
- ZeroGPU memory management and batch processing
- Fallback chains for high availability
- Performance monitoring and adaptive selection
"""
import os
import json
import logging
import asyncio
import time
from typing import Dict, List, Optional, Any, Tuple, Union
from dataclasses import dataclass, field
from enum import Enum
from datetime import datetime, timedelta
import numpy as np
from .hf_pro_optimization import ModelTier, ModelConfig, HFProOptimizer
logger = logging.getLogger(__name__)
class TaskComplexity(Enum):
    """Coarse complexity buckets used to steer model selection.

    Each member maps a qualitative task profile to a routing tier:
    cheaper/faster models for SIMPLE work, premium models for RESEARCH.
    """

    SIMPLE = "simple"      # quick responses, basic processing
    MODERATE = "moderate"  # standard analysis and reasoning
    COMPLEX = "complex"    # deep analysis, multi-step reasoning
    RESEARCH = "research"  # comprehensive research and synthesis
    CREATIVE = "creative"  # creative writing and ideation
class ModelPerformanceRating(Enum):
    """Qualitative model-performance ratings derived from benchmark scores.

    NOTE: member values are strings, so they carry no numeric ordering;
    any ranking must be done via an explicit mapping, not value comparison.
    """

    EXCELLENT = "excellent"  # 90%+ benchmark scores
    GOOD = "good"            # 80-90% benchmark scores
    MODERATE = "moderate"    # 70-80% benchmark scores
    BASIC = "basic"          # 60-70% benchmark scores
@dataclass
class PremiumModelEntry:
    """Enhanced model configuration with Pro account features.

    Bundles routing metadata (tier, rating, specialties), cost/latency
    estimates, and resource requirements for a single HF-hosted model.
    """

    model_id: str                                  # fully-qualified HF repo id
    tier: ModelTier                                # size/capability tier
    performance_rating: ModelPerformanceRating     # benchmark-derived rating
    max_tokens: int = 2048                         # generation cap per request
    temperature_range: Tuple[float, float] = (0.1, 0.9)  # (min, max) sampling temp
    cost_per_1k_tokens: float = 0.10               # estimated USD per 1k tokens
    avg_response_time: float = 2.0                 # typical latency, seconds
    context_window: int = 4096                     # max input context length
    supports_zerogpu: bool = True                  # can run under ZeroGPU
    supports_batching: bool = True                 # batched inference allowed
    concurrent_limit: int = 5                      # max concurrent requests
    memory_requirement_gb: float = 8.0             # GPU memory needed
    specialties: List[str] = field(default_factory=list)       # task tags
    benchmarks: Dict[str, float] = field(default_factory=dict) # benchmark scores
    pro_exclusive: bool = False                    # requires HF Pro account
    fallback_models: List[str] = field(default_factory=list)   # ordered fallbacks
class PremiumModelManager:
    """
    Manages premium model access and intelligent selection for Felix Framework.

    Optimized for HuggingFace Pro accounts with advanced model routing,
    cost optimization, and performance monitoring.
    """

    # Explicit ordering for performance ratings (higher = better).
    # BUGFIX: the enum *values* are strings, so comparing
    # ``rating.value < other.value`` ordered them alphabetically
    # ("excellent" < "good"), wrongly filtering out the best models.
    _RATING_RANK = {
        ModelPerformanceRating.BASIC: 0,
        ModelPerformanceRating.MODERATE: 1,
        ModelPerformanceRating.GOOD: 2,
        ModelPerformanceRating.EXCELLENT: 3,
    }

    # Premium model catalog with HF Pro exclusive models
    PREMIUM_MODEL_CATALOG = {
        # Ultra-premium 80B+ models (Pro exclusive)
        "qwen3-next-80b-instruct": PremiumModelEntry(
            model_id="Qwen/Qwen3-Next-80B-A3B-Instruct",
            tier=ModelTier.PREMIUM_80B,
            performance_rating=ModelPerformanceRating.EXCELLENT,
            max_tokens=4096,
            temperature_range=(0.1, 0.8),
            cost_per_1k_tokens=0.20,
            avg_response_time=5.0,
            context_window=32768,
            memory_requirement_gb=40.0,
            specialties=["reasoning", "analysis", "complex_qa"],
            benchmarks={"mmlu": 0.89, "hellaswag": 0.92, "arc": 0.88},
            pro_exclusive=True,
            fallback_models=["Qwen/Qwen3-Coder-30B-A3B-Instruct"]
        ),
        "qwen3-next-80b-thinking": PremiumModelEntry(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            tier=ModelTier.PREMIUM_80B,
            performance_rating=ModelPerformanceRating.EXCELLENT,
            max_tokens=3072,
            temperature_range=(0.2, 0.7),
            cost_per_1k_tokens=0.18,
            avg_response_time=4.5,
            context_window=32768,
            memory_requirement_gb=40.0,
            specialties=["reasoning", "step_by_step", "problem_solving"],
            benchmarks={"gsm8k": 0.94, "math": 0.76, "reasoning": 0.91},
            pro_exclusive=True,
            fallback_models=["Alibaba-NLP/Tongyi-DeepResearch-30B-A3B"]
        ),
        # High-performance 30B models
        "tongyi-deepresearch-30b": PremiumModelEntry(
            model_id="Alibaba-NLP/Tongyi-DeepResearch-30B-A3B",
            tier=ModelTier.EFFICIENT_30B,
            performance_rating=ModelPerformanceRating.GOOD,
            max_tokens=2048,
            temperature_range=(0.1, 0.8),
            cost_per_1k_tokens=0.12,
            avg_response_time=3.0,
            context_window=16384,
            memory_requirement_gb=15.0,
            specialties=["research", "analysis", "synthesis"],
            benchmarks={"mmlu": 0.84, "hellaswag": 0.87, "arc": 0.82},
            fallback_models=["Qwen/Qwen3-Coder-30B-A3B-Instruct"]
        ),
        "qwen3-coder-30b": PremiumModelEntry(
            model_id="Qwen/Qwen3-Coder-30B-A3B-Instruct",
            tier=ModelTier.EFFICIENT_30B,
            performance_rating=ModelPerformanceRating.GOOD,
            max_tokens=2048,
            temperature_range=(0.1, 0.6),
            cost_per_1k_tokens=0.10,
            avg_response_time=2.5,
            context_window=16384,
            memory_requirement_gb=15.0,
            specialties=["coding", "technical_analysis", "structured_output"],
            benchmarks={"humaneval": 0.78, "mbpp": 0.75, "code_quality": 0.85},
            fallback_models=["LLM360/K2-Think"]
        ),
        "ernie-4.5-21b-thinking": PremiumModelEntry(
            model_id="baidu/ERNIE-4.5-21B-A3B-Thinking",
            tier=ModelTier.EFFICIENT_30B,
            performance_rating=ModelPerformanceRating.GOOD,
            max_tokens=1536,
            temperature_range=(0.2, 0.7),
            cost_per_1k_tokens=0.08,
            avg_response_time=2.2,
            context_window=8192,
            memory_requirement_gb=12.0,
            specialties=["reasoning", "multilingual", "thinking"],
            benchmarks={"c_eval": 0.86, "reasoning": 0.83, "multilingual": 0.89},
            fallback_models=["LLM360/K2-Think"]
        ),
        # Efficient 7B-13B models
        "k2-think": PremiumModelEntry(
            model_id="LLM360/K2-Think",
            tier=ModelTier.FAST_7B,
            performance_rating=ModelPerformanceRating.GOOD,
            max_tokens=1024,
            temperature_range=(0.3, 0.8),
            cost_per_1k_tokens=0.05,
            avg_response_time=1.5,
            context_window=8192,
            memory_requirement_gb=7.0,
            specialties=["fast_reasoning", "balanced_performance"],
            benchmarks={"mmlu": 0.78, "hellaswag": 0.82, "speed": 0.95},
            fallback_models=["facebook/MobileLLM-R1-950M"]
        ),
        "llama-3.1-8b-instruct": PremiumModelEntry(
            model_id="meta-llama/Llama-3.1-8B-Instruct",
            tier=ModelTier.FAST_7B,
            performance_rating=ModelPerformanceRating.GOOD,
            max_tokens=1024,
            temperature_range=(0.1, 0.9),
            cost_per_1k_tokens=0.06,
            avg_response_time=1.8,
            context_window=8192,
            memory_requirement_gb=8.0,
            specialties=["general_purpose", "instruction_following"],
            benchmarks={"mmlu": 0.82, "instruction_following": 0.88},
            fallback_models=["facebook/MobileLLM-R1-950M"]
        ),
        # Edge models for fast responses
        "mobile-llm-950m": PremiumModelEntry(
            model_id="facebook/MobileLLM-R1-950M",
            tier=ModelTier.EDGE_1B,
            performance_rating=ModelPerformanceRating.MODERATE,
            max_tokens=512,
            temperature_range=(0.5, 0.9),
            cost_per_1k_tokens=0.02,
            avg_response_time=0.8,
            context_window=2048,
            memory_requirement_gb=2.0,
            specialties=["fast_response", "edge_computing", "mobile"],
            benchmarks={"speed": 0.98, "efficiency": 0.95, "basic_qa": 0.72},
            fallback_models=[]
        ),
        "ring-mini-2.0": PremiumModelEntry(
            model_id="inclusionAI/Ring-mini-2.0",
            tier=ModelTier.EDGE_1B,
            performance_rating=ModelPerformanceRating.MODERATE,
            max_tokens=512,
            temperature_range=(0.4, 0.8),
            cost_per_1k_tokens=0.03,
            avg_response_time=1.0,
            context_window=4096,
            memory_requirement_gb=3.0,
            specialties=["multilingual", "fast_processing"],
            benchmarks={"multilingual": 0.78, "speed": 0.90, "basic_reasoning": 0.70},
            fallback_models=["facebook/MobileLLM-R1-950M"]
        )
    }

    # Agent type to model selection strategy
    AGENT_MODEL_STRATEGIES = {
        "research": {
            "preferred_tiers": [ModelTier.FAST_7B, ModelTier.EFFICIENT_30B],
            "preferred_specialties": ["research", "fast_reasoning", "general_purpose"],
            "max_cost_per_request": 0.15,
            "min_performance_rating": ModelPerformanceRating.MODERATE
        },
        "analysis": {
            "preferred_tiers": [ModelTier.EFFICIENT_30B, ModelTier.PREMIUM_80B],
            "preferred_specialties": ["reasoning", "analysis", "step_by_step"],
            "max_cost_per_request": 0.25,
            "min_performance_rating": ModelPerformanceRating.GOOD
        },
        "synthesis": {
            "preferred_tiers": [ModelTier.PREMIUM_80B, ModelTier.EFFICIENT_30B],
            "preferred_specialties": ["synthesis", "reasoning", "complex_qa"],
            "max_cost_per_request": 0.35,
            "min_performance_rating": ModelPerformanceRating.GOOD
        },
        "critic": {
            "preferred_tiers": [ModelTier.EFFICIENT_30B, ModelTier.FAST_7B],
            "preferred_specialties": ["reasoning", "analysis", "thinking"],
            "max_cost_per_request": 0.20,
            "min_performance_rating": ModelPerformanceRating.GOOD
        },
        "general": {
            "preferred_tiers": [ModelTier.FAST_7B, ModelTier.EDGE_1B],
            "preferred_specialties": ["general_purpose", "fast_response", "balanced_performance"],
            "max_cost_per_request": 0.10,
            "min_performance_rating": ModelPerformanceRating.MODERATE
        }
    }

    def __init__(self,
                 hf_pro_optimizer: Optional[HFProOptimizer] = None,
                 enable_adaptive_selection: bool = True,
                 enable_cost_optimization: bool = True,
                 enable_performance_tracking: bool = True):
        """
        Initialize premium model manager.

        Args:
            hf_pro_optimizer: HF Pro optimizer for cost management
            enable_adaptive_selection: Enable adaptive model selection based on performance
            enable_cost_optimization: Enable cost-based model optimization
            enable_performance_tracking: Enable model performance tracking
        """
        self.hf_pro_optimizer = hf_pro_optimizer
        self.enable_adaptive_selection = enable_adaptive_selection
        self.enable_cost_optimization = enable_cost_optimization
        self.enable_performance_tracking = enable_performance_tracking

        # Performance tracking state: per-model running stats, a rolling
        # selection log, and per-request cost accounting.
        self.model_performance_history = {}
        self.selection_history = []
        self.cost_tracking = {}

        # Adaptive selection weights (currently informational; scoring uses
        # the caller-supplied priorities in select_optimal_model).
        self.performance_weights = {
            "response_time": 0.3,
            "quality_score": 0.4,
            "cost_efficiency": 0.2,
            "success_rate": 0.1
        }
        logger.info("Premium Model Manager initialized")

    def select_optimal_model(self,
                             agent_type: str,
                             task_complexity: TaskComplexity,
                             budget_constraint: Optional[float] = None,
                             performance_priority: float = 0.5,
                             speed_priority: float = 0.3,
                             cost_priority: float = 0.2,
                             context_length_needed: int = 2048,
                             gpu_memory_available: float = 16.0) -> PremiumModelEntry:
        """
        Select optimal model based on comprehensive criteria.

        Args:
            agent_type: Type of Felix agent (research, analysis, synthesis, critic, general)
            task_complexity: Complexity level of the task
            budget_constraint: Maximum cost per request
            performance_priority: Weight for performance in selection (0-1)
            speed_priority: Weight for speed in selection (0-1)
            cost_priority: Weight for cost in selection (0-1)
            context_length_needed: Required context window size
            gpu_memory_available: Available GPU memory in GB

        Returns:
            Selected premium model configuration
        """
        # Normalize priorities so the three weights sum to 1.
        total_priority = performance_priority + speed_priority + cost_priority
        if total_priority > 0:
            performance_priority /= total_priority
            speed_priority /= total_priority
            cost_priority /= total_priority

        # Unknown agent types fall back to the "general" strategy.
        strategy = self.AGENT_MODEL_STRATEGIES.get(agent_type, self.AGENT_MODEL_STRATEGIES["general"])

        # Filter models by hard constraints (tier, rating, budget, context, memory).
        candidate_models = self._filter_models_by_constraints(
            strategy=strategy,
            task_complexity=task_complexity,
            budget_constraint=budget_constraint,
            context_length_needed=context_length_needed,
            gpu_memory_available=gpu_memory_available
        )

        if not candidate_models:
            # Fallback to the cheapest edge model when nothing qualifies.
            logger.warning(f"No models match constraints for {agent_type}, using fallback")
            return self.PREMIUM_MODEL_CATALOG["mobile-llm-950m"]

        # Score and rank candidates; higher score is better.
        scored_models = []
        for model in candidate_models:
            score = self._calculate_model_score(
                model=model,
                task_complexity=task_complexity,
                performance_priority=performance_priority,
                speed_priority=speed_priority,
                cost_priority=cost_priority
            )
            scored_models.append((score, model))

        scored_models.sort(key=lambda x: x[0], reverse=True)
        selected_model = scored_models[0][1]

        # Record the choice for adaptive learning / analytics.
        self._track_selection(agent_type, task_complexity, selected_model, scored_models[0][0])
        logger.info(f"Selected {selected_model.model_id} for {agent_type} agent (score: {scored_models[0][0]:.3f})")
        return selected_model

    def _filter_models_by_constraints(self,
                                      strategy: Dict[str, Any],
                                      task_complexity: TaskComplexity,
                                      budget_constraint: Optional[float],
                                      context_length_needed: int,
                                      gpu_memory_available: float) -> List[PremiumModelEntry]:
        """Filter catalog models by hard constraints; returns qualifying entries."""
        candidates = []
        min_rating_rank = self._RATING_RANK[strategy["min_performance_rating"]]
        for model in self.PREMIUM_MODEL_CATALOG.values():
            # Check tier preference
            if model.tier not in strategy["preferred_tiers"]:
                continue
            # Check performance rating via explicit rank ordering.
            # BUGFIX: previously compared the string enum values, which
            # sorted alphabetically and excluded EXCELLENT models when the
            # minimum was GOOD ("excellent" < "good").
            if self._RATING_RANK[model.performance_rating] < min_rating_rank:
                continue
            # Check budget constraint (worst-case cost at max_tokens).
            max_cost = budget_constraint or strategy["max_cost_per_request"]
            estimated_cost = (model.max_tokens / 1000) * model.cost_per_1k_tokens
            if estimated_cost > max_cost:
                continue
            # Check context window
            if model.context_window < context_length_needed:
                continue
            # Check GPU memory requirement
            if model.memory_requirement_gb > gpu_memory_available:
                continue
            # Check complexity alignment
            if task_complexity == TaskComplexity.SIMPLE and model.tier == ModelTier.PREMIUM_80B:
                continue  # Don't use premium models for simple tasks
            elif task_complexity == TaskComplexity.RESEARCH and model.tier == ModelTier.EDGE_1B:
                continue  # Don't use edge models for research tasks
            candidates.append(model)
        return candidates

    def _calculate_model_score(self,
                               model: PremiumModelEntry,
                               task_complexity: TaskComplexity,
                               performance_priority: float,
                               speed_priority: float,
                               cost_priority: float) -> float:
        """Calculate weighted score for model selection (higher is better)."""
        # Performance score (0-1)
        performance_ratings = {
            ModelPerformanceRating.EXCELLENT: 1.0,
            ModelPerformanceRating.GOOD: 0.8,
            ModelPerformanceRating.MODERATE: 0.6,
            ModelPerformanceRating.BASIC: 0.4
        }
        performance_score = performance_ratings[model.performance_rating]

        # Speed score (inverse of response time, normalized against 10s max)
        max_response_time = 10.0
        speed_score = max(0, (max_response_time - model.avg_response_time) / max_response_time)

        # Cost score (inverse of cost, normalized against $0.25 per 1k tokens)
        max_cost = 0.25
        cost_score = max(0, (max_cost - model.cost_per_1k_tokens) / max_cost)

        # Specialty bonus: reward models tagged for this complexity class.
        specialty_bonus = 0.0
        if task_complexity == TaskComplexity.RESEARCH and "research" in model.specialties:
            specialty_bonus += 0.1
        elif task_complexity == TaskComplexity.COMPLEX and "reasoning" in model.specialties:
            specialty_bonus += 0.1
        elif task_complexity == TaskComplexity.CREATIVE and "creative" in model.specialties:
            specialty_bonus += 0.1

        # Historical performance bonus from tracked runtime stats.
        history_bonus = 0.0
        if self.enable_adaptive_selection and model.model_id in self.model_performance_history:
            history = self.model_performance_history[model.model_id]
            if history.get("success_rate", 0.5) > 0.9:
                history_bonus += 0.05
            if history.get("avg_quality", 0.5) > 0.8:
                history_bonus += 0.05

        # Weighted combination of the normalized component scores plus bonuses.
        total_score = (
            performance_score * performance_priority +
            speed_score * speed_priority +
            cost_score * cost_priority +
            specialty_bonus +
            history_bonus
        )
        return total_score

    def _track_selection(self,
                         agent_type: str,
                         task_complexity: TaskComplexity,
                         selected_model: PremiumModelEntry,
                         score: float):
        """Track model selection for adaptive learning."""
        selection_record = {
            "timestamp": datetime.now().isoformat(),
            "agent_type": agent_type,
            "task_complexity": task_complexity.value,
            "model_id": selected_model.model_id,
            "model_tier": selected_model.tier.value,
            "selection_score": score,
            "estimated_cost": (selected_model.max_tokens / 1000) * selected_model.cost_per_1k_tokens
        }
        self.selection_history.append(selection_record)
        # Bound memory: keep only the last 1000 selections.
        if len(self.selection_history) > 1000:
            self.selection_history = self.selection_history[-1000:]

    def update_model_performance(self,
                                 model_id: str,
                                 response_time: float,
                                 quality_score: float,
                                 success: bool,
                                 actual_cost: float):
        """Update model performance metrics for adaptive selection.

        Args:
            model_id: Fully-qualified model id the request was served by.
            response_time: Observed latency in seconds.
            quality_score: Observed response quality (0-1 scale assumed — verify at caller).
            success: Whether the request succeeded.
            actual_cost: Actual cost of the request.
        """
        if not self.enable_performance_tracking:
            return
        if model_id not in self.model_performance_history:
            self.model_performance_history[model_id] = {
                "total_requests": 0,
                "successful_requests": 0,
                "avg_response_time": 0.0,
                "avg_quality": 0.0,
                "total_cost": 0.0,
                "last_updated": datetime.now()
            }
        history = self.model_performance_history[model_id]

        # Update counters
        history["total_requests"] += 1
        if success:
            history["successful_requests"] += 1

        # Incremental (running) averages: avg_new = (avg_old*(n-1) + x) / n
        n = history["total_requests"]
        history["avg_response_time"] = ((history["avg_response_time"] * (n - 1)) + response_time) / n
        history["avg_quality"] = ((history["avg_quality"] * (n - 1)) + quality_score) / n
        history["total_cost"] += actual_cost
        history["success_rate"] = history["successful_requests"] / history["total_requests"]
        history["last_updated"] = datetime.now()

    def get_model_recommendations(self,
                                  agent_types: List[str],
                                  task_complexity: TaskComplexity,
                                  total_budget: float) -> Dict[str, PremiumModelEntry]:
        """Get model recommendations for multiple agent types within budget.

        Agents are processed in importance order (synthesis first) so the
        most important agents get first claim on the budget.
        """
        recommendations = {}
        remaining_budget = total_budget

        importance_order = ["synthesis", "analysis", "critic", "research", "general"]
        sorted_agent_types = sorted(agent_types,
                                    key=lambda x: importance_order.index(x) if x in importance_order else 999)

        # BUGFIX: the original mutated sorted_agent_types (.remove) while
        # iterating it, which silently skipped every other agent type.
        # Iterate immutably and divide the remaining budget among the
        # agents not yet assigned.
        for index, agent_type in enumerate(sorted_agent_types):
            agents_remaining = len(sorted_agent_types) - index
            budget_per_agent = remaining_budget / agents_remaining
            selected_model = self.select_optimal_model(
                agent_type=agent_type,
                task_complexity=task_complexity,
                budget_constraint=budget_per_agent,
                performance_priority=0.6 if agent_type in ["synthesis", "analysis"] else 0.4,
                speed_priority=0.2 if agent_type in ["synthesis", "analysis"] else 0.4,
                cost_priority=0.2
            )
            recommendations[agent_type] = selected_model
            estimated_cost = (selected_model.max_tokens / 1000) * selected_model.cost_per_1k_tokens
            remaining_budget -= estimated_cost
        return recommendations

    def get_fallback_model(self, primary_model_id: str) -> Optional[PremiumModelEntry]:
        """Get fallback model for failed primary model.

        Resolves the primary's first declared fallback id back to a catalog
        entry; defaults to the edge model when no fallback is declared.
        """
        for model in self.PREMIUM_MODEL_CATALOG.values():
            if model.model_id == primary_model_id and model.fallback_models:
                fallback_id = model.fallback_models[0]
                for fallback_model in self.PREMIUM_MODEL_CATALOG.values():
                    if fallback_model.model_id == fallback_id:
                        return fallback_model
        # Default fallback to edge model
        return self.PREMIUM_MODEL_CATALOG["mobile-llm-950m"]

    def get_analytics_dashboard(self) -> Dict[str, Any]:
        """Get comprehensive analytics dashboard data."""
        if not self.selection_history:
            return {"message": "No selection history available"}

        # Model usage statistics (count + accumulated estimated cost).
        model_usage = {}
        for selection in self.selection_history:
            model_id = selection["model_id"]
            if model_id not in model_usage:
                model_usage[model_id] = {"count": 0, "total_cost": 0.0}
            model_usage[model_id]["count"] += 1
            model_usage[model_id]["total_cost"] += selection["estimated_cost"]

        # Agent type preferences: per-agent histogram of selected tiers.
        agent_preferences = {}
        for selection in self.selection_history:
            agent_type = selection["agent_type"]
            if agent_type not in agent_preferences:
                agent_preferences[agent_type] = {}
            tier = selection["model_tier"]
            agent_preferences[agent_type][tier] = agent_preferences[agent_type].get(tier, 0) + 1

        # Performance trends summarized from tracked runtime stats.
        performance_trends = {}
        for model_id, history in self.model_performance_history.items():
            performance_trends[model_id] = {
                "success_rate": history.get("success_rate", 0),
                "avg_response_time": history.get("avg_response_time", 0),
                "avg_quality": history.get("avg_quality", 0),
                "total_requests": history.get("total_requests", 0),
                "cost_efficiency": history.get("total_cost", 0) / max(1, history.get("total_requests", 1))
            }

        return {
            "model_usage": model_usage,
            "agent_preferences": agent_preferences,
            "performance_trends": performance_trends,
            "total_selections": len(self.selection_history),
            "total_models_used": len(set(s["model_id"] for s in self.selection_history)),
            "avg_selection_score": np.mean([s["selection_score"] for s in self.selection_history]),
            "cost_distribution": {
                tier.value: sum(s["estimated_cost"] for s in self.selection_history
                                if s["model_tier"] == tier.value)
                for tier in ModelTier
            }
        }
# Factory function for easy integration
def create_premium_model_manager(hf_pro_optimizer: Optional[HFProOptimizer] = None) -> PremiumModelManager:
    """
    Create premium model manager with recommended settings.

    Args:
        hf_pro_optimizer: Optional HF Pro optimizer instance

    Returns:
        Configured PremiumModelManager instance
    """
    manager = PremiumModelManager(
        hf_pro_optimizer=hf_pro_optimizer,
        enable_adaptive_selection=True,
        enable_cost_optimization=True,
        enable_performance_tracking=True,
    )
    return manager
# Public API surface: names exported on star-import of this module.
__all__ = [
    'PremiumModelManager',
    'PremiumModelEntry',
    'TaskComplexity',
    'ModelPerformanceRating',
    'create_premium_model_manager'
]