# opencode-zerogpu / config.py
# serenichron — "Fix None model_id handling in Gradio examples" (commit 5ea35f6)
"""Configuration and environment handling for ZeroGPU Space."""
import os
import logging
from dataclasses import dataclass, field
from typing import Optional
from dotenv import load_dotenv
# Populate os.environ from a local .env file; a no-op when no .env exists.
load_dotenv()
# Module-level logger named after this module (PEP 282 convention).
logger = logging.getLogger(__name__)
@dataclass
class Config:
    """Application configuration loaded from environment variables.

    Fields use ``default_factory`` so the environment is re-read each time
    a new ``Config`` is constructed (not once at class definition).
    """

    # HuggingFace token for gated models (None when HF_TOKEN is unset).
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv("HF_TOKEN"))

    # Fallback to HF Serverless when ZeroGPU quota exhausted.
    # Accepts the common truthy spellings ("true", "1", "yes", "on",
    # case-insensitive); any other value disables the fallback.
    # (Previously only the literal "true" enabled it, so "1"/"yes" were
    # silently treated as false.)
    fallback_enabled: bool = field(
        default_factory=lambda: os.getenv("FALLBACK_ENABLED", "true").strip().lower()
        in ("1", "true", "yes", "on")
    )

    # Logging level name (e.g. "DEBUG", "INFO"); unknown names fall back
    # to INFO in __post_init__.
    log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO"))

    # Quantization settings: "none", "int8", or "int4" (consumed by
    # should_quantize; "none" means size-based auto-selection).
    default_quantization: str = field(
        default_factory=lambda: os.getenv("DEFAULT_QUANTIZATION", "none")
    )
    # Model size in billions of parameters above which auto-quantization
    # kicks in. Raises ValueError at construction if the env var is not an
    # integer — deliberate, so a bad deploy fails loudly.
    auto_quantize_threshold_b: int = field(
        default_factory=lambda: int(os.getenv("AUTO_QUANTIZE_THRESHOLD_B", "34"))
    )

    def __post_init__(self):
        """Configure logging after initialization."""
        logging.basicConfig(
            level=getattr(logging, self.log_level.upper(), logging.INFO),
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        )
@dataclass
class QuotaTracker:
    """Track ZeroGPU quota usage for the current session."""

    # GPU seconds consumed so far in the current day.
    seconds_used: float = 0.0
    # Daily allowance in seconds (PRO plan: 25 min = 1500 sec).
    daily_quota_seconds: float = 1500.0
    # Flipped to True once usage reaches the daily allowance.
    quota_exhausted: bool = False

    def add_usage(self, seconds: float) -> None:
        """Record GPU usage time."""
        self.seconds_used = self.seconds_used + seconds
        if not self.seconds_used >= self.daily_quota_seconds:
            return
        self.quota_exhausted = True
        logger.warning(
            f"ZeroGPU quota exhausted: {self.seconds_used:.1f}s / {self.daily_quota_seconds:.1f}s"
        )

    def remaining_seconds(self) -> float:
        """Get remaining quota in seconds."""
        left = self.daily_quota_seconds - self.seconds_used
        return max(0, left)

    def remaining_minutes(self) -> float:
        """Get remaining quota in minutes."""
        return self.remaining_seconds() / 60.0

    def reset(self) -> None:
        """Reset quota (called at day boundary)."""
        self.seconds_used = 0.0
        self.quota_exhausted = False
        logger.info("ZeroGPU quota reset")
# Global configuration instance, created once at import time; use
# get_config() to retrieve it.
config = Config()
# Global quota tracker shared by this process; use get_quota_tracker().
quota_tracker = QuotaTracker()
def get_config() -> Config:
    """Get the global configuration instance.

    Returns the module-level ``config`` singleton created at import time.
    """
    return config
def get_quota_tracker() -> QuotaTracker:
    """Get the global quota tracker instance.

    Returns the module-level ``quota_tracker`` singleton created at import time.
    """
    return quota_tracker
# Model size estimates (parameters in billions) for models whose size is
# known a priori (e.g. MoE models whose name does not reflect effective size).
MODEL_SIZE_ESTIMATES = {
    # Llama family
    "meta-llama/Llama-3.1-8B-Instruct": 8,
    "meta-llama/Llama-3.1-70B-Instruct": 70,
    "meta-llama/Llama-3.2-1B-Instruct": 1,
    "meta-llama/Llama-3.2-3B-Instruct": 3,
    # Mistral family
    "mistralai/Mistral-7B-Instruct-v0.3": 7,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 47,  # MoE effective
    # Qwen family
    "Qwen/Qwen2.5-7B-Instruct": 7,
    "Qwen/Qwen2.5-14B-Instruct": 14,
    "Qwen/Qwen2.5-32B-Instruct": 32,
    "Qwen/Qwen2.5-72B-Instruct": 72,
}


def estimate_model_size(model_id: str) -> Optional[int]:
    """
    Estimate model size in billions of parameters from model ID.

    Lookup order: the known-model table first, then a best-effort parse of
    a size token such as "7B", "70B", or "1.5B" from the model name.
    Fractional sizes are rounded to the nearest integer to preserve the
    Optional[int] return type.
    Returns None if size cannot be determined.
    """
    if model_id is None:
        return None

    # Check known models first.
    if model_id in MODEL_SIZE_ESTIMATES:
        return MODEL_SIZE_ESTIMATES[model_id]

    # Extract a size token from the model name. The lookbehind forbids a
    # preceding digit or dot so "1.5B" parses as 1.5 rather than "5B" (the
    # old pattern's bug); the lookahead keeps the "B" from being the start
    # of a longer alphanumeric token (e.g. "8Bit").
    import re
    match = re.search(r"(?<![\d.])(\d+(?:\.\d+)?)B(?![0-9A-Za-z])", model_id, re.IGNORECASE)
    if match:
        return round(float(match.group(1)))

    return None
def should_quantize(model_id: str) -> str:
    """
    Decide whether a model should be quantized and pick the method.

    Returns one of "none", "int8", or "int4".
    """
    if model_id is None:
        return "none"

    # An explicit global setting overrides the size-based heuristic.
    override = config.default_quantization
    if override != "none":
        return override

    size = estimate_model_size(model_id)
    if size is None:
        # Size unknown — err on the side of full precision.
        return "none"

    # 70B-class models need INT4 to fit in 70GB VRAM; merely-large ones
    # (above the configured threshold) take INT8.
    if size > 65:
        return "int4"
    if size > config.auto_quantize_threshold_b:
        return "int8"
    return "none"