"""Configuration and environment handling for ZeroGPU Space.""" import os import logging from dataclasses import dataclass, field from typing import Optional from dotenv import load_dotenv load_dotenv() logger = logging.getLogger(__name__) @dataclass class Config: """Application configuration loaded from environment.""" # HuggingFace token for gated models hf_token: Optional[str] = field(default_factory=lambda: os.getenv("HF_TOKEN")) # Fallback to HF Serverless when ZeroGPU quota exhausted fallback_enabled: bool = field( default_factory=lambda: os.getenv("FALLBACK_ENABLED", "true").lower() == "true" ) # Logging level log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO")) # Quantization settings default_quantization: str = field( default_factory=lambda: os.getenv("DEFAULT_QUANTIZATION", "none") ) auto_quantize_threshold_b: int = field( default_factory=lambda: int(os.getenv("AUTO_QUANTIZE_THRESHOLD_B", "34")) ) def __post_init__(self): """Configure logging after initialization.""" logging.basicConfig( level=getattr(logging, self.log_level.upper(), logging.INFO), format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) @dataclass class QuotaTracker: """Track ZeroGPU quota usage for the current session.""" # Total seconds used in current day seconds_used: float = 0.0 # Daily quota in seconds (PRO plan: 25 min = 1500 sec) daily_quota_seconds: float = 1500.0 # Whether quota is exhausted quota_exhausted: bool = False def add_usage(self, seconds: float) -> None: """Record GPU usage time.""" self.seconds_used += seconds if self.seconds_used >= self.daily_quota_seconds: self.quota_exhausted = True logger.warning( f"ZeroGPU quota exhausted: {self.seconds_used:.1f}s / {self.daily_quota_seconds:.1f}s" ) def remaining_seconds(self) -> float: """Get remaining quota in seconds.""" return max(0, self.daily_quota_seconds - self.seconds_used) def remaining_minutes(self) -> float: """Get remaining quota in minutes.""" return self.remaining_seconds() / 60.0 def reset(self) -> None: """Reset quota (called at day boundary).""" self.seconds_used = 0.0 self.quota_exhausted = False logger.info("ZeroGPU quota reset") # Global configuration instance config = Config() # Global quota tracker quota_tracker = QuotaTracker() def get_config() -> Config: """Get the global configuration instance.""" return config def get_quota_tracker() -> QuotaTracker: """Get the global quota tracker instance.""" return quota_tracker # Model size estimates (parameters in billions) MODEL_SIZE_ESTIMATES = { # Llama family "meta-llama/Llama-3.1-8B-Instruct": 8, "meta-llama/Llama-3.1-70B-Instruct": 70, "meta-llama/Llama-3.2-1B-Instruct": 1, "meta-llama/Llama-3.2-3B-Instruct": 3, # Mistral family "mistralai/Mistral-7B-Instruct-v0.3": 7, "mistralai/Mixtral-8x7B-Instruct-v0.1": 47, # MoE effective # Qwen family "Qwen/Qwen2.5-7B-Instruct": 7, "Qwen/Qwen2.5-14B-Instruct": 14, "Qwen/Qwen2.5-32B-Instruct": 32, "Qwen/Qwen2.5-72B-Instruct": 72, } def estimate_model_size(model_id: str) -> Optional[int]: """ Estimate model size in billions of parameters from model ID. Returns None if size cannot be determined. """ if model_id is None: return None # Check known models first if model_id in MODEL_SIZE_ESTIMATES: return MODEL_SIZE_ESTIMATES[model_id] # Try to extract size from model name (e.g., "7B", "70B", "14B") import re match = re.search(r"(\d+)B", model_id, re.IGNORECASE) if match: return int(match.group(1)) return None def should_quantize(model_id: str) -> str: """ Determine if a model should be quantized and which method to use. Returns: "none", "int8", or "int4" """ if model_id is None: return "none" if config.default_quantization != "none": return config.default_quantization size = estimate_model_size(model_id) if size is None: # Unknown size, don't auto-quantize return "none" if size > 65: # 70B+ models need INT4 to fit in 70GB VRAM return "int4" elif size > config.auto_quantize_threshold_b: # Large models get INT8 return "int8" return "none"