# NOTE(review): removed "Spaces / Sleeping" page-chrome text captured by the
# HTML-to-text extraction; it is not part of the module source.
"""Configuration and environment handling for ZeroGPU Space."""
import os
import logging
from dataclasses import dataclass, field
from typing import Optional

from dotenv import load_dotenv

# Populate os.environ from a local .env file, if present (no-op otherwise).
# Must run before Config is instantiated so its default_factory lambdas see
# the loaded values.
load_dotenv()

# Module-level logger; root handler configuration happens in Config.__post_init__.
logger = logging.getLogger(__name__)
@dataclass
class Config:
    """Application configuration loaded from environment variables.

    Values are read from the process environment at instantiation time via
    ``field(default_factory=...)``, so ``load_dotenv()`` must already have run.

    NOTE: the original class used ``field(...)`` and ``__post_init__`` but was
    missing the ``@dataclass`` decorator, so attributes were raw ``Field``
    objects and logging was never configured.
    """

    # HuggingFace token for gated models; None when HF_TOKEN is unset.
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv("HF_TOKEN"))

    # Fall back to HF Serverless inference when the ZeroGPU quota is exhausted.
    fallback_enabled: bool = field(
        default_factory=lambda: os.getenv("FALLBACK_ENABLED", "true").lower() == "true"
    )

    # Root logging level name (e.g. "INFO", "DEBUG"); invalid names fall back
    # to INFO in __post_init__.
    log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO"))

    # Forced quantization method ("none", "int8", "int4"); "none" lets
    # should_quantize() auto-select based on model size.
    default_quantization: str = field(
        default_factory=lambda: os.getenv("DEFAULT_QUANTIZATION", "none")
    )

    # Auto-quantize models larger than this many billions of parameters.
    auto_quantize_threshold_b: int = field(
        default_factory=lambda: int(os.getenv("AUTO_QUANTIZE_THRESHOLD_B", "34"))
    )

    def __post_init__(self) -> None:
        """Configure root logging after the config values are loaded."""
        logging.basicConfig(
            level=getattr(logging, self.log_level.upper(), logging.INFO),
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        )
@dataclass
class QuotaTracker:
    """Track ZeroGPU quota usage for the current session.

    Decorated with ``@dataclass`` (consistent with ``Config``) so each
    tracker instance gets its own counters, a keyword constructor, and a
    useful ``repr`` — previously the annotated defaults were shared class
    attributes on a plain class.
    """

    # Total GPU seconds consumed in the current day.
    seconds_used: float = 0.0

    # Daily quota in seconds (PRO plan: 25 min = 1500 sec).
    daily_quota_seconds: float = 1500.0

    # Latched True once seconds_used reaches daily_quota_seconds.
    quota_exhausted: bool = False

    def add_usage(self, seconds: float) -> None:
        """Record GPU usage time; flag and log exhaustion when the quota is hit."""
        self.seconds_used += seconds
        if self.seconds_used >= self.daily_quota_seconds:
            self.quota_exhausted = True
            logger.warning(
                f"ZeroGPU quota exhausted: {self.seconds_used:.1f}s / {self.daily_quota_seconds:.1f}s"
            )

    def remaining_seconds(self) -> float:
        """Get remaining quota in seconds (clamped at 0.0, never negative)."""
        return max(0.0, self.daily_quota_seconds - self.seconds_used)

    def remaining_minutes(self) -> float:
        """Get remaining quota in minutes."""
        return self.remaining_seconds() / 60.0

    def reset(self) -> None:
        """Reset quota (called at day boundary)."""
        self.seconds_used = 0.0
        self.quota_exhausted = False
        logger.info("ZeroGPU quota reset")
# Global configuration instance (module-level singleton; created at import
# time, which also configures logging via Config.__post_init__).
config = Config()

# Global quota tracker (module-level singleton; shared across the session).
quota_tracker = QuotaTracker()


def get_config() -> Config:
    """Get the global configuration instance."""
    return config


def get_quota_tracker() -> QuotaTracker:
    """Get the global quota tracker instance."""
    return quota_tracker
# Model size estimates (parameters in billions), keyed by exact HF model ID.
MODEL_SIZE_ESTIMATES = {
    # Llama family
    "meta-llama/Llama-3.1-8B-Instruct": 8,
    "meta-llama/Llama-3.1-70B-Instruct": 70,
    "meta-llama/Llama-3.2-1B-Instruct": 1,
    "meta-llama/Llama-3.2-3B-Instruct": 3,
    # Mistral family
    "mistralai/Mistral-7B-Instruct-v0.3": 7,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 47,  # MoE effective
    # Qwen family
    "Qwen/Qwen2.5-7B-Instruct": 7,
    "Qwen/Qwen2.5-14B-Instruct": 14,
    "Qwen/Qwen2.5-32B-Instruct": 32,
    "Qwen/Qwen2.5-72B-Instruct": 72,
}


def estimate_model_size(model_id: Optional[str]) -> Optional[int]:
    """
    Estimate model size in billions of parameters from model ID.

    Checks the known-model table first, then falls back to parsing a size
    suffix such as "7B", "70B" or "1.5B" from the model name. Fractional
    sizes are rounded to the nearest whole billion (minimum 1).

    Returns None if size cannot be determined.
    """
    if model_id is None:
        return None
    # Check known models first
    known = MODEL_SIZE_ESTIMATES.get(model_id)
    if known is not None:
        return known
    import re  # local import preserved from the original implementation

    # Require a word boundary after the "B" so quantization suffixes such as
    # "4bit" are not mistaken for a parameter count, and allow "1.5B"-style
    # fractional sizes, which the previous pattern r"(\d+)B" misread as "5B".
    match = re.search(r"(\d+(?:\.\d+)?)\s*B\b", model_id, re.IGNORECASE)
    if match:
        return max(1, round(float(match.group(1))))
    return None
def should_quantize(model_id: Optional[str]) -> str:
    """
    Determine if a model should be quantized and which method to use.

    Reads the global `config`: an explicit DEFAULT_QUANTIZATION override
    always wins; otherwise the choice is based on the estimated model size.

    Returns: "none", "int8", or "int4"
    """
    if model_id is None:
        return "none"
    # Operator-configured override takes precedence over auto-selection.
    if config.default_quantization != "none":
        return config.default_quantization
    size = estimate_model_size(model_id)
    if size is None:
        # Unknown size, don't auto-quantize
        return "none"
    if size > 65:
        # 70B+ models need INT4 to fit in 70GB VRAM
        return "int4"
    elif size > config.auto_quantize_threshold_b:
        # Large models get INT8 (threshold from AUTO_QUANTIZE_THRESHOLD_B)
        return "int8"
    return "none"