Spaces:
Sleeping
Sleeping
File size: 4,591 Bytes
adcb9bd 5ea35f6 adcb9bd 5ea35f6 adcb9bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
"""Configuration and environment handling for ZeroGPU Space."""
import os
import logging
from dataclasses import dataclass, field
from typing import Optional
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment so the os.getenv lookups performed by Config can see them.
load_dotenv()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def _read_bool_env(name: str, default: str) -> bool:
    """Interpret environment variable *name* as a boolean flag.

    Accepts "1", "true", "yes" and "on" (any case, surrounding whitespace
    ignored) as true; every other value, including an empty string, is false.
    *default* is the raw string used when the variable is unset.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "on"}


def _read_int_env(name: str, default: int) -> int:
    """Interpret environment variable *name* as an integer.

    Falls back to *default* (with a warning) when the value is not a valid
    integer, so a typo in the environment does not crash the import.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        logging.getLogger(__name__).warning(
            "Invalid integer for %s=%r; using default %d", name, raw, default
        )
        return default


@dataclass
class Config:
    """Application configuration loaded from environment.

    Every field defaults to the value of an environment variable, read when
    the instance is created; explicit constructor arguments override them.
    Creating an instance also configures the root logger (see __post_init__).
    """

    # HuggingFace token for gated models (HF_TOKEN; None when unset)
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv("HF_TOKEN"))
    # Fallback to HF Serverless when ZeroGPU quota exhausted (FALLBACK_ENABLED)
    fallback_enabled: bool = field(
        default_factory=lambda: _read_bool_env("FALLBACK_ENABLED", "true")
    )
    # Logging level name (LOG_LEVEL), e.g. "DEBUG" or "INFO"
    log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO"))
    # Quantization settings
    # DEFAULT_QUANTIZATION: "none" means "decide per model"
    default_quantization: str = field(
        default_factory=lambda: os.getenv("DEFAULT_QUANTIZATION", "none")
    )
    # AUTO_QUANTIZE_THRESHOLD_B: model size (billions of parameters) above
    # which quantization is applied automatically
    auto_quantize_threshold_b: int = field(
        default_factory=lambda: _read_int_env("AUTO_QUANTIZE_THRESHOLD_B", 34)
    )

    def __post_init__(self) -> None:
        """Normalize settings and configure logging after initialization."""
        # Normalize so "NONE", " int8 " or an empty value compare as expected
        # against the lowercase method names used elsewhere.
        self.default_quantization = (
            (self.default_quantization or "none").strip().lower() or "none"
        )

        # Only accept real numeric levels; unknown names (or names that resolve
        # to a non-level attribute of the logging module) fall back to INFO.
        level = getattr(logging, str(self.log_level).strip().upper(), None)
        if not isinstance(level, int):
            level = logging.INFO
        logging.basicConfig(
            level=level,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
@dataclass
class QuotaTracker:
    """Track ZeroGPU quota usage for the current session.

    Usage is accumulated through add_usage(); nothing here resets it
    automatically, so the owner must call reset() at the day boundary.
    """

    # Total seconds used in current day
    seconds_used: float = 0.0
    # Daily quota in seconds (PRO plan: 25 min = 1500 sec)
    daily_quota_seconds: float = 1500.0
    # Whether quota is exhausted (set by add_usage, cleared by reset)
    quota_exhausted: bool = False

    def add_usage(self, seconds: float) -> None:
        """Record GPU usage time.

        Adds *seconds* to the running total and flags the quota as exhausted
        (logging a warning) once the total reaches the daily quota.
        """
        self.seconds_used += seconds
        if self.seconds_used >= self.daily_quota_seconds:
            self.quota_exhausted = True
            logger.warning(
                f"ZeroGPU quota exhausted: {self.seconds_used:.1f}s / {self.daily_quota_seconds:.1f}s"
            )

    def remaining_seconds(self) -> float:
        """Get remaining quota in seconds, never below 0.0."""
        # Clamp with a float so the result is always a float, matching the
        # annotation (max(0, ...) would hand back the int 0 when over quota).
        return max(0.0, float(self.daily_quota_seconds - self.seconds_used))

    def remaining_minutes(self) -> float:
        """Get remaining quota in minutes."""
        return self.remaining_seconds() / 60.0

    def reset(self) -> None:
        """Reset quota (called at day boundary)."""
        self.seconds_used = 0.0
        self.quota_exhausted = False
        logger.info("ZeroGPU quota reset")
# Global configuration instance, created once at import time: the environment
# is read here and Config.__post_init__ configures logging as a side effect.
config = Config()
# Global quota tracker, starting with no recorded usage and the default quota.
quota_tracker = QuotaTracker()
def get_config() -> Config:
    """Return the module-wide Config instance created at import time."""
    return config
def get_quota_tracker() -> QuotaTracker:
    """Return the module-wide QuotaTracker instance created at import time."""
    return quota_tracker
# Model size estimates (parameters in billions)
MODEL_SIZE_ESTIMATES = {
    # Llama family
    "meta-llama/Llama-3.1-8B-Instruct": 8,
    "meta-llama/Llama-3.1-70B-Instruct": 70,
    "meta-llama/Llama-3.2-1B-Instruct": 1,
    "meta-llama/Llama-3.2-3B-Instruct": 3,
    # Mistral family
    "mistralai/Mistral-7B-Instruct-v0.3": 7,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 47,  # MoE effective
    # Qwen family
    "Qwen/Qwen2.5-7B-Instruct": 7,
    "Qwen/Qwen2.5-14B-Instruct": 14,
    "Qwen/Qwen2.5-32B-Instruct": 32,
    "Qwen/Qwen2.5-72B-Instruct": 72,
}


def estimate_model_size(model_id: Optional[str]) -> Optional[int]:
    """
    Estimate model size in billions of parameters from model ID.

    Known models are looked up in MODEL_SIZE_ESTIMATES. Otherwise the first
    size tag in the name is parsed:

    - "7B", "70b"    -> 7, 70
    - "0.5B", "6.9b" -> rounded up to whole billions (1, 7)
    - "8x22B"        -> experts * size (176); an upper bound, since real MoE
                        checkpoints can be smaller (the table lists
                        Mixtral-8x7B as 47, not 56)
    - "4bit", "8bit" -> not a size tag, ignored

    Returns None if size cannot be determined.
    """
    if model_id is None:
        return None

    # Check known models first
    known_size = MODEL_SIZE_ESTIMATES.get(model_id)
    if known_size is not None:
        return known_size

    import math
    import re

    # Optional "<experts>x" prefix, an integer or decimal size, then "B" that
    # is not the start of a longer word such as "bit" or "base".
    match = re.search(
        r"(?:(\d+)x)?(\d+(?:\.\d+)?)B(?![A-Za-z])", model_id, re.IGNORECASE
    )
    if match is None:
        return None

    expert_count, size_text = match.groups()
    experts = max(int(expert_count), 1) if expert_count else 1
    # Round up so fractional sizes never under-estimate (and 0.5B is not 0).
    return math.ceil(experts * float(size_text))
def should_quantize(model_id: Optional[str]) -> str:
    """
    Determine if a model should be quantized and which method to use.

    A configured default quantization other than "none" always wins.
    Otherwise the decision is based on the estimated model size: above 65B
    gives "int4", above config.auto_quantize_threshold_b gives "int8", and
    anything smaller or of unknown size gives "none".

    Returns: "none", "int8", or "int4" (or the configured default method,
    lowercased)
    """
    if model_id is None:
        return "none"

    # Normalize before comparing so "NONE", " none " or an empty value are not
    # mistaken for an explicit quantization method.
    forced = (config.default_quantization or "none").strip().lower()
    if forced and forced != "none":
        return forced

    size = estimate_model_size(model_id)
    if size is None:
        # Unknown size, don't auto-quantize
        return "none"

    if size > 65:
        # 70B+ models need INT4 to fit in 70GB VRAM
        return "int4"
    if size > config.auto_quantize_threshold_b:
        # Large models get INT8
        return "int8"
    return "none"
|