File size: 4,591 Bytes
adcb9bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ea35f6
 
 
adcb9bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ea35f6
 
 
adcb9bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""Configuration and environment handling for ZeroGPU Space."""

import logging
import os
import re
from dataclasses import dataclass, field
from typing import Optional

from dotenv import load_dotenv

load_dotenv()

logger = logging.getLogger(__name__)


@dataclass
class Config:
    """Application configuration loaded from environment.

    Every field is resolved at instantiation time via ``default_factory``,
    so each ``Config()`` reflects the current contents of ``os.environ``.
    """

    # HuggingFace token for gated models; None when HF_TOKEN is unset.
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv("HF_TOKEN"))

    # Fallback to HF Serverless when ZeroGPU quota exhausted.
    # Accepts the common truthy spellings ("true", "1", "yes", "on",
    # case-insensitive); any other value disables the fallback.
    fallback_enabled: bool = field(
        default_factory=lambda: os.getenv("FALLBACK_ENABLED", "true").strip().lower()
        in ("true", "1", "yes", "on")
    )

    # Logging level name (e.g. "DEBUG", "INFO"); applied in __post_init__.
    log_level: str = field(default_factory=lambda: os.getenv("LOG_LEVEL", "INFO"))

    # Quantization settings
    default_quantization: str = field(
        default_factory=lambda: os.getenv("DEFAULT_QUANTIZATION", "none")
    )
    # Model size (billions of params) above which auto-quantization kicks in.
    auto_quantize_threshold_b: int = field(
        default_factory=lambda: int(os.getenv("AUTO_QUANTIZE_THRESHOLD_B", "34"))
    )

    def __post_init__(self):
        """Configure logging after initialization.

        Unknown LOG_LEVEL names fall back to INFO rather than raising.
        Note: logging.basicConfig is a no-op when the root logger already
        has handlers, so the first Config() created in a process wins.
        """
        logging.basicConfig(
            level=getattr(logging, self.log_level.upper(), logging.INFO),
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )


@dataclass
class QuotaTracker:
    """Track ZeroGPU quota usage for the current session."""

    # GPU seconds consumed so far today.
    seconds_used: float = 0.0

    # Daily allowance in seconds (PRO plan: 25 min = 1500 sec).
    daily_quota_seconds: float = 1500.0

    # Flipped to True once usage reaches the daily allowance.
    quota_exhausted: bool = False

    def add_usage(self, seconds: float) -> None:
        """Record GPU usage time and flag exhaustion once the cap is hit."""
        self.seconds_used = self.seconds_used + seconds
        if not (self.seconds_used < self.daily_quota_seconds):
            self.quota_exhausted = True
            logger.warning(
                f"ZeroGPU quota exhausted: {self.seconds_used:.1f}s / {self.daily_quota_seconds:.1f}s"
            )

    def remaining_seconds(self) -> float:
        """Get remaining quota in seconds (never negative)."""
        leftover = self.daily_quota_seconds - self.seconds_used
        return leftover if leftover > 0 else 0

    def remaining_minutes(self) -> float:
        """Get remaining quota in minutes."""
        return self.remaining_seconds() / 60.0

    def reset(self) -> None:
        """Reset quota (called at day boundary)."""
        self.quota_exhausted = False
        self.seconds_used = 0.0
        logger.info("ZeroGPU quota reset")


# Global configuration instance; prefer get_config() for read access.
config = Config()

# Global quota tracker instance; prefer get_quota_tracker() for read access.
quota_tracker = QuotaTracker()


def get_config() -> Config:
    """Return the module-level Config singleton."""
    return config


def get_quota_tracker() -> QuotaTracker:
    """Return the module-level QuotaTracker singleton."""
    return quota_tracker


# Model size estimates (parameters in billions)
MODEL_SIZE_ESTIMATES = {
    # Llama family
    "meta-llama/Llama-3.1-8B-Instruct": 8,
    "meta-llama/Llama-3.1-70B-Instruct": 70,
    "meta-llama/Llama-3.2-1B-Instruct": 1,
    "meta-llama/Llama-3.2-3B-Instruct": 3,

    # Mistral family
    "mistralai/Mistral-7B-Instruct-v0.3": 7,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 47,  # MoE effective

    # Qwen family
    "Qwen/Qwen2.5-7B-Instruct": 7,
    "Qwen/Qwen2.5-14B-Instruct": 14,
    "Qwen/Qwen2.5-32B-Instruct": 32,
    "Qwen/Qwen2.5-72B-Instruct": 72,
}


def estimate_model_size(model_id: str) -> Optional[int]:
    """
    Estimate model size in billions of parameters from model ID.

    Looks the ID up in MODEL_SIZE_ESTIMATES first (which also covers MoE
    models whose name understates the effective parameter count), then
    falls back to parsing a whole-number size suffix such as "7B" or
    "70B" out of the model name.

    Returns None if size cannot be determined.
    """
    if model_id is None:
        return None

    # Check known models first
    if model_id in MODEL_SIZE_ESTIMATES:
        return MODEL_SIZE_ESTIMATES[model_id]

    # Try to extract size from model name (e.g. "7B", "70B", "14b").
    # The lookbehind rejects fractional sizes ("1.5B" must not match as
    # "5B"); the trailing \b stops "B" glued to further letters from
    # being read as a size marker.
    match = re.search(r"(?<![\d.])(\d+)B\b", model_id, re.IGNORECASE)
    if match:
        return int(match.group(1))

    return None


def should_quantize(model_id: str) -> str:
    """
    Determine if a model should be quantized and which method to use.

    Returns: "none", "int8", or "int4"
    """
    if model_id is None:
        return "none"

    # An explicit configured method overrides all auto-selection.
    override = config.default_quantization
    if override != "none":
        return override

    size_b = estimate_model_size(model_id)
    if size_b is None:
        # Unknown size: leave the model unquantized rather than guess.
        return "none"

    # 70B+ models need INT4 to fit in 70GB VRAM; anything above the
    # configured threshold gets INT8; smaller models stay unquantized.
    if size_b > 65:
        return "int4"
    if size_b > config.auto_quantize_threshold_b:
        return "int8"
    return "none"