File size: 6,140 Bytes
9024ad9
 
 
2ed2bd7
10a33a6
 
9024ad9
 
 
 
2ed2bd7
9024ad9
64e4f7a
ae3b1b5
d3f36f7
2ed2bd7
9024ad9
 
2aa2b79
9024ad9
7ab470d
 
 
 
 
2ed2bd7
9024ad9
 
29ed661
2ed2bd7
9024ad9
 
2aa2b79
 
2ed2bd7
9024ad9
2aa2b79
 
2ed2bd7
0b6e76d
6e48ad3
2ed2bd7
 
 
 
 
 
 
 
 
0b6e76d
 
 
2ed2bd7
0b6e76d
2ed2bd7
 
 
 
1b76b21
 
2ed2bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93c9664
 
29ed661
 
 
93c9664
 
 
 
 
 
 
fe47248
93c9664
fe47248
93c9664
 
29ed661
 
 
 
 
93c9664
 
29ed661
 
 
 
 
93c9664
 
 
 
 
 
7fff563
 
 
 
 
93c9664
2ed2bd7
2aa2b79
 
2ed2bd7
2aa2b79
2ed2bd7
2aa2b79
2ed2bd7
7ab470d
 
 
 
 
 
 
 
9024ad9
 
 
29ed661
9024ad9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Configuration management for the text summarizer backend.
"""

from pydantic import AliasChoices, Field, field_validator, validator
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables and ``.env``.

    Every field is populated from the environment variable whose name equals
    the field name, compared case-insensitively (``ollama_model`` is read
    from ``OLLAMA_MODEL``, and so on). pydantic-settings v2 does not honour
    the legacy ``Field(env=...)`` keyword, so the two fields whose variable
    name differs from the field name declare it through ``validation_alias``:

    * ``hf_cache_dir`` also reads ``HF_HOME``
    * ``scraping_user_agent_rotation`` also reads ``SCRAPING_UA_ROTATION``

    The field name is listed first in each alias, so the name-derived
    variable (and a keyword argument passed to ``Settings(...)``) keeps
    priority over the extra variable.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        case_sensitive=False,
        # Ignore extra fields from environment (e.g., old v4_phi_* variables)
        extra="ignore",
    )

    # Ollama Configuration
    ollama_model: str = Field(default="llama3.2:1b")
    # NOTE(review): 0.0.0.0 is normally a bind address rather than a connect
    # target; confirm this default is intended for the Ollama client.
    ollama_host: str = Field(default="http://0.0.0.0:11434")
    ollama_timeout: int = Field(default=60, ge=1)

    # Server Configuration
    server_host: str = Field(default="127.0.0.1")
    server_port: int = Field(default=8000, ge=1, le=65535)
    log_level: str = Field(default="INFO")
    log_format: str = Field(
        default="auto",
        description="Log format: 'json' for structured logs, 'text' for colored output, 'auto' for environment-based selection",
    )

    # Optional: API Security
    api_key_enabled: bool = Field(default=False)
    api_key: str | None = Field(default=None)

    # Optional: Rate Limiting
    rate_limit_enabled: bool = Field(default=False)
    rate_limit_requests: int = Field(default=60, ge=1)
    rate_limit_window: int = Field(default=60, ge=1)

    # Input validation
    max_text_length: int = Field(default=32000, ge=1)  # ~32KB
    max_tokens_default: int = Field(default=256, ge=1)

    # V2 HuggingFace Configuration
    hf_model_id: str = Field(default="sshleifer/distilbart-cnn-6-6")
    hf_device_map: str = Field(default="auto")  # "auto" for GPU fallback to CPU
    hf_torch_dtype: str = Field(default="auto")  # "auto" for automatic dtype selection
    # HuggingFace cache directory; read from HF_CACHE_DIR or, failing that, HF_HOME.
    hf_cache_dir: str = Field(
        default="/tmp/huggingface",
        validation_alias=AliasChoices("hf_cache_dir", "HF_HOME"),
    )
    hf_max_new_tokens: int = Field(default=128, ge=1, le=2048)
    hf_temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    hf_top_p: float = Field(default=0.95, ge=0.0, le=1.0)

    # V1/V2 Warmup Control
    enable_v1_warmup: bool = Field(default=False)  # Disable V1 warmup by default
    enable_v2_warmup: bool = Field(
        default=False
    )  # Disable V2 warmup to save memory for V4

    # V3 Web Scraping Configuration
    enable_v3_scraping: bool = Field(
        default=True, description="Enable V3 web scraping API"
    )
    scraping_timeout: int = Field(
        default=10,
        ge=1,
        le=60,
        description="HTTP timeout for scraping requests (seconds)",
    )
    scraping_max_text_length: int = Field(
        default=50000,
        description="Maximum text length to extract (chars)",
    )
    scraping_cache_enabled: bool = Field(
        default=True,
        description="Enable in-memory caching of scraped content",
    )
    scraping_cache_ttl: int = Field(
        default=3600,
        description="Cache TTL in seconds (default: 1 hour)",
    )
    # Read from SCRAPING_USER_AGENT_ROTATION or, failing that, SCRAPING_UA_ROTATION.
    scraping_user_agent_rotation: bool = Field(
        default=True,
        validation_alias=AliasChoices(
            "scraping_user_agent_rotation", "SCRAPING_UA_ROTATION"
        ),
        description="Enable user-agent rotation",
    )
    scraping_rate_limit_per_minute: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Max scraping requests per minute per IP",
    )

    # V4 Structured Output Configuration
    enable_v4_structured: bool = Field(
        default=True,
        description="Enable V4 structured summarization API",
    )
    enable_v4_warmup: bool = Field(
        default=False,
        description="Enable V4 model warmup on startup (uses 1-2GB RAM with quantization)",
    )
    v4_model_id: str = Field(
        default="Qwen/Qwen2.5-1.5B-Instruct",
        description="Model ID for V4 structured output (1.5B params, fits HF 16GB limit)",
    )
    v4_max_tokens: int = Field(
        default=256,
        ge=128,
        le=2048,
        description="Max tokens for V4 generation",
    )
    v4_temperature: float = Field(
        default=0.2,
        ge=0.0,
        le=2.0,
        description="Temperature for V4 (low for stable JSON)",
    )
    # NOTE(review): the two descriptions below disagree on the quantization
    # scheme (INT8 vs 4-bit); confirm which one the V4 loader actually uses.
    v4_enable_quantization: bool = Field(
        default=True,
        description="Enable INT8 quantization for V4 model (reduces memory from ~2GB to ~1GB). Quantization takes ~1-2 minutes on startup.",
    )
    v4_use_fp16_for_speed: bool = Field(
        default=False,
        description="Use FP16 instead of 4-bit quantization for 2-3x faster inference (uses ~2-3GB GPU memory instead of ~1GB)",
    )

    @field_validator("log_level")
    @classmethod
    def validate_log_level(cls, v: str) -> str:
        """Normalize the log level to upper case.

        Args:
            v: The already type-validated ``log_level`` string.

        Returns:
            The upper-cased level when it is one of the standard logging
            levels; otherwise ``"INFO"``. Unknown values are replaced rather
            than rejected so a typo in the environment cannot stop startup.
        """
        level = v.upper()
        if level in ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"):
            return level
        return "INFO"  # Default to INFO for invalid levels

    @field_validator("log_format")
    @classmethod
    def validate_log_format(cls, v: str) -> str:
        """Normalize the log format to lower case.

        Args:
            v: The already type-validated ``log_format`` string.

        Returns:
            The lower-cased format when it is ``"auto"``, ``"json"`` or
            ``"text"``; otherwise ``"auto"``.
        """
        fmt = v.lower()
        if fmt in ("auto", "json", "text"):
            return fmt
        return "auto"  # Default to auto for invalid formats


# Global settings instance, constructed once when this module is imported.
# Other modules are expected to import this object rather than build their own.
settings = Settings()