File size: 10,819 Bytes
f64b002
 
 
 
 
cd8f75a
 
 
f64b002
cd8f75a
f64b002
 
 
cd8f75a
 
f64b002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd8f75a
 
 
 
8c122be
89ed119
f64b002
 
 
 
 
 
 
 
5e59c7d
 
 
 
aa191f1
 
 
 
f64b002
 
 
 
 
269f729
cd8f75a
269f729
b3b36f7
 
aa0e151
f64b002
b3b36f7
 
 
 
f64b002
8a34489
 
1e6ab4d
aa191f1
3ef755a
b020cb9
3ef755a
b020cb9
 
 
 
aa191f1
3ef755a
b020cb9
1e6ab4d
 
3ef755a
 
 
1e6ab4d
f1e4efb
8a34489
bab1273
 
5e59c7d
 
 
 
 
 
 
 
 
bab1273
cd8f75a
1e6ab4d
aa191f1
4993f5e
8d66265
 
 
b3b36f7
 
 
 
e57e9d1
 
 
 
 
 
 
 
b3b36f7
cd8f75a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f64b002
 
cd8f75a
 
 
 
f64b002
 
 
 
 
 
 
 
1e6ab4d
 
 
 
 
 
 
 
 
 
 
 
 
aa191f1
1e6ab4d
 
 
 
aa191f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e6ab4d
 
 
 
 
 
 
 
 
 
 
aa191f1
1e6ab4d
 
 
 
 
 
 
 
 
 
 
 
f64b002
 
 
 
 
 
cd8f75a
3cb20f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
"""
Configuration management using pydantic-settings.
All settings are loaded from environment variables.
"""

import hashlib
import json
import logging
from functools import lru_cache
from pathlib import Path
from typing import Optional
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = logging.getLogger(__name__)


class Settings(BaseSettings):
    """Application settings loaded from environment variables."""
    
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore"
    )
    
    # Database
    database_url: str = "sqlite:////data/app.db"
    
    # Pipeline lock & model storage
    pipeline_lock_file: str = "/data/pipeline.lock"
    model_dir: str = "/data/models"
    
    # News sources
    newsapi_key: Optional[str] = None
    news_query: str = "copper OR copper price OR copper futures OR copper mining"
    news_language: str = "en"
    
    # Symbol set configuration
    symbol_set: str = "active"  # active | champion | challenger
    
    # Price data (yfinance) - Dashboard symbols (backward compatible)
    yfinance_symbols: str = "HG=F,DX-Y.NYB,CL=F,FXI,COPX,COPJ,BHP,FCX,SCCO,RIO,TECK,LUN.TO,IVN.TO,2899.HK"
    lookback_days: int = 730  # 2 years for better pattern learning
    
    # Fuzzy deduplication
    fuzzy_dedup_threshold: int = 85
    fuzzy_dedup_window_hours: int = 48
    
    # Sentiment aggregation
    sentiment_tau_hours: float = 12.0
    sentiment_missing_fill: float = 0.0
    sentiment_non_neutral_boost: float = 1.35
    sentiment_soft_neutral_polarity_threshold: float = 0.12
    sentiment_soft_neutral_max_mag: float = 0.25
    sentiment_soft_neutral_scale: float = 0.8
    sentiment_relevance_min: float = 0.35
    sentiment_escalate_conflict_threshold: float = 0.55
    sentiment_horizon_days: int = 5
    scoring_source: str = "news_processed"
    
    # API settings
    analysis_ttl_minutes: int = 30
    log_level: str = "INFO"
    
    # Futures vs Spot adjustment factor
    futures_spot_adjustment: float = 0.985
    
    # Scheduler (DEPRECATED in API - external scheduler only)
    # These are kept for backward compatibility but scheduler no longer runs in API
    schedule_time: str = "02:00"
    tz: str = "Europe/Istanbul"
    scheduler_enabled: bool = False  # Default to False - scheduler is external now
    
    # Redis Queue (for worker)
    redis_url: str = "redis://localhost:6379/0"
    
    # OpenRouter AI Commentary
    openrouter_api_key: Optional[str] = None
    # Deprecated - kept for backward compatibility
    openrouter_model: str = "arcee-ai/trinity-large-preview:free"
    # Scoring models:
    #   fast   → stepfun/step-3.5-flash:free  (196B MoE, 256K ctx, system prompt + JSON OK)
    #   reliable → mistralai/mistral-small-3.1-24b-instruct:free (128K ctx, 24B, reliable JSON)
    #   commentary → same as fast for balanced quality/speed
    # NOTE: google/gemma-3-4b-it:free fails on Google AI Studio (system prompt blocked).
    #        google/gemma-3n-e4b-it:free (nano) also blocks system prompts — do NOT use.
    openrouter_model_scoring: str = "stepfun/step-3.5-flash:free"
    openrouter_model_scoring_fast: Optional[str] = None
    openrouter_model_scoring_reliable: Optional[str] = "mistralai/mistral-small-3.1-24b-instruct:free"
    openrouter_model_commentary: str = "stepfun/step-3.5-flash:free"
    openrouter_rpm: int = 18
    openrouter_max_retries: int = 3
    # Free tier: 50 req/day. At 12 articles/chunk, 100 articles = ~9 chunks = ~9-18 req.
    # Keep well under the daily limit to avoid rate-limit cascades mid-run.
    max_llm_articles_per_run: int = 100
    openrouter_fallback_models: Optional[str] = None
    tokenizers_parallelism: str = "false"
    
    # Twelve Data (Live Price)
    twelvedata_api_key: Optional[str] = None

    # Inference sentiment adjustment (aggressive but capped)
    inference_sentiment_multiplier_max: float = 2.0
    inference_sentiment_multiplier_min: float = 0.5
    inference_sentiment_news_ref: int = 30
    inference_sentiment_power_ref: float = 0.20
    inference_tiny_signal_threshold: float = 0.0015
    inference_tiny_signal_floor: float = 0.0025
    inference_return_cap: float = 0.02
    
    # LLM Sentiment Analysis
    # Deprecated - kept for backward compatibility
    llm_sentiment_model: str = "arcee-ai/trinity-large-preview:free"
    
    # Pipeline trigger authentication
    pipeline_trigger_secret: Optional[str] = None
    
    # Faz 2: Market cut-off for news aggregation
    market_timezone: str = "America/New_York"  # NYSE timezone
    market_close_time: str = "16:00"  # 4 PM ET
    cutoff_buffer_minutes: int = 30  # Allow 30 min after close for late news

    # TFT-ASRO Deep Learning
    tft_enabled: bool = True
    tft_embedding_batch_size: int = 64
    tft_pca_dim: int = 32
    tft_embedding_backfill_days: int = 30
    tft_train_on_pipeline: bool = False
    nasdaq_data_link_api_key: Optional[str] = None
    
    def _load_symbol_set_file(self, set_name: str) -> Optional[dict]:
        """
        Load a symbol set from its JSON file.

        Args:
            set_name: Base name of the file under config/symbol_sets/
                (e.g. "active" -> active.json).

        Returns:
            The parsed dict if the file exists, parses, and contains a
            non-empty "symbols" list; None on any failure (logged, never
            raised, so callers can fall back to env-based symbols).
        """
        try:
            # Path relative to backend root (this module lives one level down)
            backend_root = Path(__file__).resolve().parent.parent
            symbol_file = backend_root / "config" / "symbol_sets" / f"{set_name}.json"
            
            if not symbol_file.exists():
                logger.warning("Symbol set file not found: %s", symbol_file)
                return None
            
            # Explicit encoding so reads behave identically on every platform.
            with open(symbol_file, encoding="utf-8") as f:
                data = json.load(f)
            
            symbols = data.get("symbols", [])
            if not symbols:
                logger.warning("Symbol set %s has empty symbols list", set_name)
                return None
            
            return data
            
        except Exception as e:
            logger.error("Error loading symbol set %s: %s", set_name, e)
            return None
    
    def _compute_symbols_hash(self, symbols: list[str]) -> str:
        """Compute deterministic hash of symbol list (order-independent)."""
        canonical = json.dumps(sorted(symbols), sort_keys=True)
        return f"sha256:{hashlib.sha256(canonical.encode()).hexdigest()[:16]}"
    
    @property
    def training_symbols(self) -> list[str]:
        """
        Symbols for ML training - loaded from symbol set file.
        Falls back to dashboard symbols on error.
        """
        data = self._load_symbol_set_file(self.symbol_set)
        if data:
            symbols = data.get("symbols", [])
            # Lazy %-args: formatting (including the hash) only happens
            # when INFO is actually emitted.
            logger.info(
                "Loaded training symbols from file: %s.json (%d) hash=%s",
                self.symbol_set,
                len(symbols),
                self._compute_symbols_hash(symbols),
            )
            return symbols
        
        # Fallback to env variable
        logger.warning("Falling back to YFINANCE_SYMBOLS for training")
        return self.symbols_list
    
    @property
    def training_symbols_source(self) -> str:
        """Source of training symbols for audit."""
        data = self._load_symbol_set_file(self.symbol_set)
        if data:
            return f"file:{self.symbol_set}.json"
        return "env:YFINANCE_SYMBOLS"
    
    @property
    def training_symbols_hash(self) -> str:
        """Hash of training symbols for audit."""
        return self._compute_symbols_hash(self.training_symbols)
    
    @property
    def symbols_list(self) -> list[str]:
        """
        Dashboard symbols - backward compatible with frontend.
        Always uses env variable (14 symbols).
        """
        return [s.strip() for s in self.yfinance_symbols.split(",") if s.strip()]
    
    @property
    def target_symbol(self) -> str:
        """Primary symbol for predictions (first in list)."""
        symbols = self.symbols_list
        return symbols[0] if symbols else "HG=F"

    @staticmethod
    def _first_non_empty(*values: Optional[str]) -> Optional[str]:
        """Return first non-empty (after strip) string value, or None."""
        for value in values:
            if value and value.strip():
                return value.strip()
        return None

    @property
    def resolved_scoring_model(self) -> str:
        """Preferred scoring model with backward-compatible fallback chain."""
        return (
            self._first_non_empty(
                self.openrouter_model_scoring_fast,
                self.openrouter_model_scoring,
                self.llm_sentiment_model,
                self.openrouter_model,
            )
            or "arcee-ai/trinity-large-preview:free"
        )

    @property
    def resolved_scoring_fast_model(self) -> str:
        """Fast model used for primary sentiment scoring."""
        return self.resolved_scoring_model

    @property
    def resolved_scoring_reliable_model(self) -> str:
        """Reliable model used for escalation/retry on malformed outputs."""
        return (
            self._first_non_empty(
                self.openrouter_model_scoring_reliable,
                self.openrouter_model,
                self.llm_sentiment_model,
                self.openrouter_model_scoring,
            )
            or "arcee-ai/trinity-large-preview:free"
        )

    @property
    def resolved_commentary_model(self) -> str:
        """Preferred commentary model with backward-compatible fallback chain."""
        return (
            self._first_non_empty(
                self.openrouter_model_commentary,
                self.openrouter_model,
                self.llm_sentiment_model,
            )
            or "arcee-ai/trinity-large-preview:free"
        )

    @property
    def openrouter_fallback_models_list(self) -> list[str]:
        """
        Parse comma-separated fallback models.
        Empty/whitespace items are ignored.
        """
        if not self.openrouter_fallback_models:
            return []
        return [m.strip() for m in self.openrouter_fallback_models.split(",") if m.strip()]


@lru_cache(maxsize=None)
def get_settings() -> Settings:
    """Build the application Settings once and return the cached instance."""
    return Settings()


def mask_api_key(text: str, settings: Optional["Settings"] = None) -> str:
    """
    Mask API keys in text to prevent leaking in logs.

    Replaces every configured secret (Twelve Data, OpenRouter, NewsAPI,
    pipeline trigger) with a head...tail stub, and additionally blanks
    out any ``apikey=`` query parameter even when the key is not one of
    the configured secrets.

    Args:
        text: Arbitrary text (log line, URL, error message) to sanitize.
        settings: Settings instance to read secrets from; when None, the
            cached global settings from get_settings() are used.

    Returns:
        The sanitized text (unchanged if nothing matched).
    """
    import re
    
    if settings is None:
        settings = get_settings()
    
    result = text
    
    # Mask known API keys. Keys of 8 chars or fewer are skipped because
    # the 4+4 head/tail stub would reveal the entire value.
    keys_to_mask = [
        settings.twelvedata_api_key,
        settings.openrouter_api_key,
        settings.newsapi_key,
        settings.pipeline_trigger_secret,
    ]
    
    for key in keys_to_mask:
        if key and len(key) > 8:
            masked = f"{key[:4]}...{key[-4:]}"
            result = result.replace(key, masked)
    
    # Also mask any apikey= query params (covers keys not in settings)
    result = re.sub(r'apikey=[a-zA-Z0-9_-]+', 'apikey=***MASKED***', result)
    
    return result