File size: 14,155 Bytes
2c3c5f5
320e29a
 
2c3c5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320e29a
 
 
 
 
 
 
 
 
 
 
2c3c5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320e29a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3c5f5
 
 
 
 
320e29a
 
2c3c5f5
 
320e29a
 
2c3c5f5
 
 
 
 
 
 
 
 
320e29a
 
 
 
2c3c5f5
320e29a
 
 
 
 
2c3c5f5
320e29a
2c3c5f5
 
320e29a
 
2c3c5f5
320e29a
 
2c3c5f5
 
 
 
 
 
 
320e29a
2c3c5f5
320e29a
2c3c5f5
320e29a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3c5f5
 
 
 
 
320e29a
2c3c5f5
 
 
 
320e29a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3c5f5
320e29a
2c3c5f5
320e29a
2c3c5f5
320e29a
 
 
 
 
 
 
2c3c5f5
 
320e29a
 
 
 
 
 
 
 
 
 
 
 
2c3c5f5
320e29a
 
 
 
2c3c5f5
 
 
 
 
 
320e29a
 
2c3c5f5
320e29a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3c5f5
320e29a
2c3c5f5
 
 
 
 
 
320e29a
2c3c5f5
320e29a
2c3c5f5
 
320e29a
2c3c5f5
 
320e29a
 
 
 
 
 
 
2c3c5f5
 
 
320e29a
2c3c5f5
 
 
 
 
320e29a
 
 
 
 
 
2c3c5f5
 
320e29a
2c3c5f5
320e29a
 
2c3c5f5
 
 
 
 
 
320e29a
 
2c3c5f5
 
 
 
320e29a
 
 
 
 
2c3c5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
320e29a
 
 
 
2c3c5f5
 
320e29a
 
 
2c3c5f5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
"""
Explainer — generates structured analyst reports using Groq API
(llama-3.3-70b-versatile, free tier: 14,400 req/day).
Falls back to a deterministic template if Groq is unavailable.

Reports are cached to data/reports/report_{date}.json.

Usage:
    python model/explainer.py --symbol ZW=F
    python model/explainer.py --all
"""

import json
import logging
import os
import sys
from datetime import date
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))
from model.predictor import predict, predict_all, SYMBOL_NAMES

# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)

# Directory holding the per-day report cache files (report_<date>.json).
# NOTE: the directory is created as a side effect of importing this module.
REPORTS_DIR = Path(__file__).parent.parent / "data" / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Model identifier passed to the Groq chat-completions call.
GROQ_MODEL = "llama-3.3-70b-versatile"

# Walk-forward backtest accuracy per symbol (from 3.5yr backtest, Jun 2026)
# Values are percentages; symbols missing here are reported with a 45.0 fallback.
BACKTEST_ACCURACY = {
    "CL=F": 37.1, "NG=F": 39.7, "GC=F": 51.9, "ZW=F": 37.3,
    "ZC=F": 48.8, "ZS=F": 47.3, "CT=F": 47.4, "SB=F": 37.4,
    "USDINR=X": 59.7, "HG=F": 43.7,
}

# Accuracy (percent) quoted in the Groq prompt only when the 7-day confidence
# is "HIGH". There are no entries for GC=F or HG=F; lookups use .get() so those
# symbols simply omit the sentence.
HIGH_CONF_ACCURACY = {
    "CL=F": 66.7, "NG=F": 82.3, "ZW=F": 56.9, "ZC=F": 100.0,
    "ZS=F": 75.0, "CT=F": 100.0, "SB=F": 58.4, "USDINR=X": 50.0,
}

# Lazily created Groq client, populated by _get_groq_client() on first success.
_groq_client = None


def _get_groq_client():
    """Return the shared Groq client, building it on first use.

    Returns:
        The cached client, or ``None`` when ``GROQ_API_KEY`` is unset/empty
        or the ``groq`` package cannot be imported.
    """
    global _groq_client
    if _groq_client is None:
        key = os.getenv("GROQ_API_KEY")
        if key:
            try:
                # Imported lazily so the module still works without the SDK.
                from groq import Groq
                _groq_client = Groq(api_key=key)
            except ImportError:
                return None
    return _groq_client


def _load_macro_context() -> dict:
    """Pull latest FRED macro row for context injection into the prompt.

    Returns:
        Dict with keys ``date``, ``dxy``, ``vix``, ``t10y``, ``fedfunds`` and
        ``indpro`` from the newest ``fred_data`` row whose ``dxy`` is not
        NULL, or an empty dict when there is no such row or the lookup fails.
        The query only filters NULLs on ``dxy``, so the other values may be
        ``None``.
    """
    conn = None
    try:
        from data.db import get_conn
        conn = get_conn()
        row = conn.execute(
            "SELECT date, dxy, vix, treasury_10y, fedfunds, indpro "
            "FROM fred_data WHERE dxy IS NOT NULL ORDER BY date DESC LIMIT 1"
        ).fetchone()
        if row:
            return {
                "date": str(row[0]), "dxy": row[1], "vix": row[2],
                "t10y": row[3], "fedfunds": row[4], "indpro": row[5],
            }
    except Exception as exc:
        # Macro context is optional prompt enrichment: never fail the report,
        # but leave a trace instead of swallowing the error silently.
        logging.getLogger(__name__).debug("FRED macro context unavailable: %s", exc)
    finally:
        # Release the connection even when the query itself raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as exc:
                logging.getLogger(__name__).debug(
                    "Closing FRED macro connection failed: %s", exc
                )
    return {}


def _load_cot_context(symbol: str) -> dict:
    """Pull latest COT row for a symbol.

    Args:
        symbol: Value matched against the ``symbol`` column of ``cot_data``.

    Returns:
        Dict with keys ``commercial_net_pct``, ``mm_net_pct``,
        ``commercial_chg_1w`` and ``mm_chg_1w`` from the newest matching row,
        or an empty dict when no row exists or the lookup fails.
    """
    conn = None
    try:
        from data.db import get_conn
        conn = get_conn()
        row = conn.execute(
            "SELECT commercial_net_pct, mm_net_pct, commercial_chg_1w, mm_chg_1w "
            "FROM cot_data WHERE symbol = ? ORDER BY date DESC LIMIT 1",
            [symbol]
        ).fetchone()
        if row:
            return {
                "commercial_net_pct": row[0], "mm_net_pct": row[1],
                "commercial_chg_1w": row[2], "mm_chg_1w": row[3],
            }
    except Exception as exc:
        # COT data is optional enrichment: degrade to "no context" rather than
        # failing the report, but record why.
        logging.getLogger(__name__).debug(
            "COT context unavailable for %s: %s", symbol, exc
        )
    finally:
        # Release the connection even when the query itself raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as exc:
                logging.getLogger(__name__).debug(
                    "Closing COT connection failed: %s", exc
                )
    return {}


def _load_eia_context(symbol: str) -> dict:
    """Pull latest EIA inventory for CL=F / NG=F.

    Args:
        symbol: Commodity symbol. Only ``"CL=F"`` (series ``crude_stocks``)
            and ``"NG=F"`` (series ``natgas_stocks``) are looked up.

    Returns:
        Dict with keys ``value``, ``chg_1w`` and ``vs_5yr_avg`` from the
        newest ``eia_inventory`` row of the series, or an empty dict for any
        other symbol, when no row exists, or when the lookup fails.
    """
    if symbol not in ("CL=F", "NG=F"):
        return {}
    conn = None
    try:
        from data.db import get_conn
        series = "crude_stocks" if symbol == "CL=F" else "natgas_stocks"
        conn = get_conn()
        row = conn.execute(
            "SELECT value, chg_1w, vs_5yr_avg FROM eia_inventory "
            "WHERE series = ? ORDER BY date DESC LIMIT 1",
            [series]
        ).fetchone()
        if row:
            return {"value": row[0], "chg_1w": row[1], "vs_5yr_avg": row[2]}
    except Exception as exc:
        # Inventory data is optional enrichment: degrade to "no context"
        # rather than failing the report, but record why.
        logging.getLogger(__name__).debug(
            "EIA context unavailable for %s: %s", symbol, exc
        )
    finally:
        # Release the connection even when the query itself raised.
        if conn is not None:
            try:
                conn.close()
            except Exception as exc:
                logging.getLogger(__name__).debug(
                    "Closing EIA connection failed: %s", exc
                )
    return {}


def _format_signals(signals: list[dict]) -> str:
    """Render the first five signals as numbered lines for the LLM prompt.

    Args:
        signals: Signal dicts; each may carry ``label`` (or ``feature`` as a
            fallback name), ``value``, ``impact`` and ``weight``. ``None`` is
            treated like an empty list.

    Returns:
        One line per signal joined by newlines, or ``"  (no signal data)"``
        when there is nothing to show.
    """
    lines = []
    for i, sig in enumerate((signals or [])[:5], 1):
        label = sig.get("label", sig.get("feature", "unknown"))
        # A key that is present but None bypasses the .get() default and would
        # make the numeric format specs raise TypeError, so coerce to 0.
        value = sig.get("value") or 0
        weight = sig.get("weight") or 0
        impact = sig.get("impact") or "NEUTRAL"
        lines.append(f"  {i}. {label}: {value:.3g} ({impact}, weight {weight:.3f})")
    return "\n".join(lines) if lines else "  (no signal data)"


def _pick_risk_factor(prediction: dict) -> str:
    """Choose the risk sentence fragment for a prediction.

    The label of the first ``top_signals`` entry whose ``impact`` is
    ``"BEARISH"`` wins. Without one, a canned per-symbol risk is returned,
    falling back to a generic macro risk for unknown symbols.
    """
    bearish_signals = list(
        filter(
            lambda entry: entry.get("impact") == "BEARISH",
            prediction.get("top_signals", []),
        )
    )
    if len(bearish_signals) > 0:
        first_bearish = bearish_signals[0]
        return first_bearish.get("label", "adverse signal reversal")

    # Canned risks keyed by ticker symbol, used when no bearish signal exists.
    canned_risks = dict([
        ("CL=F", "unexpected OPEC output increase or demand shock"),
        ("NG=F", "warmer-than-expected seasonal forecasts cutting demand"),
        ("GC=F", "stronger US jobs data reducing Fed cut expectations"),
        ("ZW=F", "favourable Black Sea weather easing supply concerns"),
        ("ZC=F", "USDA upward crop estimate revision"),
        ("ZS=F", "Brazil harvest exceeding expectations"),
        ("CT=F", "recovery in monsoon rainfall improving crop outlook"),
        ("SB=F", "Brazil supply-side recovery above estimates"),
        ("USDINR=X", "RBI unexpected rate cut or foreign inflow surge"),
        ("HG=F", "China industrial demand data disappointing"),
    ])
    ticker = prediction.get("symbol", "")
    if ticker in canned_risks:
        return canned_risks[ticker]
    return "unexpected macro policy reversal"


def _template_report(prediction: dict) -> dict:
    """Build the deterministic four-section report used when Groq is unavailable.

    Args:
        prediction: Forecast dict. Reads ``commodity_name``, ``symbol``,
            ``forecast_7d`` / ``forecast_30d`` (``direction``, ``probability``,
            ``confidence``) and ``top_signals``; every field is optional.

    Returns:
        Dict with keys ``outlook``, ``key_drivers``, ``risk``, ``trade_idea``.
    """
    name = prediction.get("commodity_name", prediction.get("symbol", "Commodity"))
    symbol = prediction.get("symbol", "")
    # "or {}" / "or []" also covers keys that are present but set to None.
    fc7 = prediction.get("forecast_7d") or {}
    fc30 = prediction.get("forecast_30d") or {}
    signals = prediction.get("top_signals") or []
    direction = fc7.get("direction", "STABLE")
    conf = fc7.get("confidence", "LOW")
    dir30 = fc30.get("direction", "STABLE")
    accuracy = BACKTEST_ACCURACY.get(symbol, 45.0)

    # A None probability would make the "%" format spec raise; 0 is a valid
    # probability, so test for None explicitly instead of using "or".
    prob = fc7.get("probability")
    if prob is None:
        prob = 0.5

    def _describe(sig: dict, default_label: str, fallback: str) -> str:
        """Format one signal as 'label (value)', tolerating a None value."""
        if not sig:
            return fallback
        value = sig.get("value")
        if value is None:
            value = 0
        return f"{sig.get('label', default_label)} ({value:.3g})"

    s1 = _describe(signals[0] if signals else {}, "momentum", "price momentum")
    s2 = _describe(signals[1] if len(signals) > 1 else {}, "sentiment", "news sentiment")
    risk = _pick_risk_factor(prediction)

    dir_word = {"UP": "rise", "DOWN": "fall", "STABLE": "remain range-bound"}.get(direction, "remain range-bound")
    # Direction glyphs (previously stored as mis-decoded UTF-8 byte sequences).
    dir_emoji = {"UP": "▲", "DOWN": "▼", "STABLE": "◆"}.get(direction, "◆")

    key_drivers = f"Primary signals driving this call: {s1} and {s2}."
    cot = _load_cot_context(symbol)
    if cot:
        comm = cot.get("commercial_net_pct", 0) or 0
        mm = cot.get("mm_net_pct", 0) or 0
        # NOTE(review): the "%" spec multiplies by 100, so this assumes the
        # *_net_pct columns hold fractions (0.12 == 12%) — confirm against the
        # COT loader.
        key_drivers += (
            f" Institutional positioning: commercial hedgers {comm:+.1%}, "
            f"managed money {mm:+.1%}."
        )

    trade_bias = {
        "UP":     f"Bias long {name}. Monitor {s1} for continuation.",
        "DOWN":   f"Bias short {name}. Watch for {risk} as an exit trigger.",
        "STABLE": "Range-bound. Wait for a directional break before committing.",
    }.get(direction, "No clear trade bias.")

    return {
        "outlook":     f"{dir_emoji} {name} is forecast to {dir_word} over the next 7 days — {prob:.0%} model probability, {conf} confidence. 30-day view: {dir30}. Model historical accuracy: {accuracy:.1f}% (vs 33.3% random).",
        "key_drivers": key_drivers,
        "risk":        f"Main downside risk: {risk} could invalidate this forecast.",
        "trade_idea":  trade_bias,
    }


def _build_groq_prompt(prediction: dict) -> str:
    """Assemble the Groq user prompt from the prediction plus DB context rows."""

    def num(mapping: dict, key: str):
        # DB columns and forecast fields may be NULL/None, and numeric format
        # specs raise TypeError on None; substitute 0 so the prompt still builds.
        value = mapping.get(key)
        return 0 if value is None else value

    name = prediction.get("commodity_name", prediction.get("symbol"))
    symbol = prediction.get("symbol", "")
    price = num(prediction, "current_price")
    fc7 = prediction.get("forecast_7d") or {}
    fc30 = prediction.get("forecast_30d") or {}
    signals = prediction.get("top_signals") or []
    accuracy = BACKTEST_ACCURACY.get(symbol, 45.0)
    hc_acc = HIGH_CONF_ACCURACY.get(symbol)
    conf = fc7.get("confidence", "LOW")

    macro = _load_macro_context()
    cot = _load_cot_context(symbol)
    eia = _load_eia_context(symbol)

    macro_block = ""
    if macro:
        macro_block = (
            f"Macro context: DXY={num(macro, 'dxy'):.1f}, VIX={num(macro, 'vix'):.1f}, "
            f"10Y yield={num(macro, 't10y'):.2f}%, Fed Funds={num(macro, 'fedfunds'):.2f}%"
        )

    cot_block = ""
    if cot:
        cot_block = (
            f"COT positioning: commercial hedgers {num(cot, 'commercial_net_pct'):+.1%} net long "
            f"(week chg: {num(cot, 'commercial_chg_1w'):+,.0f}), "
            f"managed money {num(cot, 'mm_net_pct'):+.1%} net long"
        )

    eia_block = ""
    if eia:
        label = "Crude stocks" if symbol == "CL=F" else "Nat gas storage"
        eia_block = (
            f"{label}: {num(eia, 'value'):,.0f} (week chg: {num(eia, 'chg_1w'):+,.0f}, "
            f"vs 5yr avg: {num(eia, 'vs_5yr_avg'):+.1f}%)"
        )

    # The high-confidence accuracy is only quoted when it applies to this call.
    hc_line = f" When confidence is HIGH, this model is right {hc_acc:.0f}% of the time." if hc_acc and conf == "HIGH" else ""

    return f"""You are a professional commodity market analyst writing a structured report for a trading terminal.

COMMODITY: {name} ({symbol})
CURRENT PRICE: ${price:,.2f}
7-DAY FORECAST: {fc7.get('direction')} | Probability: {num(fc7, 'probability'):.0%} | Confidence: {conf}
30-DAY FORECAST: {fc30.get('direction')} | Probability: {num(fc30, 'probability'):.0%} | Confidence: {fc30.get('confidence','LOW')}
MODEL ACCURACY: {accuracy:.1f}% historical (random = 33.3%).{hc_line}

TOP SIGNALS (SHAP-ranked):
{_format_signals(signals)}

{macro_block}
{cot_block}
{eia_block}

Write EXACTLY this JSON structure — no extra keys, no markdown fences:
{{
  "outlook": "2 sentences. State the directional forecast, probability, confidence tier, and 30-day view. Mention the model accuracy context.",
  "key_drivers": "2 sentences. Name the top 2-3 signals with their actual values. Include COT positioning or EIA inventory if relevant.",
  "risk": "1 sentence. The single most important factor that could invalidate this forecast.",
  "trade_idea": "1-2 sentences. Actionable bias — long/short/wait, entry trigger, what to watch."
}}

Rules:
- Use numbers and specific values everywhere possible
- No filler phrases like "based on the analysis" or "it is worth noting"
- Write like a Bloomberg terminal analyst, not a chatbot
- Total word count: 80-120 words across all 4 fields
"""


def _parse_groq_reply(raw: str) -> dict:
    """Extract the four report sections from the model's reply text.

    Raises:
        ValueError: If the reply is not a JSON object or a section is missing
            or empty (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    text = raw.strip()
    # Strip markdown fences if the model adds them despite the instructions.
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    parsed = json.loads(text.strip())
    if not isinstance(parsed, dict):
        raise ValueError("reply is not a JSON object")

    sections = ("outlook", "key_drivers", "risk", "trade_idea")
    for key in sections:
        if not parsed.get(key):
            raise ValueError(f"Missing key: {key}")
    # Return only the documented sections so stray keys never reach callers.
    return {key: parsed[key] for key in sections}


def _groq_report(prediction: dict) -> dict:
    """Call Groq API to generate a structured 4-section analyst report.

    Falls back to ``_template_report`` when no client is available or when
    anything goes wrong while building the prompt, calling the API, or
    parsing the reply.

    Args:
        prediction: Forecast dict for one commodity.

    Returns:
        Dict with keys ``outlook``, ``key_drivers``, ``risk``, ``trade_idea``.
    """
    client = _get_groq_client()
    if client is None:
        return _template_report(prediction)

    try:
        # Prompt construction sits inside the try so that malformed input
        # degrades to the template instead of raising to the caller.
        prompt = _build_groq_prompt(prediction)
        response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
            temperature=0.2,
        )
        return _parse_groq_reply(response.choices[0].message.content)
    except Exception as exc:
        log.warning("Groq report failed (%s) — using template", exc)
        return _template_report(prediction)


# ── public API ─────────────────────────────────────────────────────────────────


def generate_report(prediction: dict) -> dict:
    """
    Produce the four-section analyst report for one commodity prediction.

    A prediction carrying an ``error`` key yields a fixed placeholder report
    that quotes the error; any other prediction is handed to the Groq-backed
    generator.

    Returns:
        Dict with keys: outlook, key_drivers, risk, trade_idea
    """
    if "error" not in prediction:
        return _groq_report(prediction)

    label = prediction.get("symbol", "Commodity")
    reason = prediction["error"]
    placeholder = {}
    placeholder["outlook"] = f"{label}: forecast unavailable ({reason})."
    placeholder["key_drivers"] = "Run the daily pipeline to generate features."
    placeholder["risk"] = "No data."
    placeholder["trade_idea"] = "No actionable signal."
    return placeholder


def generate_all_reports(as_of_date: str = None) -> dict[str, dict]:
    """Generate (or load from cache) the reports for every predicted symbol.

    Args:
        as_of_date: Date string used both for ``predict_all`` and for the
            cache file name; defaults to today's ISO date for the file name.

    Returns:
        Mapping of symbol to its four-section report dict. When a usable
        ``report_<date>.json`` already exists it is returned as-is; otherwise
        reports are generated and written to that file.
    """
    today = as_of_date or date.today().isoformat()
    cache_path = REPORTS_DIR / f"report_{today}.json"

    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # A truncated or corrupt cache must not block regeneration.
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            log.warning("Ignoring unreadable report cache %s (%s)", cache_path, exc)
            data = None
        if isinstance(data, dict):
            # If cached as old string format, regenerate
            if data and isinstance(next(iter(data.values())), str):
                cache_path.unlink(missing_ok=True)
            else:
                return data

    forecasts = predict_all(as_of_date)
    reports: dict[str, dict] = {}
    for symbol, fc in forecasts.items():
        reports[symbol] = generate_report(fc)
        log.info("%s: report generated", SYMBOL_NAMES.get(symbol, symbol))

    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(reports, f, indent=2)
    return reports


def load_latest_reports() -> dict[str, dict]:
    """Return the most recently generated reports, or empty dict.

    The newest file is the last ``report_*.json`` in ``REPORTS_DIR`` by
    file-name sort order. An empty dict is returned when no file exists, when
    that file cannot be read or parsed, or when it holds the legacy format in
    which each report was a plain string.
    """
    report_files = sorted(REPORTS_DIR.glob("report_*.json"), reverse=True)
    if not report_files:
        return {}

    newest = report_files[0]
    try:
        with open(newest, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # Honour the "or empty dict" contract for corrupt cache files too.
        # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
        log.warning("Could not read report cache %s (%s)", newest, exc)
        return {}

    if not isinstance(data, dict):
        return {}
    # Legacy string-format caches are not converted, just treated as absent.
    if data and isinstance(next(iter(data.values())), str):
        return {}
    return data


if __name__ == "__main__":
    import argparse

    # Command-line entry point: print reports for every symbol (--all) or for
    # one symbol (--symbol), optionally pinned to --date; otherwise show help.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="CommodiSense explainer")
    for flag, options in (
        ("--symbol", {"default": None}),
        ("--all", {"action": "store_true"}),
        ("--date", {"default": None}),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    if args.all:
        # --all takes precedence over --symbol when both are given.
        reports = generate_all_reports(args.date)
        for sym in reports:
            print(f"\n[{sym}]")
            for section, text in reports[sym].items():
                print(f"  {section.upper()}: {text}")
    elif args.symbol:
        r = generate_report(predict(args.symbol, args.date))
        for section, text in r.items():
            print(f"{section.upper()}: {text}")
    else:
        parser.print_help()