"""
Oregon Timber Market Timing Model — Final Implementation
=========================================================
Predicts Douglas fir stumpage prices and recommends optimal marketing timing
for a 20-acre Oregon timber tract using Ridge regression with walk-forward validation
and Monte Carlo optimal stopping.

Based on: Faustmann rotation model (1849), Clarke & Reed real-options extension (1989),
and Pacific NW timber economics literature (USDA PNW-GTR-423, PNW-RP-436).
"""
import pandas as pd
import numpy as np
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error
import json, warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
OUT_DIR = '/app'

# ── Load ────────────────────────────────────────────────────────────────────
df = pd.read_csv(f'{OUT_DIR}/oregon_timber_data.csv', parse_dates=['date'])
df = df.set_index('date')
TARGET = 'stumpage_price_mbf'
print(f"Loaded: {len(df)} quarters, {df.index[0].date()} → {df.index[-1].date()}")
print(f"Price: ${df[TARGET].min():.0f}–${df[TARGET].max():.0f}/MBF, μ=${df[TARGET].mean():.0f}\n")

# ── Features ─────────────────────────────────────────────────────────────────
def build_features(df_):
    p = df_[TARGET]
    out = pd.DataFrame({
        'price_lag1':     p.shift(1),
        'price_lag2':     p.shift(2),
        'price_lag4':     p.shift(4),
        'price_roll4':    p.rolling(4).mean().shift(1),
        'price_chg_yoy':  p.pct_change(4).shift(1),
        'lumber_ppi':     df_['lumber_ppi'],
        'lumber_ppi_lag1': df_['lumber_ppi'].shift(1),
        'lumber_chg_yoy':  df_['lumber_ppi'].pct_change(4),
        'lumber_futures':  df_['lumber_futures_mbf'],
        'housing_lag2':   df_['housing_starts'].shift(2),
        'housing_lag4':   df_['housing_starts'].shift(4),
        'housing_chg':    df_['housing_starts'].pct_change(4).shift(2),
        'mortgage_lag1':  df_['mortgage_30yr'].shift(1),
        'mortgage_chg':   (df_['mortgage_30yr'] - df_['mortgage_30yr'].shift(4)).shift(1),
        'cad_lag1':       df_['cad_usd'].shift(1),
        'unemployment':   df_['unemployment_rate'].shift(1),
        'sentiment':      df_['consumer_sentiment'].shift(1),
    }, index=df_.index)
    q = df_.index.quarter
    out['q1'] = (q == 1).astype(float)
    out['q2'] = (q == 2).astype(float)
    out['q3'] = (q == 3).astype(float)
    out['year_norm'] = (df_.index.year - 1993) / 31.0
    out[TARGET] = df_[TARGET]
    return out.dropna()

feat_df = build_features(df)
X_cols = [c for c in feat_df.columns if c != TARGET]
X, y = feat_df[X_cols].values, feat_df[TARGET].values
n = len(feat_df)
print(f"Features: {len(X_cols)} | Training obs: {n}")

# ── Walk-forward CV ─────────────────────────────────────────────────────────
print("\n=== Walk-Forward Validation (2015–2024) ===\n")
windows = [(80,88), (88,96), (96,104), (104,112), (112,n)]
actual, pred = [], []

for tr_end, te_end in windows:
    m = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
    m.fit(X[:tr_end], y[:tr_end])
    yp = m.predict(X[tr_end:te_end])
    r2v = r2_score(y[tr_end:te_end], yp)
    actual.extend(y[tr_end:te_end]); pred.extend(yp)
    print(f"  Train={tr_end}  Test={feat_df.index[tr_end].strftime('%Y')}→{feat_df.index[min(te_end,n)-1].strftime('%Y')} "
          f"({te_end-tr_end}q) | R²={r2v:+.3f}  MAE=${mean_absolute_error(y[tr_end:te_end], yp):.0f}/MBF")

ovr_r2 = r2_score(actual, pred)
ovr_mae = mean_absolute_error(actual, pred)
print(f"\n  Overall R²={ovr_r2:.3f}  MAE=${ovr_mae:.0f}/MBF  (explains {ovr_r2*100:.0f}% of out-of-sample variance)")

# ── Final model ─────────────────────────────────────────────────────────────
model = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
model.fit(X, y)
print(f"\nFinal model (full dataset) R² = {r2_score(y, model.predict(X)):.3f}")

# ── 8-Quarter Forecast ──────────────────────────────────────────────────────
print("\n=== 8-Quarter Forecast ===\n")

def project_series(s, n_ahead, reversion=0.1):
    v = s.iloc[-1]
    m = s.tail(20).mean()
    std = s.tail(20).std()
    out = []
    for _ in range(n_ahead):
        v = v * (1 - reversion) + m * reversion + np.random.normal(0, std * 0.05)
        out.append(v)
    return out

n_ahead = 8
fc_dates = pd.date_range(df.index[-1] + pd.DateOffset(months=3), periods=n_ahead, freq='QS')
n_boot = 500
boot_preds = np.zeros((n_boot, n_ahead))

for b in range(n_boot):
    lum = project_series(df['lumber_ppi'], n_ahead, reversion=0.08)
    luf = project_series(df['lumber_futures_mbf'], n_ahead, reversion=0.08)
    hou = project_series(df['housing_starts'], n_ahead, reversion=0.06)
    mor = project_series(df['mortgage_30yr'], n_ahead, reversion=0.05)
    cad = project_series(df['cad_usd'], n_ahead, reversion=0.05)
    une = project_series(df['unemployment_rate'], n_ahead, reversion=0.05)
    sen = project_series(df['consumer_sentiment'], n_ahead, reversion=0.05)
    
    hp = list(df[TARGET].values)  # historical prices, grows as we forecast
    
    for i in range(n_ahead):
        p1 = hp[-1]; p2 = hp[-2]; p4 = hp[-4] if len(hp)>=4 else hp[-1]
        pr4 = np.mean(hp[-4:])
        pcy = (hp[-1]/hp[-5]-1) if len(hp)>=5 else 0
        lpp = lum[i]; lpl = lum[i-1] if i>0 else df['lumber_ppi'].iloc[-1]
        lcy = (lum[i]/(lum[i-4] if i>=4 else df['lumber_ppi'].iloc[-(4-i)])-1)
        lfu = luf[i]
        hl2 = hou[i-2] if i>=2 else df['housing_starts'].iloc[-(2-i)]
        hl4 = hou[i-4] if i>=4 else df['housing_starts'].iloc[-(4-i)]
        hcg = (hl2/(hou[max(0,i-6)] if i>=6 else df['housing_starts'].iloc[-(6-i)])-1)
        ml1 = mor[i-1] if i>0 else df['mortgage_30yr'].iloc[-1]
        mcg = mor[i] - (mor[i-4] if i>=4 else df['mortgage_30yr'].iloc[-(4-i)])
        cl1 = cad[i-1] if i>0 else df['cad_usd'].iloc[-1]
        ul1 = une[i-1] if i>0 else df['unemployment_rate'].iloc[-1]
        sl1 = sen[i-1] if i>0 else df['consumer_sentiment'].iloc[-1]
        
        qq = fc_dates[i].quarter; yr = (fc_dates[i].year - 1993) / 31.0
        
        row = np.array([[
            p1, p2, p4, pr4, pcy,
            lpp, lpl, lcy, lfu,
            hl2, hl4, hcg,
            ml1, mcg, cl1, ul1, sl1,
            float(qq==1), float(qq==2), float(qq==3), yr
        ]])
        pred_val = model.predict(row)[0]
        boot_preds[b, i] = pred_val
        hp.append(pred_val)

fc_point = np.median(boot_preds, axis=0)
fc_ci10 = np.percentile(boot_preds, 10, axis=0)
fc_ci90 = np.percentile(boot_preds, 90, axis=0)
fc_ci25 = np.percentile(boot_preds, 25, axis=0)
fc_ci75 = np.percentile(boot_preds, 75, axis=0)

for i, (d, p, lo, hi) in enumerate(zip(fc_dates, fc_point, fc_ci10, fc_ci90)):
    print(f"  Q{d.quarter} {d.year}:  ${p:.0f}/MBF  [${lo:.0f} – ${hi:.0f}]")

# ── Harvest Timing Decision ─────────────────────────────────────────────────
print("\n=== Harvest Timing: 20-Acre Oregon Tract ===\n")

ACRES = 20; MBF_AC = 35; TOTAL_MBF = ACRES * MBF_AC
HOLD_COST_Q = (400 * ACRES) / 4

cp = df[TARGET].iloc[-1]
pct = (df[TARGET] < cp).mean()
hi3 = df[TARGET].tail(12).max(); lo3 = df[TARGET].tail(12).min()

print(f"  Tract:              {ACRES} acres × {MBF_AC} MBF/ac = {TOTAL_MBF:,} MBF")
print(f"  Current price:      ${cp:,.0f}/MBF")
print(f"  Gross value today:  ${cp * TOTAL_MBF:,.0f}")
print(f"  Percentile rank:    {pct*100:.0f}th  |  3-yr range: ${lo3:,.0f}–${hi3:,.0f}/MBF")

# Monte Carlo optimal stopping
n_sims = 50_000
lr = np.diff(np.log(df[TARGET].tail(40).values))
mu_q, sig_q = np.mean(lr), np.std(lr)

paths = cp * np.cumprod(np.exp(mu_q + sig_q * np.random.randn(n_sims, 8)), axis=1)
paths = np.hstack([np.full((n_sims, 1), cp), paths])
net_rev = paths * TOTAL_MBF - (np.arange(9) * HOLD_COST_Q)[np.newaxis, :]
opt_q = np.argmax(net_rev, axis=1)
timing_pct = np.bincount(opt_q, minlength=9) / n_sims * 100
prob_wait = (np.max(net_rev[:, 1:], axis=1) > net_rev[:, 0]).mean()

qlabels = {0:'Now (Q4 2024)', 1:'Q1 2025', 2:'Q2 2025', 3:'Q3 2025',
           4:'Q4 2025', 5:'Q1 2026', 6:'Q2 2026', 7:'Q3 2026', 8:'Q4 2026'}

print(f"\n  Monte Carlo ({n_sims:,} paths, μ={mu_q:.3f}/q, σ={sig_q:.3f}/q):")
print(f"    Prob. waiting improves:           {prob_wait*100:.0f}%")
print(f"    Expected max net revenue:         ${np.mean(np.max(net_rev, axis=1)):,.0f}")
print(f"    Optimal timing distribution:")
for qi, tp in enumerate(timing_pct):
    print(f"      {qlabels[qi]:18s} {tp:5.1f}% {'█'*int(tp/1.5)}")

# Scoring
score = 0; reasons = []
if   pct >= 0.70: score += 2; reasons.append(f"Above-average price ({pct*100:.0f}th percentile)")
elif pct >= 0.50: score += 1; reasons.append(f"Near-average price ({pct*100:.0f}th percentile)")
else:              score -= 1; reasons.append(f"Below-average price ({pct*100:.0f}th percentile)")

if fc_point[1] < cp * 0.97:   score += 1; reasons.append(f"Model forecasts softening to ~${fc_point[1]:.0f}/MBF (Q2 2025)")
elif fc_point[3] > cp * 1.08: score -= 1; reasons.append(f"Model forecasts increase to ~${fc_point[3]:.0f}/MBF in 2025")

if   prob_wait < 0.50:  score += 2; reasons.append(f"Only {prob_wait*100:.0f}% chance waiting improves net revenue")
elif prob_wait > 0.65:  score -= 1; reasons.append(f"{prob_wait*100:.0f}% of simulations show better outcome if you wait")

if cp >= hi3 * 0.80:     score += 1; reasons.append(f"Near 3-yr high ({cp/hi3*100:.0f}% of ${hi3:,.0f} peak)")
elif cp <= lo3 * 1.15:   score -= 2; reasons.append(f"Near 3-yr low — avoid selling now")

sq = df.index[-1].quarter
if sq == 1:   score -= 1; reasons.append("Q1: wet season, poor logging conditions in Pacific NW")
elif sq == 3: score += 1; reasons.append("Q3: peak logging season, favorable market timing")

if   score >= 4: rec, urg, clr = "SELL NOW",     "Strong", "green"
elif score >= 2: rec, urg, clr = "MARKET SOON",  "Moderate", "yellowgreen"
elif score >= 0: rec, urg, clr = "MONITOR — 6mo", "Neutral", "orange"
else:            rec, urg, clr = "WAIT",         "Wait for better market", "red"

print(f"\n{'='*60}")
print(f"  RECOMMENDATION: {rec}  (score {score}/7, {urg})")
print(f"{'='*60}")
for r in reasons: print(f"    • {r}")

# ── Charts ──────────────────────────────────────────────────────────────────
print("\n=== Charts ===")
fig = plt.figure(figsize=(18, 22))
gs = gridspec.GridSpec(4, 2, hspace=0.45, wspace=0.35)
BR, NV, GR, RD = '#6B4226', '#1B3A6B', '#2D8653', '#C0392B'

# [1] History + fit + forecast
ax1 = fig.add_subplot(gs[0, :])
fit = model.predict(X)
ax1.fill_between(feat_df.index, 0, df.loc[feat_df.index, TARGET], alpha=0.1, color=BR)
ax1.plot(feat_df.index, df.loc[feat_df.index, TARGET], color=BR, lw=1.8, label='Historical Stumpage Price', zorder=3)
ax1.plot(feat_df.index, fit, '--', color=NV, lw=1.2, alpha=0.7, label=f'Model Fit (R²={r2_score(y, fit):.2f})', zorder=2)
ax1.fill_between(fc_dates, fc_ci10, fc_ci90, alpha=0.18, color=NV, label='10–90% Forecast')
ax1.fill_between(fc_dates, fc_ci25, fc_ci75, alpha=0.32, color=NV, label='25–75% Forecast')
ax1.plot(fc_dates, fc_point, 'o-', color=NV, lw=2.2, ms=6, label='Forecast (median)', zorder=4)
ax1.axvline(df.index[-1], color='gray', ls=':', lw=1.5, label='Today')
ax1.set_title('Oregon Douglas Fir Stumpage Price: Historical + 8-Quarter Forecast', fontsize=13, fontweight='bold')
ax1.set_ylabel('Price ($/MBF)', fontsize=11)
ax1.legend(fontsize=9, loc='upper left', ncol=3); ax1.grid(True, alpha=0.3)
ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

# [2] Walk-forward CV
ax2 = fig.add_subplot(gs[1, 0])
wf_labels = [f"{feat_df.index[w[0]].strftime('%Y')}" for w in windows]
wf_r2 = []
for tr_e, te_e in windows:
    mp = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
    mp.fit(X[:tr_e], y[:tr_e])
    wf_r2.append(r2_score(y[tr_e:te_e], mp.predict(X[tr_e:te_e])))
clrs = [GR if r > 0 else RD for r in wf_r2]
bars2 = ax2.bar(range(len(windows)), wf_r2, color=clrs, edgecolor='black', lw=0.5)
for b, r in zip(bars2, wf_r2):
    ax2.text(b.get_x()+b.get_width()/2, r+0.03 if r>=0 else r-0.08, f'{r:.2f}', ha='center', fontsize=9, fontweight='bold')
ax2.axhline(ovr_r2, color=NV, ls='--', lw=1.5, label=f'Overall R²={ovr_r2:.2f}')
ax2.axhline(0, color='black', lw=0.8)
ax2.set_xticks(range(len(windows))); ax2.set_xticklabels(wf_labels, fontsize=9)
ax2.set_title('Walk-Forward Validation R² (2015–2024)', fontsize=12)
ax2.set_ylabel('R²'); ax2.legend(fontsize=9); ax2.grid(True, alpha=0.3, axis='y')

# [3] Feature importance
ax3 = fig.add_subplot(gs[1, 1])
coef = np.abs(model.named_steps['ridge'].coef_)
fi = pd.DataFrame({'feature': X_cols, 'importance': coef}).sort_values('importance', ascending=True).tail(12)
ax3.barh(range(len(fi)), fi['importance'], color=plt.cm.RdYlGn(fi['importance']/fi['importance'].max()), edgecolor='black', lw=0.3)
ax3.set_yticks(range(len(fi))); ax3.set_yticklabels([f.replace('_',' ') for f in fi['feature']], fontsize=9)
ax3.set_title('Feature Importance (|coef|)\nTop 12 Price Drivers', fontsize=12); ax3.grid(True, alpha=0.3, axis='x')

# [4] Price distribution
ax4 = fig.add_subplot(gs[2, 0])
ax4.hist(df[TARGET], bins=25, color=BR, alpha=0.6, edgecolor='black', lw=0.3, density=True)
ax4.axvline(cp, color=NV, lw=2.5, label=f'Current: ${cp:.0f}/MBF ({pct*100:.0f}th)')
ax4.axvline(np.percentile(df[TARGET], 75), color=GR, ls='--', lw=1.5, label=f'75th: ${np.percentile(df[TARGET], 75):.0f}')
ax4.axvline(np.percentile(df[TARGET], 25), color=RD, ls='--', lw=1.5, label=f'25th: ${np.percentile(df[TARGET], 25):.0f}')
ax4.set_title('Price Distribution (1993–2024)', fontsize=12); ax4.legend(fontsize=8); ax4.grid(True, alpha=0.3)
ax4.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

# [5] MC timing
ax5 = fig.add_subplot(gs[2, 1])
bar_clrs5 = [GR if p==max(timing_pct) else NV for p in timing_pct]
bars5 = ax5.bar(range(9), timing_pct, color=bar_clrs5, edgecolor='black', lw=0.5)
for b, tp in zip(bars5, timing_pct):
    ax5.text(b.get_x()+b.get_width()/2, b.get_height()+0.3, f'{tp:.0f}%', ha='center', fontsize=8)
ax5.set_xticks(range(9)); ax5.set_xticklabels([qlabels[i].replace(' ','\n') for i in range(9)], fontsize=7)
ax5.set_title(f'Optimal Sell Timing ({n_sims:,} MC paths)', fontsize=12); ax5.grid(True, alpha=0.3, axis='y')

# [6] Revenue sensitivity
ax6 = fig.add_subplot(gs[3, :])
pr = np.linspace(100, 900, 300)
ax6.plot(pr, pr*TOTAL_MBF/1000, color=GR, lw=2, label='Sell now')
ax6.plot(pr, (pr*TOTAL_MBF - HOLD_COST_Q*4)/1000, '--', color='orange', lw=1.8, label='After 1 yr hold')
ax6.plot(pr, (pr*TOTAL_MBF - HOLD_COST_Q*8)/1000, '--', color=RD, lw=1.8, label='After 2 yr hold')
ax6.fill_between(pr, (pr*TOTAL_MBF - HOLD_COST_Q*4)/1000, pr*TOTAL_MBF/1000, alpha=0.12, color=RD)
ax6.axvline(cp, color=NV, lw=2.5, ls=':', label=f'Current: ${cp:.0f}/MBF → ${cp*TOTAL_MBF/1000:.0f}K')
ax6.axvspan(np.min(fc_ci10), np.max(fc_ci90), alpha=0.07, color=NV, label=f'2025–2026 forecast range')
ax6.text(0.74, 0.92, f"RECOMMENDATION\n{rec}\n{urg} (score {score}/7)", transform=ax6.transAxes,
         fontsize=12, fontweight='bold', ha='center', va='top',
         bbox=dict(boxstyle='round,pad=0.6', facecolor=clr, alpha=0.25, edgecolor=clr, lw=2))
ax6.set_title(f'Revenue Sensitivity ({ACRES} ac × {MBF_AC} MBF/ac = {TOTAL_MBF:,} MBF)', fontsize=12)
ax6.set_xlabel('$/MBF'); ax6.legend(fontsize=9, loc='upper left', ncol=2); ax6.grid(True, alpha=0.3)
ax6.xaxis.set_major_formatter(plt.FuncFormatter(lambda x,p: f'${x:,.0f}'))
ax6.yaxis.set_major_formatter(plt.FuncFormatter(lambda x,p: f'${x:,.0f}K'))
ax6.set_xlim(100, 900)

fig.suptitle('Oregon Timber Market Timing Analysis\n20-Acre Douglas Fir Tract | Ridge Regression + Monte Carlo',
             fontsize=15, fontweight='bold', y=0.995)
plt.savefig(f'{OUT_DIR}/timber_analysis.png', dpi=150, bbox_inches='tight')
plt.close()
print("  ✓ timber_analysis.png")

# ── Save outputs ────────────────────────────────────────────────────────────
fc = pd.DataFrame({
    'quarter': [f"Q{d.quarter} {d.year}" for d in fc_dates],
    'date': fc_dates,
    'price_median': fc_point.round(0), 'price_ci10': fc_ci10.round(0),
    'price_ci25': fc_ci25.round(0), 'price_ci75': fc_ci75.round(0), 'price_ci90': fc_ci90.round(0),
    'gross_value_median': (fc_point * TOTAL_MBF).round(0),
    'gross_value_ci10': (fc_ci10 * TOTAL_MBF).round(0),
    'gross_value_ci90': (fc_ci90 * TOTAL_MBF).round(0),
})
fc.to_csv(f'{OUT_DIR}/forecast_results.csv', index=False)

wf_r2_all = []
for tr_e, te_e in windows:
    m2 = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
    m2.fit(X[:tr_e], y[:tr_e])
    wf_r2_all.append({'period': f"{feat_df.index[tr_e].strftime('%Y')}-{feat_df.index[min(te_e,n)-1].strftime('%Y')}",
                      'R2': r2_score(y[tr_e:te_e], m2.predict(X[tr_e:te_e]))})

report = f"""OREGON TIMBER MARKET TIMING REPORT
=================================
Date: {pd.Timestamp.now().strftime('%B %d, %Y')}
Tract: {ACRES} acres | {TOTAL_MBF:,} MBF

MODEL PERFORMANCE (Walk-Forward CV, 2015–2024)
Overall R²: {ovr_r2:.3f} | MAE: ${ovr_mae:.0f}/MBF
Per-window: {' | '.join(f"{w['period']} R²={w['R2']:+.2f}" for w in wf_r2_all)}

MARKET: ${cp:,.0f}/MBF (pct {pct*100:.0f}) | 3yr hi ${hi3:,.0f}, lo ${lo3:,.0f}

8Q FORECAST:
{fc[['quarter','price_median','price_ci10','price_ci90']].to_string(index=False)}

RECOMMENDATION: {rec} (score {score}/7, {urg})
{chr(10).join('  • '+r for r in reasons)}

MC: μ={mu_q:+.3f}/q σ={sig_q:.3f}/q | P(wait better)={prob_wait*100:.0f}%

KEY DRIVERS:
{fi.sort_values('importance', ascending=False).head(8)[['feature','importance']].to_string(index=False)}

PRODUCTION NOTES:
  - Replace synthetic data with ODF quarterly records (oregon.gov/ODF)
  - Add FRED API: housing starts (HOUST1F), lumber PPI (WPU0811), mortgage (MORTGAGE30US)
  - Subscribe to Random Lengths weekly composite (randomlengths.com)
  - Cruise survey your stand for actual MBF/acre (±30% from default)
  - Re-run quarterly as new data arrives
"""
with open(f'{OUT_DIR}/timber_report.txt', 'w') as f: f.write(report)

print(f"\n{'='*60}")
print(f"  RECOMMENDATION: {rec}  (score {score}/7, {urg})")
print(f"{'='*60}")
print(f"\nOutputs: timber_analysis.png, forecast_results.csv, timber_report.txt")