Upload timber_final.py with huggingface_hub

112c740 verified 5 days ago

19.4 kB

	"""
	Oregon Timber Market Timing Model — Final Implementation
	=========================================================
	Predicts Douglas fir stumpage prices and recommends optimal marketing timing
	for a 20-acre Oregon timber tract using Ridge regression with walk-forward validation
	and Monte Carlo optimal stopping.

	Based on: Faustmann rotation model (1849), Clarke & Reed real-options extension (1989),
	and Pacific NW timber economics literature (USDA PNW-GTR-423, PNW-RP-436).
	"""
	import pandas as pd
	import numpy as np
	import matplotlib; matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import matplotlib.gridspec as gridspec
	from sklearn.preprocessing import StandardScaler
	from sklearn.pipeline import Pipeline
	from sklearn.linear_model import Ridge
	from sklearn.metrics import r2_score, mean_absolute_error
	import json, warnings
	warnings.filterwarnings('ignore')

	# Seed NumPy's global RNG so every later np.random draw repeats run to run.
	np.random.seed(42)
	OUT_DIR = '/app'
	TARGET = 'stumpage_price_mbf'

	# ── Load ────────────────────────────────────────────────────────────────────
	# Read the history and index it by the parsed `date` column.
	df = pd.read_csv(f'{OUT_DIR}/oregon_timber_data.csv', parse_dates=['date']).set_index('date')
	_first_obs, _last_obs = df.index[0], df.index[-1]
	_price_hist = df[TARGET]
	print(f"Loaded: {len(df)} quarters, {_first_obs.date()} → {_last_obs.date()}")
	print(f"Price: ${_price_hist.min():.0f}–${_price_hist.max():.0f}/MBF, μ=${_price_hist.mean():.0f}\n")

	# ── Features ─────────────────────────────────────────────────────────────────
	def build_features(df_):
	    """Build the model's design matrix from the raw quarterly frame.

	    Columns are lagged/rolled transforms of the target price and of the
	    exogenous drivers, three quarter dummies (Q4 is the omitted baseline),
	    and a linear trend ``(year - 1993) / 31``. The target itself is appended
	    as the last column.

	    Args:
	        df_: DataFrame with a datetime index and the columns read below
	            (``TARGET``, ``lumber_ppi``, ``lumber_futures_mbf``,
	            ``housing_starts``, ``mortgage_30yr``, ``cad_usd``,
	            ``unemployment_rate``, ``consumer_sentiment``).

	    Returns:
	        DataFrame on the same index with every row containing a NaN removed
	        (the leading rows lost to lagging, plus any gaps in the inputs).
	    """
	    price = df_[TARGET]
	    lumber = df_['lumber_ppi']
	    housing = df_['housing_starts']
	    mortgage = df_['mortgage_30yr']

	    # NOTE: column order matters — the forecast code builds rows positionally.
	    columns = {
	        'price_lag1': price.shift(1),
	        'price_lag2': price.shift(2),
	        'price_lag4': price.shift(4),
	        'price_roll4': price.rolling(4).mean().shift(1),
	        'price_chg_yoy': price.pct_change(4).shift(1),
	        'lumber_ppi': lumber,
	        'lumber_ppi_lag1': lumber.shift(1),
	        'lumber_chg_yoy': lumber.pct_change(4),
	        'lumber_futures': df_['lumber_futures_mbf'],
	        'housing_lag2': housing.shift(2),
	        'housing_lag4': housing.shift(4),
	        'housing_chg': housing.pct_change(4).shift(2),
	        'mortgage_lag1': mortgage.shift(1),
	        # 4-quarter change in the mortgage rate, observed one quarter back.
	        'mortgage_chg': mortgage.diff(4).shift(1),
	        'cad_lag1': df_['cad_usd'].shift(1),
	        'unemployment': df_['unemployment_rate'].shift(1),
	        'sentiment': df_['consumer_sentiment'].shift(1),
	    }
	    feats = pd.DataFrame(columns, index=df_.index)

	    quarter = df_.index.quarter
	    for k in (1, 2, 3):
	        feats[f'q{k}'] = (quarter == k).astype(float)
	    feats['year_norm'] = (df_.index.year - 1993) / 31.0
	    feats[TARGET] = price
	    return feats.dropna()

	# Design matrix: every engineered column except the target is a predictor.
	feat_df = build_features(df)
	X_cols = [c for c in feat_df.columns if c != TARGET]
	X, y = feat_df[X_cols].values, feat_df[TARGET].values
	n = len(feat_df)
	# Plain '|' separator: the former '\|' was an invalid escape sequence that
	# printed a stray backslash.
	print(f"Features: {len(X_cols)} | Training obs: {n}")

	# ── Walk-forward CV ─────────────────────────────────────────────────────────
	print("\n=== Walk-Forward Validation (2015–2024) ===\n")
	# (train_end, test_end) row offsets into X / y. Each model is fitted on rows
	# [0, train_end) and scored on [train_end, test_end); the last window runs to
	# the end of the data.
	windows = [(80,88), (88,96), (96,104), (104,112), (112,n)]
	actual, pred = [], []

	for tr_end, te_end in windows:
	    # The scaler lives inside the pipeline, so it is fitted on training rows only.
	    m = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
	    m.fit(X[:tr_end], y[:tr_end])
	    y_te = y[tr_end:te_end]
	    yp = m.predict(X[tr_end:te_end])
	    r2v = r2_score(y_te, yp)
	    actual.extend(y_te); pred.extend(yp)
	    # Plain '|' separator: the former '\|' was an invalid escape sequence.
	    print(f" Train={tr_end} Test={feat_df.index[tr_end].strftime('%Y')}→{feat_df.index[min(te_end,n)-1].strftime('%Y')} "
	          f"({te_end-tr_end}q) | R²={r2v:+.3f} MAE=${mean_absolute_error(y_te, yp):.0f}/MBF")

	# Pooled out-of-sample metrics across all test windows.
	ovr_r2 = r2_score(actual, pred)
	ovr_mae = mean_absolute_error(actual, pred)
	print(f"\n Overall R²={ovr_r2:.3f} MAE=${ovr_mae:.0f}/MBF (explains {ovr_r2*100:.0f}% of out-of-sample variance)")

	# ── Final model ─────────────────────────────────────────────────────────────
	# Refit the same scaler + Ridge pipeline on every available row; the R² printed
	# here is in-sample, not a validation score.
	model = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))]).fit(X, y)
	_in_sample_r2 = r2_score(y, model.predict(X))
	print(f"\nFinal model (full dataset) R² = {_in_sample_r2:.3f}")

	# ── 8-Quarter Forecast ──────────────────────────────────────────────────────
	print("\n=== 8-Quarter Forecast ===\n")

	def project_series(s, n_ahead, reversion=0.1):
	    """Simulate a noisy mean-reverting continuation of a series.

	    Starting from the last observation, each step moves the level a fraction
	    ``reversion`` of the way toward the mean of the last 20 observations and
	    adds Gaussian noise whose scale is 5% of the standard deviation of those
	    same observations. Noise is drawn from NumPy's global RNG.

	    Args:
	        s: pandas Series to continue.
	        n_ahead: number of steps to generate.
	        reversion: weight on the recent mean at each step (0 = random walk
	            around the last value, 1 = jump straight to the mean).

	    Returns:
	        List of ``n_ahead`` projected values, oldest first.
	    """
	    recent = s.tail(20)
	    anchor = recent.mean()
	    noise_scale = recent.std() * 0.05

	    level = s.iloc[-1]
	    projected = []
	    for _ in range(n_ahead):
	        shock = np.random.normal(0, noise_scale)
	        level = level * (1 - reversion) + anchor * reversion + shock
	        projected.append(level)
	    return projected

	n_ahead = 8
	# Forecast quarters start one quarter after the last observation.
	fc_dates = pd.date_range(df.index[-1] + pd.DateOffset(months=3), periods=n_ahead, freq='QS')
	n_boot = 500
	boot_preds = np.zeros((n_boot, n_ahead))


	def _lagged(hist, proj, step, lag):
	    """Return a driver's value `lag` quarters before forecast step `step`.

	    `proj[0]` is the first forecast quarter and `hist.iloc[-1]` the last
	    observed one, so a negative offset `step - lag` indexes straight into the
	    tail of the history.
	    """
	    j = step - lag
	    return proj[j] if j >= 0 else hist.iloc[j]


	lumber_hist = df['lumber_ppi']
	futures_hist = df['lumber_futures_mbf']
	housing_hist = df['housing_starts']
	mortgage_hist = df['mortgage_30yr']
	cad_hist = df['cad_usd']
	unemp_hist = df['unemployment_rate']
	sent_hist = df['consumer_sentiment']

	for b in range(n_boot):
	    # One simulated path per exogenous driver (draw order kept fixed so results
	    # stay reproducible under the global seed).
	    lum = project_series(lumber_hist, n_ahead, reversion=0.08)
	    luf = project_series(futures_hist, n_ahead, reversion=0.08)
	    hou = project_series(housing_hist, n_ahead, reversion=0.06)
	    mor = project_series(mortgage_hist, n_ahead, reversion=0.05)
	    cad = project_series(cad_hist, n_ahead, reversion=0.05)
	    une = project_series(unemp_hist, n_ahead, reversion=0.05)
	    sen = project_series(sent_hist, n_ahead, reversion=0.05)

	    hp = list(df[TARGET].values)  # historical prices, grows as we forecast

	    for i in range(n_ahead):
	        # Price features are recursive: each prediction feeds the next step's lags.
	        p1 = hp[-1]
	        p2 = hp[-2]
	        p4 = hp[-4] if len(hp) >= 4 else hp[-1]
	        pr4 = np.mean(hp[-4:])
	        pcy = (hp[-1] / hp[-5] - 1) if len(hp) >= 5 else 0

	        lpp = lum[i]
	        lpl = _lagged(lumber_hist, lum, i, 1)
	        lcy = lum[i] / _lagged(lumber_hist, lum, i, 4) - 1
	        lfu = luf[i]
	        hl2 = _lagged(housing_hist, hou, i, 2)
	        hl4 = _lagged(housing_hist, hou, i, 4)
	        hcg = hl2 / _lagged(housing_hist, hou, i, 6) - 1
	        ml1 = _lagged(mortgage_hist, mor, i, 1)
	        # build_features defines mortgage_chg as (m - m.shift(4)).shift(1), i.e.
	        # m[t-1] - m[t-5]. The forecast previously used the unlagged m[t] - m[t-4],
	        # feeding the model a feature it was not trained on.
	        mcg = ml1 - _lagged(mortgage_hist, mor, i, 5)
	        cl1 = _lagged(cad_hist, cad, i, 1)
	        ul1 = _lagged(unemp_hist, une, i, 1)
	        sl1 = _lagged(sent_hist, sen, i, 1)

	        qq = fc_dates[i].quarter
	        yr = (fc_dates[i].year - 1993) / 31.0

	        # Positional order must match the feature columns from build_features.
	        row = np.array([[
	            p1, p2, p4, pr4, pcy,
	            lpp, lpl, lcy, lfu,
	            hl2, hl4, hcg,
	            ml1, mcg, cl1, ul1, sl1,
	            float(qq == 1), float(qq == 2), float(qq == 3), yr,
	        ]])
	        pred_val = model.predict(row)[0]
	        boot_preds[b, i] = pred_val
	        hp.append(pred_val)

	# Point forecast and percentile bands across bootstrap paths, per quarter.
	fc_point = np.median(boot_preds, axis=0)
	fc_ci10 = np.percentile(boot_preds, 10, axis=0)
	fc_ci90 = np.percentile(boot_preds, 90, axis=0)
	fc_ci25 = np.percentile(boot_preds, 25, axis=0)
	fc_ci75 = np.percentile(boot_preds, 75, axis=0)

	for d, p, lo, hi in zip(fc_dates, fc_point, fc_ci10, fc_ci90):
	    print(f" Q{d.quarter} {d.year}: ${p:.0f}/MBF [${lo:.0f} – ${hi:.0f}]")

	# ── Harvest Timing Decision ─────────────────────────────────────────────────
	print("\n=== Harvest Timing: 20-Acre Oregon Tract ===\n")

	# Tract assumptions: area, volume per acre, and total volume.
	ACRES = 20
	MBF_AC = 35
	TOTAL_MBF = ACRES * MBF_AC
	# Holding cost per quarter — presumably $400/acre/year split over four
	# quarters; confirm the per-acre figure against the owner's actual costs.
	HOLD_COST_Q = (400 * ACRES) / 4

	cp = df[TARGET].iloc[-1]            # latest observed price
	pct = (df[TARGET] < cp).mean()      # share of history strictly below it
	_last12 = df[TARGET].tail(12)       # last 12 observations (3 years of quarters)
	hi3 = _last12.max()
	lo3 = _last12.min()

	print(f" Tract: {ACRES} acres × {MBF_AC} MBF/ac = {TOTAL_MBF:,} MBF")
	print(f" Current price: ${cp:,.0f}/MBF")
	print(f" Gross value today: ${cp * TOTAL_MBF:,.0f}")
	# Plain '|' separator: the former '\|' was an invalid escape sequence.
	print(f" Percentile rank: {pct*100:.0f}th | 3-yr range: ${lo3:,.0f}–${hi3:,.0f}/MBF")

	# Monte Carlo optimal stopping
	n_sims = 50_000
	# Quarterly log-returns over the last 40 observations give drift and volatility.
	lr = np.diff(np.log(df[TARGET].tail(40).values))
	mu_q, sig_q = np.mean(lr), np.std(lr)

	# Eight quarters of lognormal price paths, with today's price prepended as column 0.
	paths = cp * np.cumprod(np.exp(mu_q + sig_q * np.random.randn(n_sims, 8)), axis=1)
	paths = np.hstack([np.full((n_sims, 1), cp), paths])
	# Net revenue if sold in quarter k = gross value minus k quarters of holding cost.
	net_rev = paths * TOTAL_MBF - (np.arange(9) * HOLD_COST_Q)[np.newaxis, :]
	# Best quarter per path, chosen with hindsight over the whole simulated path.
	opt_q = np.argmax(net_rev, axis=1)
	timing_pct = np.bincount(opt_q, minlength=9) / n_sims * 100
	prob_wait = (np.max(net_rev[:, 1:], axis=1) > net_rev[:, 0]).mean()

	# Quarter labels are derived from the last observation rather than hard-coded,
	# so they stay correct when the data file is refreshed.
	_now_q = df.index[-1].to_period('Q')
	qlabels = {0: f'Now (Q{_now_q.quarter} {_now_q.year})'}
	for _k in range(1, 9):
	    _future_q = _now_q + _k
	    qlabels[_k] = f'Q{_future_q.quarter} {_future_q.year}'

	print(f"\n Monte Carlo ({n_sims:,} paths, μ={mu_q:.3f}/q, σ={sig_q:.3f}/q):")
	print(f" Prob. waiting improves: {prob_wait*100:.0f}%")
	print(f" Expected max net revenue: ${np.mean(np.max(net_rev, axis=1)):,.0f}")
	print(" Optimal timing distribution:")
	for qi, tp in enumerate(timing_pct):
	    print(f" {qlabels[qi]:18s} {tp:5.1f}% {'█'*int(tp/1.5)}")

	# Scoring
	# Additive heuristic: positive points favour selling now, negative favour
	# waiting. The maximum attainable total is 7 (2 + 1 + 2 + 1 + 1).
	score = 0
	reasons = []

	# 1) Where today's price sits in its own history.
	if pct >= 0.70:
	    score += 2
	    reasons.append(f"Above-average price ({pct*100:.0f}th percentile)")
	elif pct >= 0.50:
	    score += 1
	    reasons.append(f"Near-average price ({pct*100:.0f}th percentile)")
	else:
	    score -= 1
	    reasons.append(f"Below-average price ({pct*100:.0f}th percentile)")

	# 2) Model forecast direction. Quarter/year labels come from fc_dates instead
	#    of fixed text so the reasons stay right when the data end date changes.
	_soft_q = fc_dates[1]
	_rise_q = fc_dates[3]
	if fc_point[1] < cp * 0.97:
	    score += 1
	    reasons.append(f"Model forecasts softening to ~${fc_point[1]:.0f}/MBF (Q{_soft_q.quarter} {_soft_q.year})")
	elif fc_point[3] > cp * 1.08:
	    score -= 1
	    reasons.append(f"Model forecasts increase to ~${fc_point[3]:.0f}/MBF in {_rise_q.year}")

	# 3) Monte Carlo probability that any later quarter beats selling today.
	if prob_wait < 0.50:
	    score += 2
	    reasons.append(f"Only {prob_wait*100:.0f}% chance waiting improves net revenue")
	elif prob_wait > 0.65:
	    score -= 1
	    reasons.append(f"{prob_wait*100:.0f}% of simulations show better outcome if you wait")

	# 4) Position within the 3-year price range.
	if cp >= hi3 * 0.80:
	    score += 1
	    reasons.append(f"Near 3-yr high ({cp/hi3*100:.0f}% of ${hi3:,.0f} peak)")
	elif cp <= lo3 * 1.15:
	    score -= 2
	    reasons.append("Near 3-yr low — avoid selling now")

	# 5) Seasonality of the most recent observed quarter.
	sq = df.index[-1].quarter
	if sq == 1:
	    score -= 1
	    reasons.append("Q1: wet season, poor logging conditions in Pacific NW")
	elif sq == 3:
	    score += 1
	    reasons.append("Q3: peak logging season, favorable market timing")

	# Map the total to a recommendation, an urgency label and a chart colour.
	if score >= 4:
	    rec, urg, clr = "SELL NOW", "Strong", "green"
	elif score >= 2:
	    rec, urg, clr = "MARKET SOON", "Moderate", "yellowgreen"
	elif score >= 0:
	    rec, urg, clr = "MONITOR — 6mo", "Neutral", "orange"
	else:
	    rec, urg, clr = "WAIT", "Wait for better market", "red"

	print(f"\n{'='*60}")
	print(f" RECOMMENDATION: {rec} (score {score}/7, {urg})")
	print(f"{'='*60}")
	for r in reasons:
	    print(f" • {r}")

	# ── Charts ──────────────────────────────────────────────────────────────────
	# Six-panel figure written to OUT_DIR/timber_analysis.png.
	print("\n=== Charts ===")
	fig = plt.figure(figsize=(18, 22))
	gs = gridspec.GridSpec(4, 2, hspace=0.45, wspace=0.35)
	BR, NV, GR, RD = '#6B4226', '#1B3A6B', '#2D8653', '#C0392B'

	# [1] History + fit + forecast
	ax1 = fig.add_subplot(gs[0, :])
	fit = model.predict(X)
	ax1.fill_between(feat_df.index, 0, df.loc[feat_df.index, TARGET], alpha=0.1, color=BR)
	ax1.plot(feat_df.index, df.loc[feat_df.index, TARGET], color=BR, lw=1.8, label='Historical Stumpage Price', zorder=3)
	ax1.plot(feat_df.index, fit, '--', color=NV, lw=1.2, alpha=0.7, label=f'Model Fit (R²={r2_score(y, fit):.2f})', zorder=2)
	ax1.fill_between(fc_dates, fc_ci10, fc_ci90, alpha=0.18, color=NV, label='10–90% Forecast')
	ax1.fill_between(fc_dates, fc_ci25, fc_ci75, alpha=0.32, color=NV, label='25–75% Forecast')
	ax1.plot(fc_dates, fc_point, 'o-', color=NV, lw=2.2, ms=6, label='Forecast (median)', zorder=4)
	ax1.axvline(df.index[-1], color='gray', ls=':', lw=1.5, label='Today')
	ax1.set_title('Oregon Douglas Fir Stumpage Price: Historical + 8-Quarter Forecast', fontsize=13, fontweight='bold')
	ax1.set_ylabel('Price ($/MBF)', fontsize=11)
	ax1.legend(fontsize=9, loc='upper left', ncol=3); ax1.grid(True, alpha=0.3)
	ax1.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

	# [2] Walk-forward CV (per-window R² is refitted here so this section is
	# self-contained)
	ax2 = fig.add_subplot(gs[1, 0])
	wf_labels = [f"{feat_df.index[w[0]].strftime('%Y')}" for w in windows]
	wf_r2 = []
	for tr_e, te_e in windows:
	    mp = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
	    mp.fit(X[:tr_e], y[:tr_e])
	    wf_r2.append(r2_score(y[tr_e:te_e], mp.predict(X[tr_e:te_e])))
	clrs = [GR if r > 0 else RD for r in wf_r2]
	bars2 = ax2.bar(range(len(windows)), wf_r2, color=clrs, edgecolor='black', lw=0.5)
	for b, r in zip(bars2, wf_r2):
	    # Value label just above positive bars, just below negative ones.
	    ax2.text(b.get_x()+b.get_width()/2, r+0.03 if r>=0 else r-0.08, f'{r:.2f}', ha='center', fontsize=9, fontweight='bold')
	ax2.axhline(ovr_r2, color=NV, ls='--', lw=1.5, label=f'Overall R²={ovr_r2:.2f}')
	ax2.axhline(0, color='black', lw=0.8)
	ax2.set_xticks(range(len(windows))); ax2.set_xticklabels(wf_labels, fontsize=9)
	ax2.set_title('Walk-Forward Validation R² (2015–2024)', fontsize=12)
	ax2.set_ylabel('R²'); ax2.legend(fontsize=9); ax2.grid(True, alpha=0.3, axis='y')

	# [3] Feature importance: absolute Ridge coefficients (on standardized inputs)
	ax3 = fig.add_subplot(gs[1, 1])
	coef = np.abs(model.named_steps['ridge'].coef_)
	fi = pd.DataFrame({'feature': X_cols, 'importance': coef}).sort_values('importance', ascending=True).tail(12)
	ax3.barh(range(len(fi)), fi['importance'], color=plt.cm.RdYlGn(fi['importance']/fi['importance'].max()), edgecolor='black', lw=0.3)
	ax3.set_yticks(range(len(fi))); ax3.set_yticklabels([f.replace('_',' ') for f in fi['feature']], fontsize=9)
	# '|coef|' with plain pipes: the former '\|' was an invalid escape sequence.
	ax3.set_title('Feature Importance (|coef|)\nTop 12 Price Drivers', fontsize=12); ax3.grid(True, alpha=0.3, axis='x')

	# [4] Price distribution
	ax4 = fig.add_subplot(gs[2, 0])
	ax4.hist(df[TARGET], bins=25, color=BR, alpha=0.6, edgecolor='black', lw=0.3, density=True)
	ax4.axvline(cp, color=NV, lw=2.5, label=f'Current: ${cp:.0f}/MBF ({pct*100:.0f}th)')
	ax4.axvline(np.percentile(df[TARGET], 75), color=GR, ls='--', lw=1.5, label=f'75th: ${np.percentile(df[TARGET], 75):.0f}')
	ax4.axvline(np.percentile(df[TARGET], 25), color=RD, ls='--', lw=1.5, label=f'25th: ${np.percentile(df[TARGET], 25):.0f}')
	ax4.set_title('Price Distribution (1993–2024)', fontsize=12); ax4.legend(fontsize=8); ax4.grid(True, alpha=0.3)
	ax4.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x:,.0f}'))

	# [5] MC timing (most frequent optimal quarter highlighted)
	ax5 = fig.add_subplot(gs[2, 1])
	bar_clrs5 = [GR if p==max(timing_pct) else NV for p in timing_pct]
	bars5 = ax5.bar(range(9), timing_pct, color=bar_clrs5, edgecolor='black', lw=0.5)
	for b, tp in zip(bars5, timing_pct):
	    ax5.text(b.get_x()+b.get_width()/2, b.get_height()+0.3, f'{tp:.0f}%', ha='center', fontsize=8)
	ax5.set_xticks(range(9)); ax5.set_xticklabels([qlabels[i].replace(' ','\n') for i in range(9)], fontsize=7)
	ax5.set_title(f'Optimal Sell Timing ({n_sims:,} MC paths)', fontsize=12); ax5.grid(True, alpha=0.3, axis='y')

	# [6] Revenue sensitivity (y axis in thousands of dollars)
	ax6 = fig.add_subplot(gs[3, :])
	pr = np.linspace(100, 900, 300)
	sell_now_k = pr*TOTAL_MBF/1000
	# Restored the multiplications that had been lost ('prTOTAL_MBF',
	# 'HOLD_COST_Q4', 'HOLD_COST_Q8' were undefined names): gross value minus
	# 4 or 8 quarters of holding cost.
	hold_1yr_k = (pr*TOTAL_MBF - HOLD_COST_Q*4)/1000
	hold_2yr_k = (pr*TOTAL_MBF - HOLD_COST_Q*8)/1000
	ax6.plot(pr, sell_now_k, color=GR, lw=2, label='Sell now')
	ax6.plot(pr, hold_1yr_k, '--', color='orange', lw=1.8, label='After 1 yr hold')
	ax6.plot(pr, hold_2yr_k, '--', color=RD, lw=1.8, label='After 2 yr hold')
	ax6.fill_between(pr, hold_1yr_k, sell_now_k, alpha=0.12, color=RD)
	ax6.axvline(cp, color=NV, lw=2.5, ls=':', label=f'Current: ${cp:.0f}/MBF → ${cp*TOTAL_MBF/1000:.0f}K')
	ax6.axvspan(np.min(fc_ci10), np.max(fc_ci90), alpha=0.07, color=NV, label=f'2025–2026 forecast range')
	ax6.text(0.74, 0.92, f"RECOMMENDATION\n{rec}\n{urg} (score {score}/7)", transform=ax6.transAxes,
	         fontsize=12, fontweight='bold', ha='center', va='top',
	         bbox=dict(boxstyle='round,pad=0.6', facecolor=clr, alpha=0.25, edgecolor=clr, lw=2))
	ax6.set_title(f'Revenue Sensitivity ({ACRES} ac × {MBF_AC} MBF/ac = {TOTAL_MBF:,} MBF)', fontsize=12)
	ax6.set_xlabel('$/MBF'); ax6.legend(fontsize=9, loc='upper left', ncol=2); ax6.grid(True, alpha=0.3)
	ax6.xaxis.set_major_formatter(plt.FuncFormatter(lambda x,p: f'${x:,.0f}'))
	ax6.yaxis.set_major_formatter(plt.FuncFormatter(lambda x,p: f'${x:,.0f}K'))
	ax6.set_xlim(100, 900)

	# Plain '|' in the super-title: the former '\|' was an invalid escape sequence.
	fig.suptitle('Oregon Timber Market Timing Analysis\n20-Acre Douglas Fir Tract | Ridge Regression + Monte Carlo',
	             fontsize=15, fontweight='bold', y=0.995)
	plt.savefig(f'{OUT_DIR}/timber_analysis.png', dpi=150, bbox_inches='tight')
	plt.close()
	print(" ✓ timber_analysis.png")

	# ── Save outputs ────────────────────────────────────────────────────────────
	# Forecast table: one row per forecast quarter, with price percentiles and the
	# matching gross tract values (price × TOTAL_MBF), all rounded to whole dollars.
	_price_bands = {
	    'median': fc_point,
	    'ci10': fc_ci10,
	    'ci25': fc_ci25,
	    'ci75': fc_ci75,
	    'ci90': fc_ci90,
	}
	fc = pd.DataFrame({
	    'quarter': [f"Q{d.quarter} {d.year}" for d in fc_dates],
	    'date': fc_dates,
	})
	for _band, _vals in _price_bands.items():
	    fc[f'price_{_band}'] = _vals.round(0)
	for _band in ('median', 'ci10', 'ci90'):
	    fc[f'gross_value_{_band}'] = (_price_bands[_band] * TOTAL_MBF).round(0)
	fc.to_csv(f'{OUT_DIR}/forecast_results.csv', index=False)

	# Per-window walk-forward R², refitted here so the report section is self-contained.
	wf_r2_all = []
	for tr_e, te_e in windows:
	    m2 = Pipeline([('sc', StandardScaler()), ('ridge', Ridge(alpha=5.0))])
	    m2.fit(X[:tr_e], y[:tr_e])
	    wf_r2_all.append({'period': f"{feat_df.index[tr_e].strftime('%Y')}-{feat_df.index[min(te_e,n)-1].strftime('%Y')}",
	                      'R2': r2_score(y[tr_e:te_e], m2.predict(X[tr_e:te_e]))})

	# Fragments are built outside the template: the former ' \| '.join(...) put a
	# backslash inside an f-string expression (a SyntaxError before Python 3.12)
	# and, like the other '\|' separators, emitted a stray backslash.
	_per_window = ' | '.join(f"{w['period']} R²={w['R2']:+.2f}" for w in wf_r2_all)
	_forecast_tbl = fc[['quarter','price_median','price_ci10','price_ci90']].to_string(index=False)
	_reason_lines = '\n'.join(' • '+r for r in reasons)
	_drivers_tbl = fi.sort_values('importance', ascending=False).head(8)[['feature','importance']].to_string(index=False)

	# Assembled line by line so the report text never depends on how this source
	# file happens to be indented.
	_report_lines = [
	    "OREGON TIMBER MARKET TIMING REPORT",
	    "=================================",
	    f"Date: {pd.Timestamp.now().strftime('%B %d, %Y')}",
	    f"Tract: {ACRES} acres | {TOTAL_MBF:,} MBF",
	    "",
	    "MODEL PERFORMANCE (Walk-Forward CV, 2015–2024)",
	    f"Overall R²: {ovr_r2:.3f} | MAE: ${ovr_mae:.0f}/MBF",
	    f"Per-window: {_per_window}",
	    "",
	    f"MARKET: ${cp:,.0f}/MBF (pct {pct*100:.0f}) | 3yr hi ${hi3:,.0f}, lo ${lo3:,.0f}",
	    "",
	    "8Q FORECAST:",
	    _forecast_tbl,
	    "",
	    f"RECOMMENDATION: {rec} (score {score}/7, {urg})",
	    _reason_lines,
	    "",
	    f"MC: μ={mu_q:+.3f}/q σ={sig_q:.3f}/q | P(wait better)={prob_wait*100:.0f}%",
	    "",
	    "KEY DRIVERS:",
	    _drivers_tbl,
	    "",
	    "PRODUCTION NOTES:",
	    "- Replace synthetic data with ODF quarterly records (oregon.gov/ODF)",
	    "- Add FRED API: housing starts (HOUST1F), lumber PPI (WPU0811), mortgage (MORTGAGE30US)",
	    "- Subscribe to Random Lengths weekly composite (randomlengths.com)",
	    "- Cruise survey your stand for actual MBF/acre (±30% from default)",
	    "- Re-run quarterly as new data arrives",
	]
	report = "\n".join(_report_lines) + "\n"

	# Explicit UTF-8: the report contains non-ASCII characters (R², μ, σ, •, –, ±)
	# that a platform-default codec may not be able to encode.
	with open(f'{OUT_DIR}/timber_report.txt', 'w', encoding='utf-8') as f:
	    f.write(report)

	# Closing banner: repeat the recommendation and list the files written.
	_rule = '=' * 60
	print(f"\n{_rule}")
	print(f" RECOMMENDATION: {rec} (score {score}/7, {urg})")
	print(_rule)
	print("\nOutputs: timber_analysis.png, forecast_results.csv, timber_report.txt")