| | |
| | """ |
| | Run Statistical Modeling: Linear regression, diagnostics, p-values, confidence intervals, plots |
| | """ |
| | import os |
| | import sys |
| | import glob |
| | sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src')) |
| | import pandas as pd |
| | import numpy as np |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | from sklearn.linear_model import LinearRegression |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.preprocessing import StandardScaler |
| | import statsmodels.api as sm |
| | from scipy import stats |
| | from statsmodels.stats.diagnostic import het_breuschpagan |
| | from statsmodels.stats.outliers_influence import variance_inflation_factor |
| |
|
| | def find_latest_data(): |
| | data_files = glob.glob('data/processed/fred_data_*.csv') |
| | if not data_files: |
| | raise FileNotFoundError("No FRED data files found. Run the pipeline first.") |
| | return max(data_files, key=os.path.getctime) |
| |
|
| | def main(): |
| | print("="*60) |
| | print("FRED Statistical Modeling: Linear Regression & Diagnostics") |
| | print("="*60) |
| | data_file = find_latest_data() |
| | print(f"Using data file: {data_file}") |
| | df = pd.read_csv(data_file, index_col=0, parse_dates=True) |
| | df_clean = df.dropna() |
| | target_var = 'GDP' |
| | if target_var not in df_clean.columns: |
| | print(f"Target variable '{target_var}' not found in data.") |
| | return |
| | feature_cols = [col for col in df_clean.columns if col != target_var] |
| | X = df_clean[feature_cols] |
| | y = df_clean[target_var] |
| | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) |
| | |
| | model = LinearRegression() |
| | model.fit(X_train, y_train) |
| | y_pred_train = model.predict(X_train) |
| | y_pred_test = model.predict(X_test) |
| | |
| | r2_train = model.score(X_train, y_train) |
| | r2_test = model.score(X_test, y_test) |
| | print(f"R² (Train): {r2_train:.4f} | R² (Test): {r2_test:.4f}") |
| | |
| | print("\nCoefficients:") |
| | for feature, coef in zip(feature_cols, model.coef_): |
| | print(f" {feature}: {coef:.4f}") |
| | print(f" Intercept: {model.intercept_:.4f}") |
| | |
| | X_with_const = sm.add_constant(X_train) |
| | model_sm = sm.OLS(y_train, X_with_const).fit() |
| | print("\nStatistical Significance:") |
| | print(model_sm.summary().tables[1]) |
| | |
| | with open('data/exports/regression_summary.txt', 'w') as f: |
| | f.write(str(model_sm.summary())) |
| | |
| | residuals = y_train - y_pred_train |
| | |
| | _, p_value_norm = stats.normaltest(residuals) |
| | print(f"Normality test (p-value): {p_value_norm:.4f}") |
| | |
| | vif_data = [] |
| | for i in range(X_train.shape[1]): |
| | try: |
| | vif = variance_inflation_factor(X_train.values, i) |
| | vif_data.append(vif) |
| | except: |
| | vif_data.append(np.nan) |
| | print("\nVariance Inflation Factors:") |
| | for feature, vif in zip(feature_cols, vif_data): |
| | print(f" {feature}: {vif:.3f}") |
| | |
| | try: |
| | _, p_value_het = het_breuschpagan(residuals, X_with_const) |
| | print(f"Homoscedasticity test (p-value): {p_value_het:.4f}") |
| | except: |
| | print("Homoscedasticity test failed") |
| | |
| | plt.figure(figsize=(12,4)) |
| | plt.subplot(1,2,1) |
| | plt.scatter(y_pred_train, residuals, alpha=0.5) |
| | plt.axhline(0, color='red', linestyle='--') |
| | plt.xlabel('Fitted Values') |
| | plt.ylabel('Residuals') |
| | plt.title('Residuals vs Fitted') |
| | plt.subplot(1,2,2) |
| | stats.probplot(residuals, dist="norm", plot=plt) |
| | plt.title('Normal Q-Q') |
| | plt.tight_layout() |
| | plt.savefig('data/exports/regression_diagnostics.png', dpi=200) |
| | plt.close() |
| | print("\nStatistical modeling complete. Outputs saved to data/exports/.") |
| |
|
| | if __name__ == "__main__": |
| | main() |