|
|
|
|
|
""" |
|
|
Advanced Analytics Module for FRED Economic Data |
|
|
Performs comprehensive statistical analysis, modeling, and insights extraction. |
|
|
""" |
|
|
|
|
|
import warnings |
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import seaborn as sns |
|
|
import statsmodels.api as sm |
|
|
from scipy import stats |
|
|
from sklearn.cluster import KMeans |
|
|
from sklearn.decomposition import PCA |
|
|
from sklearn.linear_model import LinearRegression |
|
|
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from statsmodels.stats.diagnostic import het_breuschpagan |
|
|
from statsmodels.stats.outliers_influence import variance_inflation_factor |
|
|
from statsmodels.tsa.arima.model import ARIMA |
|
|
from statsmodels.tsa.seasonal import seasonal_decompose |
|
|
|
|
|
warnings.filterwarnings("ignore") |
|
|
|
|
|
|
|
|
class AdvancedAnalytics: |
|
|
""" |
|
|
Comprehensive analytics class for FRED economic data. |
|
|
Performs EDA, statistical modeling, segmentation, and time series analysis. |
|
|
""" |
|
|
|
|
|
def __init__(self, data_path=None, df=None): |
|
|
"""Initialize with data path or DataFrame.""" |
|
|
if df is not None: |
|
|
self.df = df |
|
|
elif data_path: |
|
|
self.df = pd.read_csv(data_path, index_col=0, parse_dates=True) |
|
|
else: |
|
|
raise ValueError("Must provide either data_path or DataFrame") |
|
|
|
|
|
self.scaler = StandardScaler() |
|
|
self.results = {} |
|
|
|
|
|
def perform_eda(self): |
|
|
"""Perform comprehensive Exploratory Data Analysis.""" |
|
|
print("=" * 60) |
|
|
print("EXPLORATORY DATA ANALYSIS") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
print(f"\nDataset Shape: {self.df.shape}") |
|
|
print(f"Date Range: {self.df.index.min()} to {self.df.index.max()}") |
|
|
print(f"Variables: {list(self.df.columns)}") |
|
|
|
|
|
|
|
|
print("\n" + "=" * 40) |
|
|
print("DESCRIPTIVE STATISTICS") |
|
|
print("=" * 40) |
|
|
desc_stats = self.df.describe() |
|
|
print(desc_stats) |
|
|
|
|
|
|
|
|
print("\n" + "=" * 40) |
|
|
print("SKEWNESS AND KURTOSIS") |
|
|
print("=" * 40) |
|
|
skewness = self.df.skew() |
|
|
kurtosis = self.df.kurtosis() |
|
|
|
|
|
for col in self.df.columns: |
|
|
print(f"{col}:") |
|
|
print(f" Skewness: {skewness[col]:.3f}") |
|
|
print(f" Kurtosis: {kurtosis[col]:.3f}") |
|
|
|
|
|
|
|
|
print("\n" + "=" * 40) |
|
|
print("CORRELATION ANALYSIS") |
|
|
print("=" * 40) |
|
|
|
|
|
|
|
|
pearson_corr = self.df.corr(method="pearson") |
|
|
print("\nPearson Correlation Matrix:") |
|
|
print(pearson_corr.round(3)) |
|
|
|
|
|
|
|
|
spearman_corr = self.df.corr(method="spearman") |
|
|
print("\nSpearman Correlation Matrix:") |
|
|
print(spearman_corr.round(3)) |
|
|
|
|
|
|
|
|
self.results["eda"] = { |
|
|
"descriptive_stats": desc_stats, |
|
|
"skewness": skewness, |
|
|
"kurtosis": kurtosis, |
|
|
"pearson_corr": pearson_corr, |
|
|
"spearman_corr": spearman_corr, |
|
|
} |
|
|
|
|
|
return self.results["eda"] |
|
|
|
|
|
def perform_dimensionality_reduction(self, method="pca", n_components=2): |
|
|
"""Perform dimensionality reduction for visualization.""" |
|
|
print("\n" + "=" * 40) |
|
|
print(f"DIMENSIONALITY REDUCTION ({method.upper()})") |
|
|
print("=" * 40) |
|
|
|
|
|
|
|
|
df_clean = self.df.dropna() |
|
|
|
|
|
if method.lower() == "pca": |
|
|
|
|
|
pca = PCA(n_components=n_components) |
|
|
scaled_data = self.scaler.fit_transform(df_clean) |
|
|
pca_result = pca.fit_transform(scaled_data) |
|
|
|
|
|
print(f"Explained variance ratio: {pca.explained_variance_ratio_}") |
|
|
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.3f}") |
|
|
|
|
|
|
|
|
pca_df = pd.DataFrame( |
|
|
pca_result, |
|
|
columns=[f"PC{i+1}" for i in range(n_components)], |
|
|
index=df_clean.index, |
|
|
) |
|
|
|
|
|
self.results["pca"] = { |
|
|
"components": pca_df, |
|
|
"explained_variance": pca.explained_variance_ratio_, |
|
|
"feature_importance": pd.DataFrame( |
|
|
pca.components_.T, |
|
|
columns=[f"PC{i+1}" for i in range(n_components)], |
|
|
index=df_clean.columns, |
|
|
), |
|
|
} |
|
|
|
|
|
return self.results["pca"] |
|
|
|
|
|
return None |
|
|
|
|
|
def perform_statistical_modeling(self, target_var="GDP", test_size=0.2): |
|
|
"""Perform linear regression with comprehensive diagnostics.""" |
|
|
print("\n" + "=" * 40) |
|
|
print("STATISTICAL MODELING - LINEAR REGRESSION") |
|
|
print("=" * 40) |
|
|
|
|
|
|
|
|
df_clean = self.df.dropna() |
|
|
|
|
|
if target_var not in df_clean.columns: |
|
|
print(f"Target variable '{target_var}' not found in dataset") |
|
|
return None |
|
|
|
|
|
|
|
|
feature_cols = [col for col in df_clean.columns if col != target_var] |
|
|
X = df_clean[feature_cols] |
|
|
y = df_clean[target_var] |
|
|
|
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split( |
|
|
X, y, test_size=test_size, random_state=42 |
|
|
) |
|
|
|
|
|
|
|
|
model = LinearRegression() |
|
|
model.fit(X_train, y_train) |
|
|
|
|
|
|
|
|
y_pred_train = model.predict(X_train) |
|
|
y_pred_test = model.predict(X_test) |
|
|
|
|
|
|
|
|
r2_train = r2_score(y_train, y_pred_train) |
|
|
r2_test = r2_score(y_test, y_pred_test) |
|
|
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train)) |
|
|
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test)) |
|
|
|
|
|
print(f"\nModel Performance:") |
|
|
print(f"R² (Training): {r2_train:.4f}") |
|
|
print(f"R² (Test): {r2_test:.4f}") |
|
|
print(f"RMSE (Training): {rmse_train:.4f}") |
|
|
print(f"RMSE (Test): {rmse_test:.4f}") |
|
|
|
|
|
|
|
|
print(f"\nCoefficients:") |
|
|
for feature, coef in zip(feature_cols, model.coef_): |
|
|
print(f" {feature}: {coef:.4f}") |
|
|
print(f" Intercept: {model.intercept_:.4f}") |
|
|
|
|
|
|
|
|
X_with_const = sm.add_constant(X_train) |
|
|
model_sm = sm.OLS(y_train, X_with_const).fit() |
|
|
|
|
|
print(f"\nStatistical Significance:") |
|
|
print(model_sm.summary().tables[1]) |
|
|
|
|
|
|
|
|
print(f"\n" + "=" * 30) |
|
|
print("REGRESSION ASSUMPTIONS") |
|
|
print("=" * 30) |
|
|
|
|
|
|
|
|
residuals = y_train - y_pred_train |
|
|
_, p_value_norm = stats.normaltest(residuals) |
|
|
print(f"Normality test (p-value): {p_value_norm:.4f}") |
|
|
|
|
|
|
|
|
vif_data = [] |
|
|
for i in range(X_train.shape[1]): |
|
|
try: |
|
|
vif = variance_inflation_factor(X_train.values, i) |
|
|
vif_data.append(vif) |
|
|
except: |
|
|
vif_data.append(np.nan) |
|
|
|
|
|
print(f"\nVariance Inflation Factors:") |
|
|
for feature, vif in zip(feature_cols, vif_data): |
|
|
print(f" {feature}: {vif:.3f}") |
|
|
|
|
|
|
|
|
try: |
|
|
_, p_value_het = het_breuschpagan(residuals, X_with_const) |
|
|
print(f"\nHomoscedasticity test (p-value): {p_value_het:.4f}") |
|
|
except: |
|
|
p_value_het = np.nan |
|
|
print(f"\nHomoscedasticity test failed") |
|
|
|
|
|
|
|
|
self.results["regression"] = { |
|
|
"model": model, |
|
|
"model_sm": model_sm, |
|
|
"performance": { |
|
|
"r2_train": r2_train, |
|
|
"r2_test": r2_test, |
|
|
"rmse_train": rmse_train, |
|
|
"rmse_test": rmse_test, |
|
|
}, |
|
|
"coefficients": dict(zip(feature_cols, model.coef_)), |
|
|
"assumptions": { |
|
|
"normality_p": p_value_norm, |
|
|
"homoscedasticity_p": p_value_het, |
|
|
"vif": dict(zip(feature_cols, vif_data)), |
|
|
}, |
|
|
} |
|
|
|
|
|
return self.results["regression"] |
|
|
|
|
|
def perform_clustering(self, max_k=10): |
|
|
"""Perform clustering analysis with optimal k selection.""" |
|
|
print("\n" + "=" * 40) |
|
|
print("CLUSTERING ANALYSIS") |
|
|
print("=" * 40) |
|
|
|
|
|
|
|
|
df_clean = self.df.dropna() |
|
|
if df_clean.shape[0] < 10 or df_clean.shape[1] < 2: |
|
|
print( |
|
|
"Not enough data for clustering (need at least 10 rows and 2 columns after dropna). Skipping." |
|
|
) |
|
|
self.results["clustering"] = None |
|
|
return None |
|
|
try: |
|
|
scaled_data = self.scaler.fit_transform(df_clean) |
|
|
except Exception as e: |
|
|
print(f"Scaling failed: {e}") |
|
|
self.results["clustering"] = None |
|
|
return None |
|
|
|
|
|
inertias = [] |
|
|
silhouette_scores = [] |
|
|
k_range = range(2, min(max_k + 1, len(df_clean) // 10 + 1)) |
|
|
if len(k_range) < 2: |
|
|
print("Not enough data for multiple clusters. Skipping clustering.") |
|
|
self.results["clustering"] = None |
|
|
return None |
|
|
try: |
|
|
for k in k_range: |
|
|
kmeans = KMeans(n_clusters=k, random_state=42) |
|
|
kmeans.fit(scaled_data) |
|
|
inertias.append(kmeans.inertia_) |
|
|
silhouette_scores.append(silhouette_score(scaled_data, kmeans.labels_)) |
|
|
|
|
|
if inertias and silhouette_scores: |
|
|
plt.figure(figsize=(12, 4)) |
|
|
plt.subplot(1, 2, 1) |
|
|
plt.plot(list(k_range), inertias, "bo-") |
|
|
plt.xlabel("Number of Clusters (k)") |
|
|
plt.ylabel("Inertia") |
|
|
plt.title("Elbow Method") |
|
|
plt.grid(True) |
|
|
plt.subplot(1, 2, 2) |
|
|
plt.plot(list(k_range), silhouette_scores, "ro-") |
|
|
plt.xlabel("Number of Clusters (k)") |
|
|
plt.ylabel("Silhouette Score") |
|
|
plt.title("Silhouette Analysis") |
|
|
plt.grid(True) |
|
|
plt.tight_layout() |
|
|
plt.savefig( |
|
|
"data/exports/clustering_analysis.png", dpi=300, bbox_inches="tight" |
|
|
) |
|
|
plt.show() |
|
|
|
|
|
optimal_k = list(k_range)[np.argmax(silhouette_scores)] |
|
|
print(f"Optimal number of clusters: {optimal_k}") |
|
|
print(f"Best silhouette score: {max(silhouette_scores):.3f}") |
|
|
|
|
|
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42) |
|
|
cluster_labels = kmeans_optimal.fit_predict(scaled_data) |
|
|
|
|
|
df_clustered = df_clean.copy() |
|
|
df_clustered["Cluster"] = cluster_labels |
|
|
|
|
|
print(f"\nCluster Characteristics:") |
|
|
cluster_stats = df_clustered.groupby("Cluster").agg(["mean", "std"]) |
|
|
print(cluster_stats.round(3)) |
|
|
|
|
|
self.results["clustering"] = { |
|
|
"optimal_k": optimal_k, |
|
|
"silhouette_score": max(silhouette_scores), |
|
|
"cluster_labels": cluster_labels, |
|
|
"clustered_data": df_clustered, |
|
|
"cluster_stats": cluster_stats, |
|
|
"inertias": inertias, |
|
|
"silhouette_scores": silhouette_scores, |
|
|
} |
|
|
return self.results["clustering"] |
|
|
except Exception as e: |
|
|
print(f"Clustering failed: {e}") |
|
|
self.results["clustering"] = None |
|
|
return None |
|
|
|
|
|
def perform_time_series_analysis(self, target_var="GDP"): |
|
|
"""Perform comprehensive time series analysis.""" |
|
|
print("\n" + "=" * 40) |
|
|
print("TIME SERIES ANALYSIS") |
|
|
print("=" * 40) |
|
|
|
|
|
if target_var not in self.df.columns: |
|
|
print(f"Target variable '{target_var}' not found") |
|
|
self.results["time_series"] = None |
|
|
return None |
|
|
|
|
|
ts_data = self.df[target_var].dropna() |
|
|
if len(ts_data) < 50: |
|
|
print( |
|
|
"Insufficient data for time series analysis (need at least 50 points). Skipping." |
|
|
) |
|
|
self.results["time_series"] = None |
|
|
return None |
|
|
print(f"Time series length: {len(ts_data)} observations") |
|
|
print(f"Date range: {ts_data.index.min()} to {ts_data.index.max()}") |
|
|
|
|
|
print(f"\nTime Series Decomposition:") |
|
|
try: |
|
|
|
|
|
if ts_data.index.freq is None: |
|
|
ts_monthly = ts_data.resample("M").mean() |
|
|
else: |
|
|
ts_monthly = ts_data |
|
|
decomposition = seasonal_decompose(ts_monthly, model="additive", period=12) |
|
|
|
|
|
fig, axes = plt.subplots(4, 1, figsize=(12, 10)) |
|
|
decomposition.observed.plot(ax=axes[0], title="Original Time Series") |
|
|
decomposition.trend.plot(ax=axes[1], title="Trend") |
|
|
decomposition.seasonal.plot(ax=axes[2], title="Seasonality") |
|
|
decomposition.resid.plot(ax=axes[3], title="Residuals") |
|
|
plt.tight_layout() |
|
|
plt.savefig( |
|
|
"data/exports/time_series_decomposition.png", |
|
|
dpi=300, |
|
|
bbox_inches="tight", |
|
|
) |
|
|
plt.show() |
|
|
except Exception as e: |
|
|
print(f"Decomposition failed: {e}") |
|
|
|
|
|
print(f"\nARIMA Modeling:") |
|
|
try: |
|
|
|
|
|
model = ARIMA(ts_monthly, order=(1, 1, 1)) |
|
|
fitted_model = model.fit() |
|
|
print(f"ARIMA Model Summary:") |
|
|
print(fitted_model.summary()) |
|
|
|
|
|
forecast_steps = min(12, len(ts_monthly) // 4) |
|
|
forecast = fitted_model.forecast(steps=forecast_steps) |
|
|
conf_int = fitted_model.get_forecast(steps=forecast_steps).conf_int() |
|
|
|
|
|
plt.figure(figsize=(12, 6)) |
|
|
ts_monthly.plot(label="Historical Data") |
|
|
forecast.plot(label="Forecast", color="red") |
|
|
plt.fill_between( |
|
|
forecast.index, |
|
|
conf_int.iloc[:, 0], |
|
|
conf_int.iloc[:, 1], |
|
|
alpha=0.3, |
|
|
color="red", |
|
|
label="Confidence Interval", |
|
|
) |
|
|
plt.title(f"{target_var} - ARIMA Forecast") |
|
|
plt.legend() |
|
|
plt.grid(True) |
|
|
plt.tight_layout() |
|
|
plt.savefig( |
|
|
"data/exports/time_series_forecast.png", dpi=300, bbox_inches="tight" |
|
|
) |
|
|
plt.show() |
|
|
|
|
|
self.results["time_series"] = { |
|
|
"model": fitted_model, |
|
|
"forecast": forecast, |
|
|
"confidence_intervals": conf_int, |
|
|
"decomposition": decomposition if "decomposition" in locals() else None, |
|
|
} |
|
|
except Exception as e: |
|
|
print(f"ARIMA modeling failed: {e}") |
|
|
self.results["time_series"] = None |
|
|
return self.results.get("time_series") |
|
|
|
|
|
def generate_insights_report(self): |
|
|
"""Generate comprehensive insights report in layman's terms.""" |
|
|
print("\n" + "=" * 60) |
|
|
print("COMPREHENSIVE INSIGHTS REPORT") |
|
|
print("=" * 60) |
|
|
|
|
|
insights = [] |
|
|
|
|
|
if "eda" in self.results and self.results["eda"] is not None: |
|
|
insights.append("EXPLORATORY DATA ANALYSIS INSIGHTS:") |
|
|
insights.append("-" * 40) |
|
|
|
|
|
pearson_corr = self.results["eda"]["pearson_corr"] |
|
|
high_corr_pairs = [] |
|
|
for i in range(len(pearson_corr.columns)): |
|
|
for j in range(i + 1, len(pearson_corr.columns)): |
|
|
corr_val = pearson_corr.iloc[i, j] |
|
|
if abs(corr_val) > 0.7: |
|
|
high_corr_pairs.append( |
|
|
(pearson_corr.columns[i], pearson_corr.columns[j], corr_val) |
|
|
) |
|
|
if high_corr_pairs: |
|
|
insights.append("Strong correlations found:") |
|
|
for var1, var2, corr in high_corr_pairs: |
|
|
insights.append(f" • {var1} and {var2}: {corr:.3f}") |
|
|
else: |
|
|
insights.append( |
|
|
"No strong correlations (>0.7) found between variables." |
|
|
) |
|
|
else: |
|
|
insights.append("EDA could not be performed or returned no results.") |
|
|
|
|
|
if "regression" in self.results and self.results["regression"] is not None: |
|
|
insights.append("\nREGRESSION MODEL INSIGHTS:") |
|
|
insights.append("-" * 40) |
|
|
reg_results = self.results["regression"] |
|
|
r2_test = reg_results["performance"]["r2_test"] |
|
|
insights.append(f"Model Performance:") |
|
|
insights.append( |
|
|
f" • The model explains {r2_test:.1%} of the variation in the target variable" |
|
|
) |
|
|
if r2_test > 0.7: |
|
|
insights.append(" • This is considered a good model fit") |
|
|
elif r2_test > 0.5: |
|
|
insights.append(" • This is considered a moderate model fit") |
|
|
else: |
|
|
insights.append(" • This model has limited predictive power") |
|
|
|
|
|
assumptions = reg_results["assumptions"] |
|
|
if assumptions["normality_p"] > 0.05: |
|
|
insights.append( |
|
|
" • Residuals are normally distributed (assumption met)" |
|
|
) |
|
|
else: |
|
|
insights.append( |
|
|
" • Residuals are not normally distributed (assumption violated)" |
|
|
) |
|
|
else: |
|
|
insights.append( |
|
|
"Regression modeling could not be performed or returned no results." |
|
|
) |
|
|
|
|
|
if "clustering" in self.results and self.results["clustering"] is not None: |
|
|
insights.append("\nCLUSTERING INSIGHTS:") |
|
|
insights.append("-" * 40) |
|
|
cluster_results = self.results["clustering"] |
|
|
optimal_k = cluster_results["optimal_k"] |
|
|
silhouette_score = cluster_results["silhouette_score"] |
|
|
insights.append(f"Optimal number of clusters: {optimal_k}") |
|
|
insights.append(f"Cluster quality score: {silhouette_score:.3f}") |
|
|
if silhouette_score > 0.5: |
|
|
insights.append(" • Clusters are well-separated and distinct") |
|
|
elif silhouette_score > 0.3: |
|
|
insights.append(" • Clusters show moderate separation") |
|
|
else: |
|
|
insights.append(" • Clusters may not be well-defined") |
|
|
else: |
|
|
insights.append("Clustering could not be performed or returned no results.") |
|
|
|
|
|
if "time_series" in self.results and self.results["time_series"] is not None: |
|
|
insights.append("\nTIME SERIES INSIGHTS:") |
|
|
insights.append("-" * 40) |
|
|
insights.append( |
|
|
" • Time series decomposition shows trend, seasonality, and random components" |
|
|
) |
|
|
insights.append( |
|
|
" • ARIMA model provides future forecasts with confidence intervals" |
|
|
) |
|
|
insights.append( |
|
|
" • Forecasts can be used for planning and decision-making" |
|
|
) |
|
|
else: |
|
|
insights.append( |
|
|
"Time series analysis could not be performed or returned no results." |
|
|
) |
|
|
|
|
|
for insight in insights: |
|
|
print(insight) |
|
|
|
|
|
with open("data/exports/insights_report.txt", "w") as f: |
|
|
f.write("\n".join(insights)) |
|
|
return insights |
|
|
|
|
|
def run_complete_analysis(self): |
|
|
"""Run the complete advanced analytics workflow.""" |
|
|
print("Starting comprehensive advanced analytics...") |
|
|
|
|
|
|
|
|
self.perform_eda() |
|
|
|
|
|
|
|
|
self.perform_dimensionality_reduction() |
|
|
|
|
|
|
|
|
self.perform_statistical_modeling() |
|
|
|
|
|
|
|
|
self.perform_clustering() |
|
|
|
|
|
|
|
|
self.perform_time_series_analysis() |
|
|
|
|
|
|
|
|
self.generate_insights_report() |
|
|
|
|
|
print("\n" + "=" * 60) |
|
|
print("ANALYSIS COMPLETE!") |
|
|
print("=" * 60) |
|
|
print("Check the following outputs:") |
|
|
print(" • data/exports/insights_report.txt - Comprehensive insights") |
|
|
print(" • data/exports/clustering_analysis.png - Clustering results") |
|
|
print( |
|
|
" • data/exports/time_series_decomposition.png - Time series decomposition" |
|
|
) |
|
|
print(" • data/exports/time_series_forecast.png - Time series forecast") |
|
|
|
|
|
return self.results |
|
|
|