| | |
"""
Run EDA: distributions, skewness, kurtosis, correlations, and a 2-D PCA projection.

Note: t-SNE is not implemented in this script.
"""
| | import os |
| | import sys |
| | import glob |
| | sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src')) |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| | from sklearn.decomposition import PCA |
| | from sklearn.preprocessing import StandardScaler |
| |
|
| | |
def find_latest_data():
    """Return the path of the newest processed FRED CSV.

    Candidates are the files matching ``data/processed/fred_data_*.csv``,
    resolved relative to the current working directory. "Newest" is the
    candidate with the largest ``os.path.getctime`` value.

    Returns:
        The matching path string with the greatest ctime.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    candidates = glob.glob('data/processed/fred_data_*.csv')
    if candidates:
        return max(candidates, key=os.path.getctime)
    raise FileNotFoundError("No FRED data files found. Run the pipeline first.")
| |
|
# Directory (relative to the current working directory) receiving every plot.
_EXPORT_DIR = 'data/exports'


def _save_distribution_plots(df):
    """Save one histogram-with-KDE PNG per column of ``df`` in the export directory."""
    for col in df.columns:
        plt.figure(figsize=(8, 4))
        # NaNs are dropped per column so each plot uses all values that column has.
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(f"Distribution of {col}")
        plt.savefig(f"{_EXPORT_DIR}/distribution_{col}.png", dpi=200, bbox_inches='tight')
        plt.close()


def _save_corr_heatmap(corr, title, filename):
    """Draw ``corr`` as an annotated heatmap titled ``title`` and save it as ``filename``."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(f"{_EXPORT_DIR}/{filename}", dpi=200)
    plt.close()


def _save_pca_projection(df_clean):
    """Standardize ``df_clean``, project it onto 2 principal components and save a scatter plot.

    Returns True when the plot was written, False when the step was skipped
    because ``df_clean`` has fewer than 2 rows or fewer than 2 columns.
    """
    n_rows, n_cols = df_clean.shape
    # PCA(n_components=2) needs at least 2 samples and 2 features, and the
    # scaler rejects an empty frame; skip instead of crashing after all the
    # other outputs have already been produced.
    if n_rows < 2 or n_cols < 2:
        print(
            "\nSkipping PCA: need at least 2 complete rows and 2 columns "
            f"after dropping NaNs (got {n_rows} rows, {n_cols} columns)."
        )
        return False

    scaled = StandardScaler().fit_transform(df_clean)
    pca_result = PCA(n_components=2).fit_transform(scaled)
    pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'], index=df_clean.index)
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.5)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA Projection (2D)')
    plt.tight_layout()
    plt.savefig(f"{_EXPORT_DIR}/pca_2d.png", dpi=200)
    plt.close()
    return True


def main():
    """Run the exploratory analysis on the latest processed FRED CSV.

    Steps, in order:
      1. Load the file returned by ``find_latest_data()`` with the first
         column as a date-parsed index.
      2. Print descriptive statistics, skewness and kurtosis.
      3. Save a distribution plot for every column.
      4. Print the Pearson and Spearman correlation matrices and save a
         heatmap of each.
      5. Save a 2-D PCA scatter plot built from the rows that have no
         missing values (skipped with a message if too little data remains).

    All images are written under ``data/exports/``, which is created if it
    does not exist.

    Raises:
        FileNotFoundError: Propagated from ``find_latest_data()`` when no
            data file is available.
    """
    print("="*60)
    print("FRED EDA: Distributions, Skewness, Kurtosis, Correlations, PCA")
    print("="*60)
    data_file = find_latest_data()
    print(f"Using data file: {data_file}")
    df = pd.read_csv(data_file, index_col=0, parse_dates=True)

    # Every savefig below targets this directory; without it the first save
    # fails on a fresh checkout.
    os.makedirs(_EXPORT_DIR, exist_ok=True)

    desc = df.describe()
    skew = df.skew()
    kurt = df.kurtosis()
    print("\nDescriptive Statistics:\n", desc)
    print("\nSkewness:")
    print(skew)
    print("\nKurtosis:")
    print(kurt)

    _save_distribution_plots(df)

    pearson_corr = df.corr(method='pearson')
    spearman_corr = df.corr(method='spearman')
    print("\nPearson Correlation Matrix:\n", pearson_corr.round(3))
    print("\nSpearman Correlation Matrix:\n", spearman_corr.round(3))
    _save_corr_heatmap(pearson_corr, 'Pearson Correlation Matrix', 'pearson_corr_matrix.png')
    _save_corr_heatmap(spearman_corr, 'Spearman Correlation Matrix', 'spearman_corr_matrix.png')

    # PCA cannot handle missing values, so it only sees fully populated rows.
    _save_pca_projection(df.dropna())

    print("\nEDA complete. Outputs saved to data/exports/.")
| |
|
# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger file reads or plot generation.
if __name__ == "__main__":
    main()