sahlnizar's picture
Add files using upload-large-folder tool
5da71f2 verified
#!/usr/bin/env python
# coding: utf-8
"""
Feature Correlation Analysis
Helps identify redundant features and features most correlated with Target.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# =============================================================================
# 1. LOAD DATA
# =============================================================================
# Read the semicolon-delimited dataset.
df = pd.read_csv('data.csv', sep=';')

# Keep only rows whose Target is not 'Enrolled'. The explicit .copy() makes the
# filtered frame an independent object, so the column assignment below does
# not trigger pandas' SettingWithCopyWarning.
df = df[df['Target'] != 'Enrolled'].copy()

# Encode the outcome as 0/1 so it can take part in the correlation analysis.
df['Target'] = df['Target'].map({'Dropout': 0, 'Graduate': 1})

print(f"Dataset shape: {df.shape}")
print(f"Features: {df.shape[1] - 1}")
# =============================================================================
# 2. CORRELATION WITH TARGET
# =============================================================================
print("\n" + "="*70)
print("CORRELATION WITH TARGET (Dropout=0, Graduate=1)")
print("="*70)

# Correlation of every column with Target, ordered by magnitude (largest first).
target_corr = df.corr()['Target'].drop('Target').sort_values(key=abs, ascending=False)

print("\nAll features ranked by absolute correlation with Target:\n")
for i, (feature, corr) in enumerate(target_corr.items(), 1):
    # Bucket by absolute correlation: > 0.3 strong, > 0.15 moderate, else weak.
    strength = "STRONG" if abs(corr) > 0.3 else "MODERATE" if abs(corr) > 0.15 else "WEAK"
    print(f"{i:2d}. {feature:50s} {corr:+.4f} [{strength}]")

# Horizontal bar chart: green for positive correlation, red otherwise.
plt.figure(figsize=(12, 10))
colors = ['green' if c > 0 else 'red' for c in target_corr.values]
target_corr.plot(kind='barh', color=colors)
plt.title('Feature Correlation with Target (Graduate=1)')
plt.xlabel('Correlation Coefficient')
plt.axvline(x=0, color='black', linewidth=0.5)
strong_line = plt.axvline(x=0.3, color='blue', linestyle='--', alpha=0.5, label='Strong threshold')
plt.axvline(x=-0.3, color='blue', linestyle='--', alpha=0.5)
# The label above is only rendered if a legend is created; restrict the legend
# to the threshold line so the bars themselves are not listed.
plt.legend(handles=[strong_line])
plt.tight_layout()
plt.savefig('correlation_with_target.png', dpi=150)
plt.show()
# =============================================================================
# 3. FEATURE-TO-FEATURE CORRELATION (Find Redundant Features)
# =============================================================================
print("\n" + "="*70)
print("HIGHLY CORRELATED FEATURE PAIRS (Potential Redundancy)")
print("="*70)

# Pairwise correlation between the features only (Target excluded).
corr_matrix = df.drop('Target', axis=1).corr()

# Walk the upper triangle so each unordered feature pair is examined once.
high_corr_pairs = []
threshold = 0.7
feature_names = corr_matrix.columns
for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        corr_value = corr_matrix.iloc[i, j]
        if abs(corr_value) >= threshold:
            high_corr_pairs.append({
                'Feature 1': feature_names[i],
                'Feature 2': feature_names[j],
                'Correlation': corr_value,
            })

# Name the columns explicitly: with no qualifying pairs, a bare
# pd.DataFrame([]) has no 'Correlation' column and sort_values() would raise
# KeyError before the "no pairs" message below could be printed.
high_corr_df = pd.DataFrame(
    high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation']
)
if not high_corr_df.empty:
    # Strongest relationships (by magnitude) first.
    high_corr_df = high_corr_df.sort_values('Correlation', key=abs, ascending=False)

print(f"\nFeature pairs with correlation >= {threshold}:\n")
if not high_corr_df.empty:
    for _, row in high_corr_df.iterrows():
        print(f" {row['Correlation']:+.4f} | {row['Feature 1']}")
        print(f" | {row['Feature 2']}")
        print()
else:
    print(" No highly correlated pairs found.")
# =============================================================================
# 4. CORRELATION HEATMAP
# =============================================================================
# Rendering options for the annotated feature-to-feature heatmap.
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'annot_kws': {'size': 6},
}

# Draw the full correlation matrix on a large canvas, save it, then display it.
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=150)
plt.show()
# =============================================================================
# 5. RECOMMENDATIONS FOR FEATURE SELECTION
# =============================================================================
banner = "=" * 70
print("\n" + banner)
print("FEATURE SELECTION RECOMMENDATIONS")
print(banner)

# --- 1. Features whose |correlation with Target| is below the weak cut-off ---
weak_threshold = 0.05
weak_mask = target_corr.abs() < weak_threshold
weak_features = target_corr[weak_mask]
print(f"\n1. WEAK CORRELATION WITH TARGET (|corr| < {weak_threshold}):")
print(" Consider removing these - they may not help prediction:\n")
for name, value in weak_features.items():
    print(f" - {name}: {value:+.4f}")

# --- 2. Features whose |correlation with Target| reaches the strong cut-off ---
strong_threshold = 0.2
strong_mask = target_corr.abs() >= strong_threshold
strong_features = target_corr[strong_mask]
print(f"\n2. STRONG CORRELATION WITH TARGET (|corr| >= {strong_threshold}):")
print(" Keep these - they are predictive:\n")
for name, value in strong_features.items():
    print(f" + {name}: {value:+.4f}")

# --- 3. For each highly correlated pair, suggest which member to keep ---
print(f"\n3. REDUNDANT FEATURES (correlated with each other >= {threshold}):")
print(" Consider keeping only one from each pair:\n")
for _, pair in high_corr_df.iterrows():
    first = pair['Feature 1']
    second = pair['Feature 2']
    first_strength = abs(target_corr.get(first, 0))
    second_strength = abs(target_corr.get(second, 0))
    # The member more correlated with Target is kept; ties favour 'Feature 1'.
    if first_strength >= second_strength:
        keep, drop = first, second
    else:
        keep, drop = second, first
    print(f" KEEP: {keep} (target corr: {target_corr.get(keep, 0):+.4f})")
    print(f" DROP: {drop} (target corr: {target_corr.get(drop, 0):+.4f})")
    print()
# =============================================================================
# 6. SUGGESTED FEATURES TO DROP
# =============================================================================
print("\n" + "="*70)
print("SUGGESTED FEATURES TO DROP")
print("="*70)

# Start with every feature that is only weakly correlated with Target.
features_to_drop = set(weak_features.index)

# From each highly correlated pair, drop the member less correlated with
# Target. Record every feature that appears in any pair along the way, so the
# reason lookup below is a set membership test instead of re-scanning the
# pair table for each dropped feature.
paired_features = set()
for _, row in high_corr_df.iterrows():
    first, second = row['Feature 1'], row['Feature 2']
    paired_features.update((first, second))
    corr1 = abs(target_corr.get(first, 0))
    corr2 = abs(target_corr.get(second, 0))
    drop = second if corr1 >= corr2 else first
    features_to_drop.add(drop)

print(f"\nBased on analysis, consider dropping these {len(features_to_drop)} features:\n")
# Iterate in sorted order: plain set iteration over strings varies between
# interpreter runs, which would make the report order unstable.
for f in sorted(features_to_drop):
    reason = []
    if f in weak_features.index:
        reason.append(f"weak target corr ({target_corr[f]:+.4f})")
    if f in paired_features:
        reason.append("redundant with another feature")
    print(f" - {f}")
    print(f" Reason: {', '.join(reason)}")

# Everything not marked for removal, in order of |correlation with Target|.
features_to_keep = [f for f in target_corr.index if f not in features_to_drop]
print(f"\nKeep these {len(features_to_keep)} features:\n")
for f in features_to_keep:
    print(f" + {f} (target corr: {target_corr[f]:+.4f})")
# =============================================================================
# 7. GENERATE CODE SNIPPET
# =============================================================================
print("\n" + "="*70)
print("CODE SNIPPET FOR YOUR TRAINING SCRIPT")
print("="*70)
print("\n# Copy this to your training script:")
# Sort the names so the generated snippet is identical from run to run;
# converting the set straight to a list yields an arbitrary order.
print(f"columns_to_drop = {sorted(features_to_drop)}")
# =============================================================================
# 8. SAVE ANALYSIS RESULTS
# =============================================================================
# Track what was actually produced so the summary below never lists a file
# that was skipped. The two PNGs are written by the plotting sections earlier
# in the script.
saved_files = ['correlation_with_target.png', 'correlation_matrix.png']

# Save correlation with target
target_corr.to_csv('target_correlations.csv', header=['correlation'])
saved_files.append('target_correlations.csv')

# Save high correlation pairs (only when there is at least one pair)
if len(high_corr_df) > 0:
    high_corr_df.to_csv('redundant_feature_pairs.csv', index=False)
    saved_files.append('redundant_feature_pairs.csv')

# Save recommendations. An explicit encoding keeps the output independent of
# the platform's default locale; the drop list is sorted for a stable order.
with open('feature_selection_recommendations.txt', 'w', encoding='utf-8') as f:
    f.write("FEATURE SELECTION RECOMMENDATIONS\n")
    f.write("="*50 + "\n\n")
    f.write(f"Features to DROP ({len(features_to_drop)}):\n")
    for feat in sorted(features_to_drop):
        f.write(f" - {feat}\n")
    f.write(f"\nFeatures to KEEP ({len(features_to_keep)}):\n")
    for feat in features_to_keep:
        f.write(f" + {feat}\n")
saved_files.append('feature_selection_recommendations.txt')

print("\nFiles saved:")
for number, filename in enumerate(saved_files, 1):
    print(f" {number}. {filename}")