File size: 8,421 Bytes

5da71f2

#!/usr/bin/env python
# coding: utf-8
"""

Feature Correlation Analysis

Helps identify redundant features and features most correlated with Target.

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# =============================================================================
# 1. LOAD DATA
# =============================================================================

# Read the semicolon-delimited dataset.
df = pd.read_csv('data.csv', sep=';')

# Keep only rows whose outcome is not 'Enrolled'. The explicit .copy() makes
# the filtered frame independent of the one read from disk, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[df['Target'] != 'Enrolled'].copy()

# Encode the outcome numerically so it can take part in correlations.
# Labels that are not keys of the mapping become NaN.
df['Target'] = df['Target'].map({'Dropout': 0, 'Graduate': 1})

print(f"Dataset shape: {df.shape}")
# Every column except Target counts as a feature.
print(f"Features: {df.shape[1] - 1}")

# =============================================================================
# 2. CORRELATION WITH TARGET
# =============================================================================

section_rule = "=" * 70
print("\n" + section_rule)
print("CORRELATION WITH TARGET (Dropout=0, Graduate=1)")
print(section_rule)

# Take the Target column of the full correlation matrix, remove Target's
# correlation with itself, and order the rest by absolute value (largest first).
full_corr = df.corr()
target_corr = full_corr['Target'].drop('Target')
target_corr = target_corr.sort_values(key=abs, ascending=False)

print("\nAll features ranked by absolute correlation with Target:\n")
for rank, (name, value) in enumerate(target_corr.items(), start=1):
    # Bucket the feature by the magnitude of its correlation.
    magnitude = abs(value)
    if magnitude > 0.3:
        strength = "STRONG"
    elif magnitude > 0.15:
        strength = "MODERATE"
    else:
        strength = "WEAK"
    print(f"{rank:2d}. {name:50s} {value:+.4f}  [{strength}]")

# Plot correlation with target as a horizontal bar chart
plt.figure(figsize=(12, 10))
# Green bars for positive correlations, red for all others.
colors = ['green' if c > 0 else 'red' for c in target_corr.values]
target_corr.plot(kind='barh', color=colors)
plt.title('Feature Correlation with Target (Graduate=1)')
plt.xlabel('Correlation Coefficient')
plt.axvline(x=0, color='black', linewidth=0.5)
# Dashed guides at +/-0.3; only the positive one carries the legend label.
strong_line = plt.axvline(x=0.3, color='blue', linestyle='--', alpha=0.5, label='Strong threshold')
plt.axvline(x=-0.3, color='blue', linestyle='--', alpha=0.5)
# A label is only rendered once a legend exists. Passing the handle explicitly
# keeps the legend limited to the threshold line.
plt.legend(handles=[strong_line])
plt.tight_layout()
plt.savefig('correlation_with_target.png', dpi=150)
plt.show()

# =============================================================================
# 3. FEATURE-TO-FEATURE CORRELATION (Find Redundant Features)
# =============================================================================

print("\n" + "="*70)
print("HIGHLY CORRELATED FEATURE PAIRS (Potential Redundancy)")
print("="*70)

# Correlation matrix of the features only (Target excluded)
corr_matrix = df.drop('Target', axis=1).corr()

# Collect every unordered feature pair whose absolute correlation reaches
# the threshold. Only the upper triangle is visited, so each pair appears once
# and a feature is never compared with itself.
threshold = 0.7
pair_columns = ['Feature 1', 'Feature 2', 'Correlation']
high_corr_pairs = []

feature_names = corr_matrix.columns
for i, first_name in enumerate(feature_names):
    for j, second_name in enumerate(feature_names[i + 1:], start=i + 1):
        corr_value = corr_matrix.iloc[i, j]
        if abs(corr_value) >= threshold:
            high_corr_pairs.append({
                'Feature 1': first_name,
                'Feature 2': second_name,
                'Correlation': corr_value
            })

# Naming the columns up front keeps the frame well-formed when no pair
# qualifies; without them, sorting by 'Correlation' raises KeyError on an
# empty frame and the "no pairs" message below can never be printed.
high_corr_df = pd.DataFrame(high_corr_pairs, columns=pair_columns)
if not high_corr_df.empty:
    high_corr_df = high_corr_df.sort_values('Correlation', key=abs, ascending=False)

print(f"\nFeature pairs with correlation >= {threshold}:\n")
if not high_corr_df.empty:
    for _, row in high_corr_df.iterrows():
        print(f"  {row['Correlation']:+.4f}  |  {row['Feature 1']}")
        print(f"           |  {row['Feature 2']}")
        print()
else:
    print("  No highly correlated pairs found.")

# =============================================================================
# 4. CORRELATION HEATMAP
# =============================================================================

# Annotated heatmap of the feature-to-feature correlation matrix,
# with the colour scale centred on zero.
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'annot_kws': {'size': 6},
}
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=150)
plt.show()

# =============================================================================
# 5. RECOMMENDATIONS FOR FEATURE SELECTION
# =============================================================================

divider = "=" * 70
print("\n" + divider)
print("FEATURE SELECTION RECOMMENDATIONS")
print(divider)

# 1. Features whose absolute target correlation falls below the weak cut-off
weak_threshold = 0.05
weak_features = target_corr[target_corr.abs() < weak_threshold]

print(f"\n1. WEAK CORRELATION WITH TARGET (|corr| < {weak_threshold}):")
print("   Consider removing these - they may not help prediction:\n")
for name, value in weak_features.items():
    print(f"   - {name}: {value:+.4f}")

# 2. Features whose absolute target correlation reaches the strong cut-off
strong_threshold = 0.2
strong_features = target_corr[target_corr.abs() >= strong_threshold]

print(f"\n2. STRONG CORRELATION WITH TARGET (|corr| >= {strong_threshold}):")
print("   Keep these - they are predictive:\n")
for name, value in strong_features.items():
    print(f"   + {name}: {value:+.4f}")

# 3. For each highly correlated pair, suggest keeping the member that is
#    more correlated with Target; on a tie the first member is kept.
print(f"\n3. REDUNDANT FEATURES (correlated with each other >= {threshold}):")
print("   Consider keeping only one from each pair:\n")
for _, pair in high_corr_df.iterrows():
    first, second = pair['Feature 1'], pair['Feature 2']
    first_strength = abs(target_corr.get(first, 0))
    second_strength = abs(target_corr.get(second, 0))
    if first_strength >= second_strength:
        keep, drop = first, second
    else:
        keep, drop = second, first
    print(f"   KEEP: {keep} (target corr: {target_corr.get(keep, 0):+.4f})")
    print(f"   DROP: {drop} (target corr: {target_corr.get(drop, 0):+.4f})")
    print()

# =============================================================================
# 6. SUGGESTED FEATURES TO DROP
# =============================================================================

print("\n" + "="*70)
print("SUGGESTED FEATURES TO DROP")
print("="*70)

# Start from the features flagged as weakly correlated with Target.
features_to_drop = set(weak_features.index)

# From every highly correlated pair, drop the member that is less correlated
# with Target (a tie drops 'Feature 2'). Every feature that appears in a pair
# is also recorded, so the reasons below need no further passes over the frame.
paired_features = set()
for _, row in high_corr_df.iterrows():
    first, second = row['Feature 1'], row['Feature 2']
    paired_features.update((first, second))
    corr1 = abs(target_corr.get(first, 0))
    corr2 = abs(target_corr.get(second, 0))
    features_to_drop.add(second if corr1 >= corr2 else first)

print(f"\nBased on analysis, consider dropping these {len(features_to_drop)} features:\n")
# A set of strings can iterate in a different order on every run; sorting
# keeps the report reproducible.
for f in sorted(features_to_drop):
    reason = []
    if f in weak_features.index:
        reason.append(f"weak target corr ({target_corr[f]:+.4f})")
    if f in paired_features:
        reason.append("redundant with another feature")
    print(f"  - {f}")
    print(f"    Reason: {', '.join(reason)}")

# Features to keep: everything not suggested for dropping, in ranked order
features_to_keep = [f for f in target_corr.index if f not in features_to_drop]

print(f"\nKeep these {len(features_to_keep)} features:\n")
for f in features_to_keep:
    print(f"  + {f} (target corr: {target_corr[f]:+.4f})")

# =============================================================================
# 7. GENERATE CODE SNIPPET
# =============================================================================

print("\n" + "="*70)
print("CODE SNIPPET FOR YOUR TRAINING SCRIPT")
print("="*70)

print("\n# Copy this to your training script:")
# sorted() turns the collection into a list with a stable order, so the
# printed snippet is identical from run to run.
print(f"columns_to_drop = {sorted(features_to_drop)}")

# =============================================================================
# 8. SAVE ANALYSIS RESULTS
# =============================================================================

# Track what is actually written so the summary below never lists a file
# that was skipped. The two figures were saved earlier in the script.
saved_files = ['correlation_with_target.png', 'correlation_matrix.png']

# Save correlation with target
target_corr.to_csv('target_correlations.csv', header=['correlation'])
saved_files.append('target_correlations.csv')

# Save high correlation pairs (only when there is at least one)
if len(high_corr_df) > 0:
    high_corr_df.to_csv('redundant_feature_pairs.csv', index=False)
    saved_files.append('redundant_feature_pairs.csv')

# Save recommendations; an explicit encoding keeps the file identical
# regardless of the platform's default locale.
with open('feature_selection_recommendations.txt', 'w', encoding='utf-8') as f:
    f.write("FEATURE SELECTION RECOMMENDATIONS\n")
    f.write("="*50 + "\n\n")
    f.write(f"Features to DROP ({len(features_to_drop)}):\n")
    # Sorted so the file content does not depend on set iteration order.
    for feat in sorted(features_to_drop):
        f.write(f"  - {feat}\n")
    f.write(f"\nFeatures to KEEP ({len(features_to_keep)}):\n")
    for feat in features_to_keep:
        f.write(f"  + {feat}\n")
saved_files.append('feature_selection_recommendations.txt')

print("\nFiles saved:")
for number, filename in enumerate(saved_files, 1):
    print(f"  {number}. {filename}")