Add files using upload-large-folder tool

5da71f2 verified about 1 month ago

8.42 kB

	#!/usr/bin/env python
	# coding: utf-8
	"""
	Feature Correlation Analysis
	Helps identify redundant features and features most correlated with Target.
	"""

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns

	# =============================================================================
	# 1. LOAD DATA
	# =============================================================================

	# Read the semicolon-separated CSV, discard rows whose Target is 'Enrolled',
	# and encode the remaining labels numerically (Dropout -> 0, Graduate -> 1).
	# Built as a single chain so the column is replaced on a fresh frame.
	label_codes = {'Dropout': 0, 'Graduate': 1}
	df = (
	    pd.read_csv('data.csv', sep=';')
	    .loc[lambda frame: frame['Target'] != 'Enrolled']
	    .assign(Target=lambda frame: frame['Target'].map(label_codes))
	)

	# Every column except Target is counted as a feature.
	frame_shape = df.shape
	print(f"Dataset shape: {frame_shape}")
	print(f"Features: {frame_shape[1] - 1}")

	# =============================================================================
	# 2. CORRELATION WITH TARGET
	# =============================================================================

	print("\n" + "="*70)
	print("CORRELATION WITH TARGET (Dropout=0, Graduate=1)")
	print("="*70)

	# Correlation of every column with Target, ranked by absolute value
	# (largest magnitude first). Target's correlation with itself is dropped.
	target_corr = df.corr()['Target'].drop('Target').sort_values(key=abs, ascending=False)

	print("\nAll features ranked by absolute correlation with Target:\n")
	for i, (feature, corr) in enumerate(target_corr.items(), 1):
	    # Magnitude buckets: > 0.3 is STRONG, > 0.15 is MODERATE, anything else WEAK.
	    strength = "STRONG" if abs(corr) > 0.3 else "MODERATE" if abs(corr) > 0.15 else "WEAK"
	    print(f"{i:2d}. {feature:50s} {corr:+.4f} [{strength}]")

	# Horizontal bar chart of the ranked correlations:
	# green bars for positive values, red for zero or negative.
	plt.figure(figsize=(12, 10))
	colors = ['green' if c > 0 else 'red' for c in target_corr.values]
	target_corr.plot(kind='barh', color=colors)
	plt.title('Feature Correlation with Target (Graduate=1)')
	plt.xlabel('Correlation Coefficient')
	plt.axvline(x=0, color='black', linewidth=0.5)
	# Dashed guides at +/-0.3, the same cut-off used for the STRONG label above.
	plt.axvline(x=0.3, color='blue', linestyle='--', alpha=0.5, label='Strong threshold')
	plt.axvline(x=-0.3, color='blue', linestyle='--', alpha=0.5)
	# Without a legend the 'Strong threshold' label is never drawn.
	plt.legend()
	plt.tight_layout()
	plt.savefig('correlation_with_target.png', dpi=150)
	plt.show()

	# =============================================================================
	# 3. FEATURE-TO-FEATURE CORRELATION (Find Redundant Features)
	# =============================================================================

	print("\n" + "="*70)
	print("HIGHLY CORRELATED FEATURE PAIRS (Potential Redundancy)")
	print("="*70)

	# Pairwise correlation between features only (Target excluded).
	corr_matrix = df.drop('Target', axis=1).corr()

	# Collect every unordered feature pair whose |correlation| reaches the threshold.
	high_corr_pairs = []
	threshold = 0.7

	feature_names = corr_matrix.columns
	for i, first_name in enumerate(feature_names):
	    # Start at i + 1 so only the upper triangle is visited:
	    # each pair is seen once and the diagonal is skipped.
	    for j in range(i + 1, len(feature_names)):
	        corr_value = corr_matrix.iloc[i, j]
	        if abs(corr_value) >= threshold:
	            high_corr_pairs.append({
	                'Feature 1': first_name,
	                'Feature 2': feature_names[j],
	                'Correlation': corr_value,
	            })

	# Name the columns explicitly so the frame has a 'Correlation' column even
	# when no pair qualified; otherwise sorting an empty, column-less frame
	# raises KeyError and the "no pairs" message below can never be reached.
	high_corr_df = pd.DataFrame(
	    high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation']
	)
	if not high_corr_df.empty:
	    high_corr_df = high_corr_df.sort_values('Correlation', key=abs, ascending=False)

	print(f"\nFeature pairs with correlation >= {threshold}:\n")
	if not high_corr_df.empty:
	    for _, row in high_corr_df.iterrows():
	        print(f" {row['Correlation']:+.4f} | {row['Feature 1']}")
	        print(f" | {row['Feature 2']}")
	        print()
	else:
	    print(" No highly correlated pairs found.")

	# =============================================================================
	# 4. CORRELATION HEATMAP
	# =============================================================================

	# Draw the feature-to-feature correlation matrix as an annotated heatmap,
	# save it as a PNG, then display it.
	heatmap_options = {
	    'annot': True,
	    'fmt': '.2f',
	    'cmap': 'coolwarm',
	    'center': 0,
	    'square': True,
	    'linewidths': 0.5,
	    'annot_kws': {'size': 6},
	}

	plt.figure(figsize=(20, 16))
	sns.heatmap(corr_matrix, **heatmap_options)
	plt.title('Feature Correlation Matrix')
	plt.tight_layout()
	plt.savefig('correlation_matrix.png', dpi=150)
	plt.show()

	# =============================================================================
	# 5. RECOMMENDATIONS FOR FEATURE SELECTION
	# =============================================================================

	print("\n" + "="*70)
	print("FEATURE SELECTION RECOMMENDATIONS")
	print("="*70)

	# Features whose |correlation with Target| is below the weak threshold
	# (candidates for removal).
	weak_threshold = 0.05
	weak_features = target_corr[abs(target_corr) < weak_threshold]

	print(f"\n1. WEAK CORRELATION WITH TARGET (|corr| < {weak_threshold}):")
	print(" Consider removing these - they may not help prediction:\n")
	for feature, corr in weak_features.items():
	    print(f" - {feature}: {corr:+.4f}")

	# Features whose |correlation with Target| reaches the strong threshold
	# (candidates to keep).
	strong_threshold = 0.2
	strong_features = target_corr[abs(target_corr) >= strong_threshold]

	print(f"\n2. STRONG CORRELATION WITH TARGET (|corr| >= {strong_threshold}):")
	print(" Keep these - they are predictive:\n")
	for feature, corr in strong_features.items():
	    print(f" + {feature}: {corr:+.4f}")

	# Highly inter-correlated feature pairs: suggest keeping one member of each.
	print(f"\n3. REDUNDANT FEATURES (correlated with each other >= {threshold}):")
	print(" Consider keeping only one from each pair:\n")
	for _, row in high_corr_df.iterrows():
	    # Keep the member more correlated with Target (by magnitude);
	    # on a tie 'Feature 1' is kept. Missing entries count as 0.
	    corr1 = abs(target_corr.get(row['Feature 1'], 0))
	    corr2 = abs(target_corr.get(row['Feature 2'], 0))
	    if corr1 >= corr2:
	        keep, drop = row['Feature 1'], row['Feature 2']
	    else:
	        keep, drop = row['Feature 2'], row['Feature 1']
	    print(f" KEEP: {keep} (target corr: {target_corr.get(keep, 0):+.4f})")
	    print(f" DROP: {drop} (target corr: {target_corr.get(drop, 0):+.4f})")
	    print()

	# =============================================================================
	# 6. SUGGESTED FEATURES TO DROP
	# =============================================================================

	print("\n" + "="*70)
	print("SUGGESTED FEATURES TO DROP")
	print("="*70)

	# Start from the weakly correlated features.
	drop_candidates = set(weak_features.index)

	# For each redundant pair, drop the member less correlated with Target
	# (ties drop 'Feature 2'). Every feature that appears in any pair is also
	# remembered, so the reason lookup below does not re-scan the frame for
	# each feature.
	paired_features = set()
	for _, row in high_corr_df.iterrows():
	    first, second = row['Feature 1'], row['Feature 2']
	    paired_features.update((first, second))
	    corr1 = abs(target_corr.get(first, 0))
	    corr2 = abs(target_corr.get(second, 0))
	    drop = second if corr1 >= corr2 else first
	    drop_candidates.add(drop)

	# Set iteration order changes between runs; a sorted list keeps the report
	# (and anything later derived from features_to_drop) reproducible.
	features_to_drop = sorted(drop_candidates)

	print(f"\nBased on analysis, consider dropping these {len(features_to_drop)} features:\n")
	for f in features_to_drop:
	    reason = []
	    if f in weak_features.index:
	        reason.append(f"weak target corr ({target_corr[f]:+.4f})")
	    if f in paired_features:
	        reason.append("redundant with another feature")
	    print(f" - {f}")
	    print(f" Reason: {', '.join(reason)}")

	# Everything not suggested for removal, in ranked-correlation order.
	features_to_keep = [f for f in target_corr.index if f not in drop_candidates]

	print(f"\nKeep these {len(features_to_keep)} features:\n")
	for f in features_to_keep:
	    print(f" + {f} (target corr: {target_corr[f]:+.4f})")

	# =============================================================================
	# 7. GENERATE CODE SNIPPET
	# =============================================================================

	print("\n" + "="*70)
	print("CODE SNIPPET FOR YOUR TRAINING SCRIPT")
	print("="*70)

	# Print a ready-to-paste assignment listing the suggested drops.
	# sorted() keeps the emitted list identical from run to run even when
	# features_to_drop is an unordered set.
	print("\n# Copy this to your training script:")
	print(f"columns_to_drop = {sorted(features_to_drop)}")

	# =============================================================================
	# 8. SAVE ANALYSIS RESULTS
	# =============================================================================

	# Track what is actually on disk so the summary below never lists a file
	# that was skipped. The two PNGs are written by the plotting code in
	# sections 2 and 4.
	saved_files = ['correlation_with_target.png', 'correlation_matrix.png']

	# Ranked correlations with Target, one row per feature.
	target_corr.to_csv('target_correlations.csv', header=['correlation'])
	saved_files.append('target_correlations.csv')

	# The redundant-pair table is only written when at least one pair was found.
	if len(high_corr_df) > 0:
	    high_corr_df.to_csv('redundant_feature_pairs.csv', index=False)
	    saved_files.append('redundant_feature_pairs.csv')

	# Plain-text summary of the drop / keep recommendations.
	# An explicit encoding keeps the file independent of the platform default;
	# the drop list is sorted so the file is reproducible between runs.
	with open('feature_selection_recommendations.txt', 'w', encoding='utf-8') as f:
	    f.write("FEATURE SELECTION RECOMMENDATIONS\n")
	    f.write("="*50 + "\n\n")
	    f.write(f"Features to DROP ({len(features_to_drop)}):\n")
	    for feat in sorted(features_to_drop):
	        f.write(f" - {feat}\n")
	    f.write(f"\nFeatures to KEEP ({len(features_to_keep)}):\n")
	    for feat in features_to_keep:
	        f.write(f" + {feat}\n")
	saved_files.append('feature_selection_recommendations.txt')

	print("\nFiles saved:")
	for position, filename in enumerate(saved_files, 1):
	    print(f" {position}. {filename}")