# Source: TimeFlowPro/correlations/correlation_analyzer.py
# Uploaded by ArabovMK — commit "Update all files" (d8f69a9)
# ============================================
# CLASS 8: CORRELATION AND MULTICOLLINEARITY ANALYSIS
# ============================================
import logging
import os
import traceback
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd

from venv import logger  # noqa: F401 -- accidental auto-import, superseded below
from config.config import Config

# FIX: `from venv import logger` routed all records through the stdlib `venv`
# package's logger. Rebind to a properly named module-level logger.
logger = logging.getLogger(__name__)
class CorrelationAnalyzer:
"""Class for comprehensive correlation and multicollinearity analysis"""
    def __init__(self, config: Config):
        """
        Initialise the analyser

        Parameters:
        -----------
        config : Config
            Experiment configuration
        """
        self.config = config
        # Correlation matrices cached from the last analyze() run, keyed by
        # method name (currently only 'pearson' is produced).
        self.correlation_matrices = {}
        # High-correlation pair lists, keyed the same way ('pearson').
        self.high_correlation_pairs = {}
        # Extra findings such as correlation clusters from the detailed pass.
        self.multicollinearity_info = {}
        # Mapping of feature name -> VIF value from the last VIF computation.
        self.vif_scores = {}
def analyze(
self,
data: pd.DataFrame,
target_col: Optional[str] = None,
threshold: float = 0.8,
detailed: bool = True,
**kwargs
) -> pd.DataFrame:
"""
Analyse correlations in the data
Parameters:
-----------
data : pd.DataFrame
Input data
target_col : str, optional
Target variable
threshold : float
Threshold for identifying high correlations
detailed : bool
Whether to perform detailed analysis
**kwargs : dict
Additional parameters
Returns:
--------
pd.DataFrame
Correlation matrix
"""
logger.info("\n" + "="*80)
logger.info("CORRELATION AND MULTICOLLINEARITY ANALYSIS")
logger.info("="*80)
target_col = target_col or self.config.target_column
try:
# 1. Calculate correlation matrix
corr_matrix = self._compute_correlations(data, target_col)
if corr_matrix.empty:
logger.warning("Correlation matrix is empty")
return pd.DataFrame()
# 2. Identify high correlations
high_correlations = self._detect_high_correlations(corr_matrix, threshold)
self.high_correlation_pairs['pearson'] = high_correlations
# 3. Analyse correlations with target variable
target_correlations = []
if target_col in corr_matrix.columns:
target_correlations = self._get_target_correlations(corr_matrix, target_col)
# 4. Analyse multicollinearity (VIF)
vif_results = self._compute_vif_scores(data)
# 5. Detailed analysis if required
if detailed:
self._detailed_correlation_analysis(data, corr_matrix, target_col)
# 6. Visualisation
if self.config.save_plots:
self._plot_correlation_analysis(data, corr_matrix, target_col, high_correlations, vif_results)
# 7. Output results
self._log_analysis_results(corr_matrix, high_correlations, target_correlations, vif_results)
return corr_matrix
except Exception as e:
logger.error(f"Error in correlation analysis: {e}")
logger.error(traceback.format_exc())
return pd.DataFrame()
def _compute_correlations(
self,
data: pd.DataFrame,
target_col: str
) -> pd.DataFrame:
"""Calculate correlation matrix"""
logger.info("Calculating correlation matrix...")
# Select only numeric columns
numeric_data = data.select_dtypes(include=[np.number])
# Remove constant columns
numeric_data = numeric_data.loc[:, numeric_data.nunique() > 1]
if numeric_data.shape[1] < 2:
logger.warning("Insufficient numeric features for analysis")
return pd.DataFrame()
# Remove missing values
numeric_data_clean = numeric_data.dropna()
if len(numeric_data_clean) < 10:
logger.warning("Insufficient data after cleaning")
return pd.DataFrame()
# Calculate Pearson correlation
try:
corr_matrix = numeric_data_clean.corr(method='pearson')
self.correlation_matrices['pearson'] = corr_matrix
logger.info(f"✓ Correlation matrix calculated: {corr_matrix.shape}")
return corr_matrix
except Exception as e:
logger.error(f"Error calculating correlation: {e}")
return pd.DataFrame()
def _detect_high_correlations(
self,
corr_matrix: pd.DataFrame,
threshold: float = 0.8
) -> List[Dict[str, Any]]:
"""Detect high correlations"""
high_correlations = []
if corr_matrix.empty:
return high_correlations
# Use upper triangle of matrix
upper_triangle = corr_matrix.where(
np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)
# Find pairs with correlation above threshold
for col in upper_triangle.columns:
if col in upper_triangle:
high_corr_series = upper_triangle[col][abs(upper_triangle[col]) > threshold]
for row_idx, correlation in high_corr_series.items():
if not pd.isna(correlation):
high_correlations.append({
'feature1': row_idx,
'feature2': col,
'correlation': float(correlation),
'abs_correlation': abs(float(correlation))
})
# Sort by absolute correlation value
high_correlations.sort(key=lambda x: x['abs_correlation'], reverse=True)
logger.info(f"High correlations detected (> {threshold}): {len(high_correlations)}")
return high_correlations
def _get_target_correlations(
self,
corr_matrix: pd.DataFrame,
target_col: str
) -> List[Dict[str, Any]]:
"""Get correlations with target variable"""
target_correlations = []
if target_col not in corr_matrix.columns:
return target_correlations
# Extract correlations with target variable
target_corr_series = corr_matrix[target_col]
for feature, correlation in target_corr_series.items():
if feature != target_col and not pd.isna(correlation):
target_correlations.append({
'feature': feature,
'correlation': float(correlation),
'abs_correlation': abs(float(correlation)),
'direction': 'positive' if correlation > 0 else 'negative'
})
# Sort by absolute value
target_correlations.sort(key=lambda x: x['abs_correlation'], reverse=True)
logger.info(f"Correlations with target variable calculated: {len(target_correlations)}")
return target_correlations
def _compute_vif_scores(self, data: pd.DataFrame) -> Dict[str, Any]:
"""Calculate VIF (Variance Inflation Factor)"""
logger.info("Analysing multicollinearity (VIF)...")
vif_results = {
'scores': {},
'issues': [],
'summary': {
'critical': 0,
'high': 0,
'medium': 0,
'low': 0
}
}
try:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
# Prepare data
numeric_data = data.select_dtypes(include=[np.number])
numeric_data = numeric_data.loc[:, numeric_data.nunique() > 1]
# Remove missing and infinite values
clean_data = numeric_data.replace([np.inf, -np.inf], np.nan).dropna()
if clean_data.shape[0] < 10 or clean_data.shape[1] < 2:
logger.warning("Insufficient data for VIF analysis")
return vif_results
# Add constant
X = sm.add_constant(clean_data, has_constant='add')
# Calculate VIF for each feature
vif_scores = {}
for i, column in enumerate(X.columns):
if column == 'const':
continue
try:
vif = variance_inflation_factor(X.values, i)
# Handle extreme values
if np.isinf(vif) or vif > 1e6:
vif = 1e6
vif_scores[column] = float(vif)
# Classify by severity
if vif > 100:
vif_results['summary']['critical'] += 1
vif_results['issues'].append({
'feature': column,
'vif': float(vif),
'severity': 'critical',
'recommendation': 'Remove feature'
})
elif vif > 10:
vif_results['summary']['high'] += 1
vif_results['issues'].append({
'feature': column,
'vif': float(vif),
'severity': 'high',
'recommendation': 'Consider removal'
})
elif vif > 5:
vif_results['summary']['medium'] += 1
else:
vif_results['summary']['low'] += 1
except Exception as e:
logger.warning(f"VIF error for {column}: {e}")
vif_scores[column] = np.nan
vif_results['scores'] = vif_scores
self.vif_scores = vif_scores
logger.info(f"✓ VIF analysis completed. Critical features: {vif_results['summary']['critical']}")
except ImportError:
logger.warning("statsmodels not installed, skipping VIF analysis")
except Exception as e:
logger.error(f"VIF analysis error: {e}")
return vif_results
def _detailed_correlation_analysis(
self,
data: pd.DataFrame,
corr_matrix: pd.DataFrame,
target_col: str
) -> None:
"""Detailed correlation analysis"""
# Analyse correlation clusters
if not corr_matrix.empty and corr_matrix.shape[0] > 3:
try:
# Use clustering to group correlated features
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
# Convert correlations to distances
distance_matrix = 1 - abs(corr_matrix)
np.fill_diagonal(distance_matrix.values, 0)
# Clustering
condensed_dist = squareform(distance_matrix)
Z = linkage(condensed_dist, method='average')
# Determine clusters
clusters = fcluster(Z, t=0.5, criterion='distance')
# Group features by cluster
feature_clusters = {}
for idx, cluster_id in enumerate(clusters):
feature = corr_matrix.columns[idx]
if cluster_id not in feature_clusters:
feature_clusters[cluster_id] = []
feature_clusters[cluster_id].append(feature)
# Save cluster information
self.multicollinearity_info['correlation_clusters'] = feature_clusters
logger.info(f"Correlated feature clusters detected: {len(feature_clusters)}")
except Exception as e:
logger.debug(f"Cluster analysis failed: {e}")
def _plot_correlation_analysis(
self,
data: pd.DataFrame,
corr_matrix: pd.DataFrame,
target_col: str,
high_correlations: List[Dict[str, Any]],
vif_results: Dict[str, Any]
) -> None:
"""Visualise correlation analysis"""
try:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Style settings
plt.style.use('seaborn-v0_8-darkgrid')
rcParams.update({
'figure.figsize': (12, 8),
'font.size': 10,
'axes.titlesize': 14,
'axes.labelsize': 12
})
# Create directory
plots_dir = os.path.join(self.config.results_dir, 'plots', 'correlations')
os.makedirs(plots_dir, exist_ok=True)
# 1. Correlation matrix heatmap
if not corr_matrix.empty and corr_matrix.shape[0] > 1:
fig, ax = plt.subplots(figsize=(14, 12))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
corr_matrix,
mask=mask,
annot=True,
fmt='.2f',
cmap='coolwarm',
center=0,
square=True,
linewidths=0.5,
cbar_kws={"shrink": 0.8},
ax=ax
)
ax.set_title('Correlation Matrix (Pearson)', fontweight='bold')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'correlation_matrix.png'),
dpi=150, bbox_inches='tight')
plt.close()
# 2. Target variable correlations
if target_col in corr_matrix.columns:
target_corrs = corr_matrix[target_col].drop(target_col, errors='ignore')
if not target_corrs.empty:
fig, ax = plt.subplots(figsize=(10, 8))
top_corrs = target_corrs.abs().sort_values(ascending=True).tail(20)
colors = ['red' if target_corrs[feat] < 0 else 'blue'
for feat in top_corrs.index]
ax.barh(range(len(top_corrs)), top_corrs.values, color=colors)
ax.set_yticks(range(len(top_corrs)))
ax.set_yticklabels(top_corrs.index)
ax.set_xlabel('Absolute correlation')
ax.set_title(f'Top-20 correlations with {target_col}', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'target_correlations.png'),
dpi=150, bbox_inches='tight')
plt.close()
# 3. VIF scores plot
if vif_results['scores']:
valid_scores = {k: v for k, v in vif_results['scores'].items()
if not pd.isna(v)}
if valid_scores:
fig, ax = plt.subplots(figsize=(12, 8))
sorted_scores = dict(sorted(valid_scores.items(),
key=lambda x: x[1],
reverse=True)[:25])
colors = []
for vif in sorted_scores.values():
if vif > 100:
colors.append('red')
elif vif > 10:
colors.append('orange')
elif vif > 5:
colors.append('yellow')
else:
colors.append('green')
bars = ax.barh(list(sorted_scores.keys()),
list(sorted_scores.values()),
color=colors, edgecolor='black')
ax.set_xlabel('VIF Score')
ax.set_title('VIF Scores (multicollinearity)', fontweight='bold')
ax.axvline(x=5, color='yellow', linestyle='--', alpha=0.7)
ax.axvline(x=10, color='orange', linestyle='--', alpha=0.7)
ax.axvline(x=100, color='red', linestyle='--', alpha=0.7)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'vif_scores.png'),
dpi=150, bbox_inches='tight')
plt.close()
# 4. High correlations plot
if high_correlations:
fig, ax = plt.subplots(figsize=(12, 8))
# Limit number for display
display_corrs = high_correlations[:15]
# Create labels for feature pairs
labels = [f"{corr['feature1']}{corr['feature2']}"
for corr in display_corrs]
values = [corr['correlation'] for corr in display_corrs]
colors = ['red' if v < 0 else 'blue' for v in values]
y_pos = np.arange(len(display_corrs))
ax.barh(y_pos, values, color=colors)
ax.set_yticks(y_pos)
ax.set_yticklabels(labels, fontsize=9)
ax.invert_yaxis()
ax.set_xlabel('Correlation')
ax.set_title('High correlations (> 0.8)', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'high_correlations.png'),
dpi=150, bbox_inches='tight')
plt.close()
logger.info(f"Visualisations saved to {plots_dir}")
except Exception as e:
logger.warning(f"Error creating visualisations: {e}")
def _log_analysis_results(
self,
corr_matrix: pd.DataFrame,
high_correlations: List[Dict[str, Any]],
target_correlations: List[Dict[str, Any]],
vif_results: Dict[str, Any]
) -> None:
"""Log analysis results"""
logger.info("\n" + "="*80)
logger.info("CORRELATION AND MULTICOLLINEARITY ANALYSIS REPORT")
logger.info("="*80)
# General information
logger.info(f"\n📊 GENERAL INFORMATION:")
logger.info(f" Correlation matrix size: {corr_matrix.shape}")
logger.info(f" Total features: {len(corr_matrix.columns)}")
# High correlations
if high_correlations:
logger.info(f"\n⚠ HIGH CORRELATIONS (|r| > 0.8): {len(high_correlations)}")
logger.info(" " + "-" * 60)
for i, corr in enumerate(high_correlations[:10]):
sign = "🟥" if corr['correlation'] < 0 else "🟩"
logger.info(f" {i+1:2d}. {sign} {corr['feature1']:25s}{corr['feature2']:25s}: {corr['correlation']:7.4f}")
if len(high_correlations) > 10:
logger.info(f" ... and {len(high_correlations) - 10} more pairs")
# Target variable correlations
if target_correlations:
logger.info(f"\n🎯 CORRELATIONS WITH TARGET VARIABLE:")
logger.info(" " + "-" * 60)
for i, corr in enumerate(target_correlations[:10]):
direction = "↓" if corr['correlation'] < 0 else "↑"
logger.info(f" {i+1:2d}. {direction} {corr['feature']:35s}: {corr['correlation']:7.4f}")
# Multicollinearity analysis
if vif_results['scores']:
logger.info(f"\n📈 MULTICOLLINEARITY ANALYSIS (VIF):")
logger.info(" " + "-" * 60)
logger.info(f" Critical (VIF > 100): {vif_results['summary']['critical']}")
logger.info(f" High (10 < VIF ≤ 100): {vif_results['summary']['high']}")
logger.info(f" Medium (5 < VIF ≤ 10): {vif_results['summary']['medium']}")
logger.info(f" Low (VIF ≤ 5): {vif_results['summary']['low']}")
# Top problematic features
if vif_results['issues']:
logger.info(f"\n🔴 PROBLEMATIC FEATURES (VIF > 10):")
for issue in vif_results['issues'][:10]:
logger.info(f" • {issue['feature']:35s}: VIF = {issue['vif']:7.1f} ({issue['severity']})")
logger.info("\n" + "="*80)
logger.info("RECOMMENDATIONS:")
logger.info("="*80)
# Generate recommendations
recommendations = []
if len(high_correlations) > 20:
recommendations.append("1. Remove highly correlated features (correlation method)")
if vif_results['summary']['critical'] > 0:
recommendations.append("2. Remove features with critical VIF (>100)")
if vif_results['summary']['high'] > 5:
recommendations.append("3. Consider removing features with VIF > 10")
if not recommendations:
recommendations.append("1. Data in good condition, no serious issues detected")
recommendations.append("2. Proceed to modelling")
for i, rec in enumerate(recommendations, 1):
logger.info(f" {rec}")
logger.info("\n" + "="*80)
def remove_highly_correlated(
self,
data: pd.DataFrame,
threshold: float = 0.85,
method: str = 'variance',
keep_target: bool = True,
keep_features: List[str] = None
) -> pd.DataFrame:
"""
Remove highly correlated features
Parameters:
-----------
data : pd.DataFrame
Source data
threshold : float
Correlation threshold for removal
method : str
Feature selection method for removal: 'variance', 'random', 'importance'
keep_target : bool
Whether to keep target variable
keep_features : List[str], optional
Features to keep
Returns:
--------
pd.DataFrame
Data after removing highly correlated features
"""
logger.info("\n" + "="*80)
logger.info("REMOVING HIGHLY CORRELATED FEATURES")
logger.info("="*80)
data_clean = data.copy()
if 'pearson' not in self.correlation_matrices:
logger.warning("Correlation matrix not calculated, run analyze() first")
return data_clean
corr_matrix = self.correlation_matrices['pearson']
# Features to keep
features_to_keep = set()
if keep_target and self.config.target_column in data_clean.columns:
features_to_keep.add(self.config.target_column)
if keep_features:
for feat in keep_features:
if feat in data_clean.columns:
features_to_keep.add(feat)
# Temporal features (usually important for time series)
temporal_patterns = ['year', 'month', 'day', 'week', 'quarter',
'hour', 'minute', 'second', 'sin', 'cos']
for col in data_clean.columns:
if any(pattern in col.lower() for pattern in temporal_patterns):
features_to_keep.add(col)
# Find highly correlated pairs
upper_triangle = corr_matrix.where(
np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)
# Collect highly correlated features
correlated_features = set()
for col in upper_triangle.columns:
if col in features_to_keep:
continue
high_corr = upper_triangle[col][abs(upper_triangle[col]) > threshold]
for row_idx, corr_value in high_corr.items():
if not pd.isna(corr_value) and row_idx not in features_to_keep:
# Select which feature to remove
if method == 'variance':
# Remove the one with lower variance
var_col = data_clean[col].var()
var_row = data_clean[row_idx].var()
feature_to_remove = col if var_col < var_row else row_idx
elif method == 'importance':
# Remove the one with lower correlation to target variable
if self.config.target_column in corr_matrix.columns:
corr_col_target = abs(corr_matrix.loc[col, self.config.target_column])
corr_row_target = abs(corr_matrix.loc[row_idx, self.config.target_column])
feature_to_remove = col if corr_col_target < corr_row_target else row_idx
else:
# If no target, remove randomly
feature_to_remove = np.random.choice([col, row_idx])
else:
# Remove randomly
feature_to_remove = np.random.choice([col, row_idx])
correlated_features.add(feature_to_remove)
# Remove features
features_to_remove = list(correlated_features)
if features_to_remove:
data_clean = data_clean.drop(columns=features_to_remove)
logger.info(f"\n📊 REMOVAL RESULTS:")
logger.info(f" Initial feature count: {len(data.columns)}")
logger.info(f" Features removed: {len(features_to_remove)}")
logger.info(f" Final feature count: {len(data_clean.columns)}")
logger.info(f" Retained: {len(data_clean.columns)/len(data.columns)*100:.1f}%")
if features_to_remove:
logger.info(f"\n🗑️ REMOVED FEATURES:")
for i, feat in enumerate(sorted(features_to_remove)[:20]):
logger.info(f" {i+1:2d}. {feat}")
if len(features_to_remove) > 20:
logger.info(f" ... and {len(features_to_remove) - 20} more features")
else:
logger.info("✓ No highly correlated features detected, all features retained")
logger.info("="*80)
return data_clean
def get_report(self) -> Dict[str, Any]:
"""Get analysis report"""
report = {
"correlation_matrix_shape": None,
"high_correlation_count": 0,
"vif_summary": {},
"target_correlation_count": 0
}
if 'pearson' in self.correlation_matrices:
report["correlation_matrix_shape"] = self.correlation_matrices['pearson'].shape
if 'pearson' in self.high_correlation_pairs:
report["high_correlation_count"] = len(self.high_correlation_pairs['pearson'])
if self.vif_scores:
report["vif_summary"] = self.vif_scores.get('summary', {})
return report