File size: 27,992 Bytes
d8f69a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
# ============================================
# CLASS 8: CORRELATION AND MULTICOLLINEARITY ANALYSIS
# ============================================
import os
import traceback
from typing import Any, Dict, List, Optional
from venv import logger

from config.config import Config
import numpy as np
import pandas as pd

class CorrelationAnalyzer:
    """Class for comprehensive correlation and multicollinearity analysis"""
    
    def __init__(self, config: Config):
        """
        Initialise the analyser
        
        Parameters:
        -----------
        config : Config
            Experiment configuration
        """
        self.config = config
        self.correlation_matrices = {}
        self.high_correlation_pairs = {}
        self.multicollinearity_info = {}
        self.vif_scores = {}
    
    def analyze(
        self, 
        data: pd.DataFrame, 
        target_col: Optional[str] = None,
        threshold: float = 0.8,
        detailed: bool = True,
        **kwargs
    ) -> pd.DataFrame:
        """
        Analyse correlations in the data
        
        Parameters:
        -----------
        data : pd.DataFrame
            Input data
        target_col : str, optional
            Target variable
        threshold : float
            Threshold for identifying high correlations
        detailed : bool
            Whether to perform detailed analysis
        **kwargs : dict
            Additional parameters
        
        Returns:
        --------
        pd.DataFrame
            Correlation matrix
        """
        logger.info("\n" + "="*80)
        logger.info("CORRELATION AND MULTICOLLINEARITY ANALYSIS")
        logger.info("="*80)
        
        target_col = target_col or self.config.target_column
        
        try:
            # 1. Calculate correlation matrix
            corr_matrix = self._compute_correlations(data, target_col)
            
            if corr_matrix.empty:
                logger.warning("Correlation matrix is empty")
                return pd.DataFrame()
            
            # 2. Identify high correlations
            high_correlations = self._detect_high_correlations(corr_matrix, threshold)
            self.high_correlation_pairs['pearson'] = high_correlations
            
            # 3. Analyse correlations with target variable
            target_correlations = []
            if target_col in corr_matrix.columns:
                target_correlations = self._get_target_correlations(corr_matrix, target_col)
            
            # 4. Analyse multicollinearity (VIF)
            vif_results = self._compute_vif_scores(data)
            
            # 5. Detailed analysis if required
            if detailed:
                self._detailed_correlation_analysis(data, corr_matrix, target_col)
            
            # 6. Visualisation
            if self.config.save_plots:
                self._plot_correlation_analysis(data, corr_matrix, target_col, high_correlations, vif_results)
            
            # 7. Output results
            self._log_analysis_results(corr_matrix, high_correlations, target_correlations, vif_results)
            
            return corr_matrix
            
        except Exception as e:
            logger.error(f"Error in correlation analysis: {e}")
            logger.error(traceback.format_exc())
            return pd.DataFrame()
    
    def _compute_correlations(
        self, 
        data: pd.DataFrame, 
        target_col: str
    ) -> pd.DataFrame:
        """Calculate correlation matrix"""
        logger.info("Calculating correlation matrix...")
        
        # Select only numeric columns
        numeric_data = data.select_dtypes(include=[np.number])
        
        # Remove constant columns
        numeric_data = numeric_data.loc[:, numeric_data.nunique() > 1]
        
        if numeric_data.shape[1] < 2:
            logger.warning("Insufficient numeric features for analysis")
            return pd.DataFrame()
        
        # Remove missing values
        numeric_data_clean = numeric_data.dropna()
        
        if len(numeric_data_clean) < 10:
            logger.warning("Insufficient data after cleaning")
            return pd.DataFrame()
        
        # Calculate Pearson correlation
        try:
            corr_matrix = numeric_data_clean.corr(method='pearson')
            self.correlation_matrices['pearson'] = corr_matrix
            logger.info(f"βœ“ Correlation matrix calculated: {corr_matrix.shape}")
            return corr_matrix
        except Exception as e:
            logger.error(f"Error calculating correlation: {e}")
            return pd.DataFrame()
    
    def _detect_high_correlations(
        self, 
        corr_matrix: pd.DataFrame, 
        threshold: float = 0.8
    ) -> List[Dict[str, Any]]:
        """Detect high correlations"""
        high_correlations = []
        
        if corr_matrix.empty:
            return high_correlations
        
        # Use upper triangle of matrix
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        
        # Find pairs with correlation above threshold
        for col in upper_triangle.columns:
            if col in upper_triangle:
                high_corr_series = upper_triangle[col][abs(upper_triangle[col]) > threshold]
                
                for row_idx, correlation in high_corr_series.items():
                    if not pd.isna(correlation):
                        high_correlations.append({
                            'feature1': row_idx,
                            'feature2': col,
                            'correlation': float(correlation),
                            'abs_correlation': abs(float(correlation))
                        })
        
        # Sort by absolute correlation value
        high_correlations.sort(key=lambda x: x['abs_correlation'], reverse=True)
        
        logger.info(f"High correlations detected (> {threshold}): {len(high_correlations)}")
        return high_correlations
    
    def _get_target_correlations(
        self, 
        corr_matrix: pd.DataFrame, 
        target_col: str
    ) -> List[Dict[str, Any]]:
        """Get correlations with target variable"""
        target_correlations = []
        
        if target_col not in corr_matrix.columns:
            return target_correlations
        
        # Extract correlations with target variable
        target_corr_series = corr_matrix[target_col]
        
        for feature, correlation in target_corr_series.items():
            if feature != target_col and not pd.isna(correlation):
                target_correlations.append({
                    'feature': feature,
                    'correlation': float(correlation),
                    'abs_correlation': abs(float(correlation)),
                    'direction': 'positive' if correlation > 0 else 'negative'
                })
        
        # Sort by absolute value
        target_correlations.sort(key=lambda x: x['abs_correlation'], reverse=True)
        
        logger.info(f"Correlations with target variable calculated: {len(target_correlations)}")
        return target_correlations
    
    def _compute_vif_scores(self, data: pd.DataFrame) -> Dict[str, Any]:
        """Calculate VIF (Variance Inflation Factor)"""
        logger.info("Analysing multicollinearity (VIF)...")
        
        vif_results = {
            'scores': {},
            'issues': [],
            'summary': {
                'critical': 0,
                'high': 0,
                'medium': 0,
                'low': 0
            }
        }
        
        try:
            from statsmodels.stats.outliers_influence import variance_inflation_factor
            import statsmodels.api as sm
            
            # Prepare data
            numeric_data = data.select_dtypes(include=[np.number])
            numeric_data = numeric_data.loc[:, numeric_data.nunique() > 1]
            
            # Remove missing and infinite values
            clean_data = numeric_data.replace([np.inf, -np.inf], np.nan).dropna()
            
            if clean_data.shape[0] < 10 or clean_data.shape[1] < 2:
                logger.warning("Insufficient data for VIF analysis")
                return vif_results
            
            # Add constant
            X = sm.add_constant(clean_data, has_constant='add')
            
            # Calculate VIF for each feature
            vif_scores = {}
            for i, column in enumerate(X.columns):
                if column == 'const':
                    continue
                    
                try:
                    vif = variance_inflation_factor(X.values, i)
                    
                    # Handle extreme values
                    if np.isinf(vif) or vif > 1e6:
                        vif = 1e6
                    
                    vif_scores[column] = float(vif)
                    
                    # Classify by severity
                    if vif > 100:
                        vif_results['summary']['critical'] += 1
                        vif_results['issues'].append({
                            'feature': column,
                            'vif': float(vif),
                            'severity': 'critical',
                            'recommendation': 'Remove feature'
                        })
                    elif vif > 10:
                        vif_results['summary']['high'] += 1
                        vif_results['issues'].append({
                            'feature': column,
                            'vif': float(vif),
                            'severity': 'high',
                            'recommendation': 'Consider removal'
                        })
                    elif vif > 5:
                        vif_results['summary']['medium'] += 1
                    else:
                        vif_results['summary']['low'] += 1
                        
                except Exception as e:
                    logger.warning(f"VIF error for {column}: {e}")
                    vif_scores[column] = np.nan
            
            vif_results['scores'] = vif_scores
            self.vif_scores = vif_scores
            
            logger.info(f"βœ“ VIF analysis completed. Critical features: {vif_results['summary']['critical']}")
            
        except ImportError:
            logger.warning("statsmodels not installed, skipping VIF analysis")
        except Exception as e:
            logger.error(f"VIF analysis error: {e}")
        
        return vif_results
    
    def _detailed_correlation_analysis(
        self, 
        data: pd.DataFrame,
        corr_matrix: pd.DataFrame,
        target_col: str
    ) -> None:
        """Detailed correlation analysis"""
        # Analyse correlation clusters
        if not corr_matrix.empty and corr_matrix.shape[0] > 3:
            try:
                # Use clustering to group correlated features
                from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
                from scipy.spatial.distance import squareform
                
                # Convert correlations to distances
                distance_matrix = 1 - abs(corr_matrix)
                np.fill_diagonal(distance_matrix.values, 0)
                
                # Clustering
                condensed_dist = squareform(distance_matrix)
                Z = linkage(condensed_dist, method='average')
                
                # Determine clusters
                clusters = fcluster(Z, t=0.5, criterion='distance')
                
                # Group features by cluster
                feature_clusters = {}
                for idx, cluster_id in enumerate(clusters):
                    feature = corr_matrix.columns[idx]
                    if cluster_id not in feature_clusters:
                        feature_clusters[cluster_id] = []
                    feature_clusters[cluster_id].append(feature)
                
                # Save cluster information
                self.multicollinearity_info['correlation_clusters'] = feature_clusters
                logger.info(f"Correlated feature clusters detected: {len(feature_clusters)}")
                
            except Exception as e:
                logger.debug(f"Cluster analysis failed: {e}")
    
    def _plot_correlation_analysis(
        self,
        data: pd.DataFrame,
        corr_matrix: pd.DataFrame,
        target_col: str,
        high_correlations: List[Dict[str, Any]],
        vif_results: Dict[str, Any]
    ) -> None:
        """
        Save the correlation diagnostics as PNG charts

        Up to four files are written to
        ``<config.results_dir>/plots/correlations``:
        ``correlation_matrix.png``, ``target_correlations.png``,
        ``vif_scores.png`` and ``high_correlations.png``. A chart is skipped
        when its input is empty. Any failure, including matplotlib or seaborn
        being unavailable, is logged as a warning and not raised.

        The style sheet and rcParams overrides are applied through context
        managers, so the process-wide matplotlib configuration is restored on
        exit, and every figure is closed even when drawing or saving fails.

        Parameters:
        -----------
        data : pd.DataFrame
            Not used by this method; kept for signature compatibility
        corr_matrix : pd.DataFrame
            Correlation matrix (heatmap and target chart)
        target_col : str
            Target variable whose correlations are charted, if present
        high_correlations : List[Dict[str, Any]]
            Pairs with keys ``feature1``, ``feature2`` and ``correlation``
        vif_results : Dict[str, Any]
            Mapping whose ``scores`` entry maps feature -> VIF
        """
        try:
            import matplotlib.pyplot as plt
            import seaborn as sns

            # Create directory
            plots_dir = os.path.join(self.config.results_dir, 'plots', 'correlations')
            os.makedirs(plots_dir, exist_ok=True)

            rc_overrides = {
                'figure.figsize': (12, 8),
                'font.size': 10,
                'axes.titlesize': 14,
                'axes.labelsize': 12
            }

            def save(fig, filename):
                """Lay out *fig* and write it into the plots directory."""
                fig.tight_layout()
                fig.savefig(os.path.join(plots_dir, filename),
                            dpi=150, bbox_inches='tight')

            # Scoped styling: plt.style.use / rcParams.update would leave the
            # changes in place for every later plot made by the process.
            with plt.style.context('seaborn-v0_8-darkgrid'), plt.rc_context(rc_overrides):

                # 1. Correlation matrix heatmap
                if not corr_matrix.empty and corr_matrix.shape[0] > 1:
                    fig, ax = plt.subplots(figsize=(14, 12))
                    try:
                        # Hide the upper triangle (and diagonal): it mirrors the lower one
                        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
                        sns.heatmap(
                            corr_matrix,
                            mask=mask,
                            annot=True,
                            fmt='.2f',
                            cmap='coolwarm',
                            center=0,
                            square=True,
                            linewidths=0.5,
                            cbar_kws={"shrink": 0.8},
                            ax=ax
                        )
                        ax.set_title('Correlation Matrix (Pearson)', fontweight='bold')
                        save(fig, 'correlation_matrix.png')
                    finally:
                        plt.close(fig)

                # 2. Target variable correlations
                if target_col in corr_matrix.columns:
                    target_corrs = corr_matrix[target_col].drop(target_col, errors='ignore')
                    if not target_corrs.empty:
                        fig, ax = plt.subplots(figsize=(10, 8))
                        try:
                            # Bars show magnitude; colour carries the sign
                            top_corrs = target_corrs.abs().sort_values(ascending=True).tail(20)
                            colors = ['red' if target_corrs[feat] < 0 else 'blue'
                                      for feat in top_corrs.index]

                            ax.barh(range(len(top_corrs)), top_corrs.values, color=colors)
                            ax.set_yticks(range(len(top_corrs)))
                            ax.set_yticklabels(top_corrs.index)
                            ax.set_xlabel('Absolute correlation')
                            ax.set_title(f'Top-20 correlations with {target_col}', fontweight='bold')
                            ax.grid(True, alpha=0.3, axis='x')
                            save(fig, 'target_correlations.png')
                        finally:
                            plt.close(fig)

                # 3. VIF scores plot
                if vif_results['scores']:
                    valid_scores = {k: v for k, v in vif_results['scores'].items()
                                    if not pd.isna(v)}
                    if valid_scores:
                        fig, ax = plt.subplots(figsize=(12, 8))
                        try:
                            # 25 largest scores, highest first
                            sorted_scores = dict(sorted(valid_scores.items(),
                                                        key=lambda x: x[1],
                                                        reverse=True)[:25])

                            colors = []
                            for vif in sorted_scores.values():
                                if vif > 100:
                                    colors.append('red')
                                elif vif > 10:
                                    colors.append('orange')
                                elif vif > 5:
                                    colors.append('yellow')
                                else:
                                    colors.append('green')

                            ax.barh(list(sorted_scores.keys()),
                                    list(sorted_scores.values()),
                                    color=colors, edgecolor='black')

                            ax.set_xlabel('VIF Score')
                            ax.set_title('VIF Scores (multicollinearity)', fontweight='bold')
                            # Reference lines at the severity thresholds
                            ax.axvline(x=5, color='yellow', linestyle='--', alpha=0.7)
                            ax.axvline(x=10, color='orange', linestyle='--', alpha=0.7)
                            ax.axvline(x=100, color='red', linestyle='--', alpha=0.7)
                            ax.grid(True, alpha=0.3, axis='x')
                            save(fig, 'vif_scores.png')
                        finally:
                            plt.close(fig)

                # 4. High correlations plot
                if high_correlations:
                    fig, ax = plt.subplots(figsize=(12, 8))
                    try:
                        # Limit number for display
                        display_corrs = high_correlations[:15]

                        # Create labels for feature pairs
                        labels = [f"{corr['feature1']} ↔ {corr['feature2']}"
                                  for corr in display_corrs]
                        values = [corr['correlation'] for corr in display_corrs]
                        colors = ['red' if v < 0 else 'blue' for v in values]

                        y_pos = np.arange(len(display_corrs))
                        ax.barh(y_pos, values, color=colors)
                        ax.set_yticks(y_pos)
                        ax.set_yticklabels(labels, fontsize=9)
                        ax.invert_yaxis()
                        ax.set_xlabel('Correlation')
                        ax.set_title('High correlations (> 0.8)', fontweight='bold')
                        ax.grid(True, alpha=0.3, axis='x')
                        save(fig, 'high_correlations.png')
                    finally:
                        plt.close(fig)

            logger.info(f"Visualisations saved to {plots_dir}")

        except Exception as e:
            logger.warning(f"Error creating visualisations: {e}")
    
    def _log_analysis_results(
        self,
        corr_matrix: pd.DataFrame,
        high_correlations: List[Dict[str, Any]],
        target_correlations: List[Dict[str, Any]],
        vif_results: Dict[str, Any]
    ) -> None:
        """Log analysis results"""
        logger.info("\n" + "="*80)
        logger.info("CORRELATION AND MULTICOLLINEARITY ANALYSIS REPORT")
        logger.info("="*80)
        
        # General information
        logger.info(f"\nπŸ“Š GENERAL INFORMATION:")
        logger.info(f"   Correlation matrix size: {corr_matrix.shape}")
        logger.info(f"   Total features: {len(corr_matrix.columns)}")
        
        # High correlations
        if high_correlations:
            logger.info(f"\n⚠ HIGH CORRELATIONS (|r| > 0.8): {len(high_correlations)}")
            logger.info("   " + "-" * 60)
            
            for i, corr in enumerate(high_correlations[:10]):
                sign = "πŸŸ₯" if corr['correlation'] < 0 else "🟩"
                logger.info(f"   {i+1:2d}. {sign} {corr['feature1']:25s} ↔ {corr['feature2']:25s}: {corr['correlation']:7.4f}")
            
            if len(high_correlations) > 10:
                logger.info(f"   ... and {len(high_correlations) - 10} more pairs")
        
        # Target variable correlations
        if target_correlations:
            logger.info(f"\n🎯 CORRELATIONS WITH TARGET VARIABLE:")
            logger.info("   " + "-" * 60)
            
            for i, corr in enumerate(target_correlations[:10]):
                direction = "↓" if corr['correlation'] < 0 else "↑"
                logger.info(f"   {i+1:2d}. {direction} {corr['feature']:35s}: {corr['correlation']:7.4f}")
        
        # Multicollinearity analysis
        if vif_results['scores']:
            logger.info(f"\nπŸ“ˆ MULTICOLLINEARITY ANALYSIS (VIF):")
            logger.info("   " + "-" * 60)
            logger.info(f"   Critical (VIF > 100): {vif_results['summary']['critical']}")
            logger.info(f"   High (10 < VIF ≀ 100): {vif_results['summary']['high']}")
            logger.info(f"   Medium (5 < VIF ≀ 10): {vif_results['summary']['medium']}")
            logger.info(f"   Low (VIF ≀ 5): {vif_results['summary']['low']}")
            
            # Top problematic features
            if vif_results['issues']:
                logger.info(f"\nπŸ”΄ PROBLEMATIC FEATURES (VIF > 10):")
                for issue in vif_results['issues'][:10]:
                    logger.info(f"   β€’ {issue['feature']:35s}: VIF = {issue['vif']:7.1f} ({issue['severity']})")
        
        logger.info("\n" + "="*80)
        logger.info("RECOMMENDATIONS:")
        logger.info("="*80)
        
        # Generate recommendations
        recommendations = []
        
        if len(high_correlations) > 20:
            recommendations.append("1. Remove highly correlated features (correlation method)")
        
        if vif_results['summary']['critical'] > 0:
            recommendations.append("2. Remove features with critical VIF (>100)")
        
        if vif_results['summary']['high'] > 5:
            recommendations.append("3. Consider removing features with VIF > 10")
        
        if not recommendations:
            recommendations.append("1. Data in good condition, no serious issues detected")
            recommendations.append("2. Proceed to modelling")
        
        for i, rec in enumerate(recommendations, 1):
            logger.info(f"   {rec}")
        
        logger.info("\n" + "="*80)
    
    def remove_highly_correlated(
        self,
        data: pd.DataFrame,
        threshold: float = 0.85,
        method: str = 'variance',
        keep_target: bool = True,
        keep_features: List[str] = None
    ) -> pd.DataFrame:
        """
        Remove highly correlated features
        
        Parameters:
        -----------
        data : pd.DataFrame
            Source data
        threshold : float
            Correlation threshold for removal
        method : str
            Feature selection method for removal: 'variance', 'random', 'importance'
        keep_target : bool
            Whether to keep target variable
        keep_features : List[str], optional
            Features to keep
        
        Returns:
        --------
        pd.DataFrame
            Data after removing highly correlated features
        """
        logger.info("\n" + "="*80)
        logger.info("REMOVING HIGHLY CORRELATED FEATURES")
        logger.info("="*80)
        
        data_clean = data.copy()
        
        if 'pearson' not in self.correlation_matrices:
            logger.warning("Correlation matrix not calculated, run analyze() first")
            return data_clean
        
        corr_matrix = self.correlation_matrices['pearson']
        
        # Features to keep
        features_to_keep = set()
        
        if keep_target and self.config.target_column in data_clean.columns:
            features_to_keep.add(self.config.target_column)
        
        if keep_features:
            for feat in keep_features:
                if feat in data_clean.columns:
                    features_to_keep.add(feat)
        
        # Temporal features (usually important for time series)
        temporal_patterns = ['year', 'month', 'day', 'week', 'quarter', 
                            'hour', 'minute', 'second', 'sin', 'cos']
        
        for col in data_clean.columns:
            if any(pattern in col.lower() for pattern in temporal_patterns):
                features_to_keep.add(col)
        
        # Find highly correlated pairs
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        
        # Collect highly correlated features
        correlated_features = set()
        for col in upper_triangle.columns:
            if col in features_to_keep:
                continue
                
            high_corr = upper_triangle[col][abs(upper_triangle[col]) > threshold]
            for row_idx, corr_value in high_corr.items():
                if not pd.isna(corr_value) and row_idx not in features_to_keep:
                    # Select which feature to remove
                    if method == 'variance':
                        # Remove the one with lower variance
                        var_col = data_clean[col].var()
                        var_row = data_clean[row_idx].var()
                        feature_to_remove = col if var_col < var_row else row_idx
                    elif method == 'importance':
                        # Remove the one with lower correlation to target variable
                        if self.config.target_column in corr_matrix.columns:
                            corr_col_target = abs(corr_matrix.loc[col, self.config.target_column])
                            corr_row_target = abs(corr_matrix.loc[row_idx, self.config.target_column])
                            feature_to_remove = col if corr_col_target < corr_row_target else row_idx
                        else:
                            # If no target, remove randomly
                            feature_to_remove = np.random.choice([col, row_idx])
                    else:
                        # Remove randomly
                        feature_to_remove = np.random.choice([col, row_idx])
                    
                    correlated_features.add(feature_to_remove)
        
        # Remove features
        features_to_remove = list(correlated_features)
        
        if features_to_remove:
            data_clean = data_clean.drop(columns=features_to_remove)
            
            logger.info(f"\nπŸ“Š REMOVAL RESULTS:")
            logger.info(f"   Initial feature count: {len(data.columns)}")
            logger.info(f"   Features removed: {len(features_to_remove)}")
            logger.info(f"   Final feature count: {len(data_clean.columns)}")
            logger.info(f"   Retained: {len(data_clean.columns)/len(data.columns)*100:.1f}%")
            
            if features_to_remove:
                logger.info(f"\nπŸ—‘οΈ REMOVED FEATURES:")
                for i, feat in enumerate(sorted(features_to_remove)[:20]):
                    logger.info(f"   {i+1:2d}. {feat}")
                if len(features_to_remove) > 20:
                    logger.info(f"   ... and {len(features_to_remove) - 20} more features")
        else:
            logger.info("βœ“ No highly correlated features detected, all features retained")
        
        logger.info("="*80)
        return data_clean
    
    def get_report(self) -> Dict[str, Any]:
        """Get analysis report"""
        report = {
            "correlation_matrix_shape": None,
            "high_correlation_count": 0,
            "vif_summary": {},
            "target_correlation_count": 0
        }
        
        if 'pearson' in self.correlation_matrices:
            report["correlation_matrix_shape"] = self.correlation_matrices['pearson'].shape
        
        if 'pearson' in self.high_correlation_pairs:
            report["high_correlation_count"] = len(self.high_correlation_pairs['pearson'])
        
        if self.vif_scores:
            report["vif_summary"] = self.vif_scores.get('summary', {})
        
        return report