# Page header captured together with this file (not part of the code):
# Spaces: arabovs-ai-lab / TimeFlowPro1
# Runtime error
# File size: 26,791 Bytes
# bd3c428

# ============================================
# CLASS 3: MISSING VALUE ANALYSER
# ============================================
from typing import Dict, Tuple
from venv import logger

from config.config import Config
from scipy.interpolate import interp1d
from statsmodels.tsa.seasonal import seasonal_decompose, STL
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    print("✅ All imports working!")
except ImportError as e:
    print(f"❌ Import error: {e}")

class MissingValueAnalyser:
    """Class for analysing and handling missing values"""
    
    def __init__(self, config: Config):
        """
        Initialise missing value analyser
        
        Parameters:
        -----------
        config : Config
            Experiment configuration
        """
        self.config = config
        self.missing_info = {}
        self.handling_methods = {}
        self.imputers = {}
        self.missing_patterns = {}
    
    def analyse(
        self, 
        data: pd.DataFrame, 
        detailed: bool = True
    ) -> Dict:
        """
        Analyse missing values in data
        
        Parameters:
        -----------
        data : pd.DataFrame
            Input data
        detailed : bool
            Whether to perform detailed analysis
        
        Returns:
        --------
        Dict
            Information about missing values
        """
        logger.info("\n" + "="*80)
        logger.info("MISSING VALUE ANALYSIS")
        logger.info("="*80)
        
        # Calculate missing values
        missing_total = data.isnull().sum()
        missing_percent = (missing_total / len(data)) * 100
        
        missing_df = pd.DataFrame({
            'missing_count': missing_total,
            'missing_percent': missing_percent,
            'dtype': data.dtypes.astype(str)
        })
        
        # Detailed analysis
        if detailed:
            self._detailed_missing_analysis(data, missing_df)
        
        # Save information
        self.missing_info = {
            'summary': {
                col: {
                    'missing_count': int(missing_df.loc[col, 'missing_count']),
                    'missing_percent': float(missing_df.loc[col, 'missing_percent']),
                    'dtype': missing_df.loc[col, 'dtype']
                }
                for col in missing_df.index
            },
            'overall': {
                'total_missing': int(missing_total.sum()),
                'total_rows': int(len(data)),
                'total_cells': int(data.size),
                'overall_missing_percentage': float(missing_total.sum() / data.size * 100),
                'rows_with_any_missing': int(data.isnull().any(axis=1).sum()),
                'rows_all_missing': int(data.isnull().all(axis=1).sum()),
                'columns_with_missing': missing_df[missing_df['missing_count'] > 0].index.tolist(),
                'columns_all_missing': missing_df[missing_df['missing_count'] == len(data)].index.tolist()
            }
        }
        
        # Visualisation
        if self.config.save_plots:
            self._plot_missing_values(data, missing_df)
        
        # Output results
        self._log_missing_summary(missing_df)
        
        return self.missing_info
    
    def _detailed_missing_analysis(
        self, 
        data: pd.DataFrame, 
        missing_df: pd.DataFrame
    ) -> None:
        """Detailed missing value analysis"""
        # Analyse missing patterns
        missing_matrix = data.isnull()
        
        # Row missing patterns
        row_patterns = missing_matrix.apply(lambda x: ''.join(x.astype(int).astype(str)), axis=1)
        row_pattern_counts = row_patterns.value_counts().head(10)
        
        # Column missing patterns
        col_patterns = missing_matrix.apply(lambda x: ''.join(x.astype(int).astype(str)), axis=0)
        col_pattern_counts = col_patterns.value_counts().head(10)
        
        # Time-based missing patterns analysis
        time_patterns = {}
        if isinstance(data.index, pd.DatetimeIndex):
            # Missing values by time
            time_missing = data.isnull().resample('M').sum()
            time_patterns['monthly_missing'] = time_missing.sum(axis=1).to_dict()
            
            # Missing values by day of week
            data_with_dow = data.copy()
            data_with_dow['dayofweek'] = data.index.dayofweek
            dow_missing = data_with_dow.groupby('dayofweek').apply(lambda x: x.isnull().sum().sum())
            time_patterns['dayofweek_missing'] = dow_missing.to_dict()
        
        self.missing_patterns = {
            'row_patterns': row_pattern_counts.to_dict(),
            'col_patterns': col_pattern_counts.to_dict(),
            'time_patterns': time_patterns,
            'missing_correlation': missing_matrix.corr().to_dict()  # Missing value correlation
        }
        
        logger.debug(f"Found {len(row_pattern_counts)} unique row missing patterns")
        logger.debug(f"Found {len(col_pattern_counts)} unique column missing patterns")
    
    def _plot_missing_values(
        self,
        data: pd.DataFrame,
        missing_df: pd.DataFrame
    ) -> None:
        """
        Visualise missing values

        Draws a 3x2 grid (percentage per column, missing-value matrix,
        monthly and day-of-week counts for datetime-indexed data, missing
        correlation, cumulative counts), saves it to
        ``<config.results_dir>/plots/missing_values_analysis.png`` and shows
        it. The output directory is created if necessary.

        Parameters:
        -----------
        data : pd.DataFrame
            Input data
        missing_df : pd.DataFrame
            Per-column summary with a 'missing_percent' column
        """
        from pathlib import Path  # local import: only needed for saving

        max_heatmap_rows = 1000  # cap on observations drawn in the matrix

        fig, axes = plt.subplots(3, 2, figsize=(16, 12))
        missing_matrix = data.isnull()

        # 1. Missing percentage histogram
        axes[0, 0].barh(
            missing_df.index,
            missing_df['missing_percent']
        )
        axes[0, 0].axvline(self.config.missing_threshold, color='red', linestyle='--')
        axes[0, 0].set_title('Missing Percentage by Column')
        axes[0, 0].set_xlabel('Missing Percentage (%)')
        axes[0, 0].set_ylabel('Columns')
        axes[0, 0].grid(True, alpha=0.3)

        # 2. Missing values heatmap: variables on the y axis, observations
        # on the x axis. Long data is cut to the first observations (slice
        # rows before transposing so observations, not variables, are cut).
        if len(data) > max_heatmap_rows:
            shown = missing_matrix.iloc[:max_heatmap_rows]
        else:
            shown = missing_matrix
        axes[0, 1].imshow(
            shown.T.to_numpy(dtype=float),
            aspect='auto',
            cmap='binary',
            interpolation='none'
        )
        axes[0, 1].set_title('Missing Values Matrix')
        axes[0, 1].set_xlabel('Observation Index')
        axes[0, 1].set_ylabel('Variables')
        axes[0, 1].set_yticks(range(len(data.columns)))
        axes[0, 1].set_yticklabels(data.columns, fontsize=8)

        # 3. Missing values over time (if time series)
        if isinstance(data.index, pd.DatetimeIndex):
            missing_per_row = missing_matrix.sum(axis=1)

            # 'ME' is the month-end alias from pandas 2.2 on; older
            # versions only know 'M'.
            try:
                monthly = missing_per_row.resample('ME').sum()
            except ValueError:
                monthly = missing_per_row.resample('M').sum()

            axes[1, 0].plot(monthly)
            axes[1, 0].set_title('Missing Values by Month')
            axes[1, 0].set_xlabel('Date')
            axes[1, 0].set_ylabel('Number of Missing Values')
            axes[1, 0].grid(True, alpha=0.3)

            # 4. Missing values by day of week. Reindex to all seven days so
            # the bar heights always match the seven x positions.
            dow_missing = (
                missing_per_row.groupby(data.index.dayofweek).sum()
                .reindex(range(7), fill_value=0)
            )
            dow_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

            axes[1, 1].bar(range(7), dow_missing.to_numpy())
            axes[1, 1].set_title('Missing Values by Day of Week')
            axes[1, 1].set_xlabel('Day of Week')
            axes[1, 1].set_ylabel('Number of Missing Values')
            axes[1, 1].set_xticks(range(7))
            axes[1, 1].set_xticklabels(dow_names)
            axes[1, 1].grid(True, alpha=0.3)
        else:
            # No time axis: hide the two time-based panels instead of
            # leaving empty frames in the figure
            axes[1, 0].set_visible(False)
            axes[1, 1].set_visible(False)

        # 5. Missing value correlation
        missing_corr = missing_matrix.corr()
        im = axes[2, 0].imshow(
            missing_corr,
            cmap='coolwarm',
            vmin=-1,
            vmax=1,
            aspect='auto'
        )
        axes[2, 0].set_title('Missing Value Correlation Between Variables')
        axes[2, 0].set_xlabel('Variables')
        axes[2, 0].set_ylabel('Variables')
        plt.colorbar(im, ax=axes[2, 0])

        # 6. Cumulative missing sum
        cumulative_missing = missing_matrix.cumsum()
        any_line = False
        for col in data.columns[:5]:  # First 5 columns
            if missing_matrix[col].any():
                axes[2, 1].plot(
                    cumulative_missing.index,
                    cumulative_missing[col],
                    label=str(col)[:20]  # labels may be non-string
                )
                any_line = True
        axes[2, 1].set_title('Cumulative Missing Values')
        axes[2, 1].set_xlabel('Time/Index')
        axes[2, 1].set_ylabel('Cumulative Missing')
        if any_line:
            # legend() without labelled artists only produces a warning
            axes[2, 1].legend(fontsize=8)
        axes[2, 1].grid(True, alpha=0.3)

        plots_dir = Path(self.config.results_dir) / 'plots'
        plots_dir.mkdir(parents=True, exist_ok=True)

        plt.tight_layout()
        fig.savefig(
            plots_dir / 'missing_values_analysis.png',
            dpi=300,
            bbox_inches='tight'
        )
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures
        plt.close(fig)
    
    def _log_missing_summary(self, missing_df: pd.DataFrame) -> None:
        """Log missing value summary"""
        missing_columns = missing_df[missing_df['missing_count'] > 0]
        
        if len(missing_columns) > 0:
            logger.info("MISSING VALUES FOUND:")
            logger.info("-" * 50)
            logger.info(f"Total missing values: {self.missing_info['overall']['total_missing']}")
            logger.info(f"Overall missing percentage: {self.missing_info['overall']['overall_missing_percentage']:.2f}%")
            logger.info(f"Rows with missing values: {self.missing_info['overall']['rows_with_any_missing']}")
            logger.info(f"Columns with missing values: {len(self.missing_info['overall']['columns_with_missing'])}")
            
            logger.info("\nTop-10 columns by missing values:")
            top_missing = missing_df.nlargest(10, 'missing_percent')
            for idx, (col, row) in enumerate(top_missing.iterrows(), 1):
                logger.info(f"  {idx:2d}. {col}: {int(row['missing_count'])} missing ({row['missing_percent']:.2f}%)")
        else:
            logger.info("✓ No missing values found")
    
    def handle(
        self, 
        data: pd.DataFrame, 
        method: str = 'interpolate',
        strategy: str = 'columnwise',
        **kwargs
    ) -> pd.DataFrame:
        """
        Handle missing values
        
        Parameters:
        -----------
        data : pd.DataFrame
            Input data
        method : str
            Handling method: 'interpolate', 'ffill', 'bfill', 'mean', 'median', 'mode', 'knn', 'regression'
        strategy : str
            Strategy: 'columnwise', 'rowwise', 'global'
        **kwargs : dict
            Additional parameters for method
        
        Returns:
        --------
        pd.DataFrame
            Data with handled missing values
        """
        logger.info("\n" + "="*80)
        logger.info("HANDLING MISSING VALUES")
        logger.info("="*80)
        
        data_processed = data.copy()
        methods_applied = {}
        
        # Determine columns to process
        if strategy == 'columnwise':
            columns_to_process = data_processed.columns
        elif strategy == 'rowwise':
            # Row-wise handling (for time series)
            data_processed = self._handle_rowwise(data_processed, method, **kwargs)
            return data_processed
        else:
            columns_to_process = data_processed.select_dtypes(include=[np.number]).columns
        
        # Process each column
        for col in columns_to_process:
            missing_before = data_processed[col].isnull().sum()
            
            if missing_before > 0:
                # Check if missing percentage exceeds threshold
                missing_percent = (missing_before / len(data_processed)) * 100
                
                if missing_percent > self.config.missing_threshold:
                    logger.warning(f"  {col}: {missing_before} missing ({missing_percent:.1f}%) > threshold {self.config.missing_threshold}%")
                    
                    if kwargs.get('drop_high_missing', False):
                        data_processed = data_processed.drop(columns=[col])
                        method_used = f"dropped (>{self.config.missing_threshold}% missing)"
                        missing_after = 0
                    else:
                        # Use selected method
                        data_processed[col], method_used = self._apply_imputation_method(
                            data_processed[col], method, **kwargs
                        )
                        missing_after = data_processed[col].isnull().sum()
                else:
                    # Use selected method
                    data_processed[col], method_used = self._apply_imputation_method(
                        data_processed[col], method, **kwargs
                    )
                    missing_after = data_processed[col].isnull().sum()
                
                methods_applied[col] = {
                    'method': method_used,
                    'missing_before': int(missing_before),
                    'missing_after': int(missing_after),
                    'missing_percent_before': float(missing_percent)
                }
                
                if missing_before > 0:
                    logger.info(f"  {col}: {missing_before} → {missing_after} missing ({method_used})")
        
        self.handling_methods = methods_applied
        
        # Check that all missing values are handled
        remaining_missing = data_processed.isnull().sum().sum()
        if remaining_missing == 0:
            logger.info("✓ All missing values successfully handled")
        else:
            logger.warning(f"⚠ {remaining_missing} missing values remain")
            # Additional handling of remaining missing values
            data_processed = data_processed.fillna(method='ffill').fillna(method='bfill')
            remaining_after = data_processed.isnull().sum().sum()
            if remaining_after == 0:
                logger.info("✓ Remaining missing values handled with ffill/bfill combination")
        
        return data_processed
    
    def _apply_imputation_method(
        self, 
        series: pd.Series, 
        method: str,
        **kwargs
    ) -> Tuple[pd.Series, str]:
        """
        Apply imputation method to individual series
        
        Parameters:
        -----------
        series : pd.Series
            Input series
        method : str
            Imputation method
        **kwargs : dict
            Additional parameters
        
        Returns:
        --------
        Tuple[pd.Series, str]
            Processed series and method description
        """
        if method == 'interpolate':
            # Interpolation for time series
            if isinstance(series.index, pd.DatetimeIndex):
                method_name = f"{kwargs.get('interpolation_method', 'linear')} interpolation"
                series_filled = series.interpolate(
                    method=kwargs.get('interpolation_method', 'linear'),
                    limit_direction=kwargs.get('limit_direction', 'both'),
                    limit=kwargs.get('limit', None)
                )
            else:
                method_name = 'linear interpolation'
                series_filled = series.interpolate(method='linear')
        
        elif method == 'time_weighted':
            # Time-weighted interpolation
            method_name = 'time-weighted interpolation'
            series_filled = self._time_weighted_interpolation(series)
        
        elif method == 'seasonal':
            # Seasonal interpolation
            method_name = 'seasonal interpolation'
            series_filled = self._seasonal_interpolation(series, **kwargs)
        
        elif method == 'ffill':
            # Forward fill
            method_name = 'forward fill'
            series_filled = series.ffill(limit=kwargs.get('limit', None))
        
        elif method == 'bfill':
            # Backward fill
            method_name = 'backward fill'
            series_filled = series.bfill(limit=kwargs.get('limit', None))
        
        elif method == 'mean':
            # Mean imputation
            method_name = 'mean imputation'
            series_filled = series.fillna(series.mean())
        
        elif method == 'median':
            # Median imputation
            method_name = 'median imputation'
            series_filled = series.fillna(series.median())
        
        elif method == 'mode':
            # Mode imputation
            method_name = 'mode imputation'
            mode_value = series.mode()
            if not mode_value.empty:
                series_filled = series.fillna(mode_value.iloc[0])
            else:
                series_filled = series.fillna(series.median())
        
        elif method == 'knn':
            # KNN imputation
            method_name = f"KNN imputation (k={kwargs.get('k', 5)})"
            # Simplified version using nearest neighbour mean
            series_filled = self._knn_imputation(series, k=kwargs.get('k', 5))
        
        elif method == 'regression':
            # Regression imputation
            method_name = 'regression imputation'
            series_filled = self._regression_imputation(series, **kwargs)
        
        elif method == 'spline':
            # Spline interpolation
            method_name = 'spline interpolation'
            series_filled = series.interpolate(method='spline', order=kwargs.get('order', 3))
        
        elif method == 'stl':
            # STL decomposition + interpolation
            method_name = 'STL-based imputation'
            series_filled = self._stl_imputation(series, **kwargs)
        
        else:
            raise ValueError(f"Unknown method: {method}")
        
        # If missing values remain, fill with ffill/bfill
        if series_filled.isnull().any():
            series_filled = series_filled.ffill().bfill()
            method_name += " + ffill/bfill"
        
        return series_filled, method_name
    
    def _time_weighted_interpolation(self, series: pd.Series) -> pd.Series:
        """Time-weighted interpolation"""
        if not isinstance(series.index, pd.DatetimeIndex):
            return series.interpolate()
        
        # Create timestamps
        time_numeric = pd.Series(range(len(series)), index=series.index)
        
        # Interpolate timestamps for missing values
        time_interpolated = time_numeric.interpolate()
        
        # Interpolate values based on timestamps
        valid_mask = series.notna()
        if valid_mask.sum() < 2:
            return series.ffill().bfill()
        
        # Use linear interpolation
        valid_times = time_numeric[valid_mask]
        valid_values = series[valid_mask]
        
        # Interpolation
        interp_func = interp1d(
            valid_times, 
            valid_values, 
            kind='linear',
            bounds_error=False,
            fill_value='extrapolate'
        )
        
        series_filled = series.copy()
        missing_mask = series.isna()
        series_filled[missing_mask] = interp_func(time_interpolated[missing_mask])
        
        return series_filled
    
    def _seasonal_interpolation(
        self, 
        series: pd.Series, 
        **kwargs
    ) -> pd.Series:
        """Seasonal interpolation"""
        if not isinstance(series.index, pd.DatetimeIndex):
            return series.interpolate()
        
        period = kwargs.get('period', self.config.seasonal_period)
        
        # Create series copy
        series_filled = series.copy()
        
        # Interpolation considering seasonality
        for i in range(len(series)):
            if pd.isna(series.iloc[i]):
                # Find values at same seasonal position
                seasonal_indices = []
                for offset in range(1, 10):  # Look in previous/next cycles
                    idx_back = i - offset * period
                    idx_forward = i + offset * period
                    
                    if idx_back >= 0 and not pd.isna(series.iloc[idx_back]):
                        seasonal_indices.append(idx_back)
                    
                    if idx_forward < len(series) and not pd.isna(series.iloc[idx_forward]):
                        seasonal_indices.append(idx_forward)
                
                if seasonal_indices:
                    # Take mean value from seasonal positions
                    seasonal_values = series.iloc[seasonal_indices]
                    series_filled.iloc[i] = seasonal_values.mean()
        
        # Fill remaining missing values with regular interpolation
        series_filled = series_filled.interpolate()
        
        return series_filled
    
    def _knn_imputation(
        self, 
        series: pd.Series, 
        k: int = 5
    ) -> pd.Series:
        """KNN imputation for time series"""
        # Simplified KNN for time series
        series_filled = series.copy()
        
        for i in range(len(series)):
            if pd.isna(series.iloc[i]):
                # Find nearest k non-missing values
                distances = []
                values = []
                
                for j in range(max(0, i - k * 10), min(len(series), i + k * 10)):
                    if j != i and not pd.isna(series.iloc[j]):
                        distance = abs(i - j)
                        distances.append(distance)
                        values.append(series.iloc[j])
                        
                        if len(values) >= k:
                            break
                
                if values:
                    # Distance-weighted average
                    weights = [1 / (d + 1) for d in distances]
                    weighted_avg = np.average(values, weights=weights)
                    series_filled.iloc[i] = weighted_avg
        
        return series_filled
    
    def _regression_imputation(
        self, 
        series: pd.Series, 
        **kwargs
    ) -> pd.Series:
        """Regression imputation based on neighbouring values"""
        # Simplified regression for time series
        series_filled = series.copy()
        
        if series.notna().sum() < 3:
            return series.ffill().bfill()
        
        # Use polynomial regression
        x = np.arange(len(series))
        y = series.values
        
        # Valid values mask
        valid_mask = ~np.isnan(y)
        
        if valid_mask.sum() < 2:
            return series.ffill().bfill()
        
        # Polynomial regression degree 2
        coeffs = np.polyfit(x[valid_mask], y[valid_mask], 2)
        poly_func = np.poly1d(coeffs)
        
        # Fill missing values
        missing_mask = np.isnan(y)
        series_filled.iloc[missing_mask] = poly_func(x[missing_mask])
        
        return series_filled
    
    def _stl_imputation(
        self, 
        series: pd.Series, 
        **kwargs
    ) -> pd.Series:
        """STL decomposition-based imputation"""
        try:
            if not isinstance(series.index, pd.DatetimeIndex):
                return series.interpolate()
            
            # STL decomposition
            stl = STL(
                series.ffill().bfill(),  # Fill missing for STL
                period=kwargs.get('period', self.config.seasonal_period),
                robust=True
            )
            result = stl.fit()
            
            # Reconstruct series without noise
            reconstructed = result.trend + result.seasonal
            
            # Replace missing values with reconstructed values
            series_filled = series.copy()
            missing_mask = series.isna()
            series_filled[missing_mask] = reconstructed[missing_mask]
            
            return series_filled
            
        except Exception as e:
            logger.warning(f"STL imputation failed: {e}, using interpolation")
            return series.interpolate()
    
    def _handle_rowwise(
        self, 
        data: pd.DataFrame, 
        method: str,
        **kwargs
    ) -> pd.DataFrame:
        """Row-wise missing value handling"""
        data_processed = data.copy()
        
        # Remove rows with high missing counts
        if kwargs.get('drop_rows_threshold', 0) > 0:
            threshold = kwargs['drop_rows_threshold']
            rows_before = len(data_processed)
            missing_per_row = data_processed.isnull().sum(axis=1) / data_processed.shape[1] * 100
            rows_to_drop = missing_per_row[missing_per_row > threshold].index
            data_processed = data_processed.drop(rows_to_drop)
            rows_after = len(data_processed)
            logger.info(f"Rows removed: {rows_before - rows_after} (missing > {threshold}%)")
        
        # Row-wise imputation
        if method == 'row_mean':
            data_processed = data_processed.T.fillna(data_processed.mean(axis=1)).T
        elif method == 'row_median':
            data_processed = data_processed.T.fillna(data_processed.median(axis=1)).T
        elif method == 'row_ffill':
            data_processed = data_processed.ffill(axis=1).bfill(axis=1)
        
        return data_processed
    
    def create_validation_rules(self) -> Dict:
        """Create validation rules based on missing value analysis"""
        rules = {}
        
        for col, info in self.missing_info['summary'].items():
            missing_percent = info['missing_percent']
            
            if missing_percent > 50:
                rules[col] = {
                    'action': 'drop_column',
                    'reason': f'Missing > 50%: {missing_percent:.1f}%'
                }
            elif missing_percent > 20:
                rules[col] = {
                    'action': 'advanced_imputation',
                    'reason': f'High missing: {missing_percent:.1f}%',
                    'recommended_method': 'knn'
                }
            elif missing_percent > 5:
                rules[col] = {
                    'action': 'standard_imputation',
                    'reason': f'Moderate missing: {missing_percent:.1f}%',
                    'recommended_method': 'interpolate'
                }
            elif missing_percent > 0:
                rules[col] = {
                    'action': 'simple_imputation',
                    'reason': f'Low missing: {missing_percent:.1f}%',
                    'recommended_method': 'ffill'
                }
        
        return rules
    
    def get_report(self) -> Dict:
        """Get missing values report"""
        return {
            'missing_info': self.missing_info,
            'handling_methods': self.handling_methods,
            'missing_patterns': self.missing_patterns,
            'validation_rules': self.create_validation_rules()
        }