Spaces:
Sleeping
Sleeping
| """ | |
| Data processing module for the Business Intelligence Dashboard. | |
| This module handles data loading, cleaning, filtering, and profiling | |
| using the Strategy Pattern for different data operations. | |
| """ | |
| from abc import ABC, abstractmethod | |
| from typing import Dict, List, Optional, Tuple, Any | |
| import pandas as pd | |
| import numpy as np | |
| from utils import detect_column_types, validate_dataframe, get_missing_value_summary | |
| from constants import MIN_NUMERICAL_COLUMNS_FOR_CORRELATION | |
class DataLoadStrategy(ABC):
    """
    Abstract base class for data loading strategies.

    Each concrete strategy implements ``load`` for one file format.
    """

    # Marked abstract so that instantiating this base class, or a subclass
    # that does not override ``load``, raises TypeError instead of yielding
    # an object whose ``load`` silently returns None.
    @abstractmethod
    def load(self, file_path: str) -> pd.DataFrame:
        """
        Load data from file.

        Args:
            file_path: Path to the data file

        Returns:
            Loaded DataFrame
        """
class CSVLoadStrategy(DataLoadStrategy):
    """Loading strategy for CSV files."""

    def load(self, file_path: str) -> pd.DataFrame:
        """
        Read a CSV file with pandas' default parsing options.

        Args:
            file_path: Path to the CSV file

        Returns:
            DataFrame produced by ``pd.read_csv``
        """
        frame = pd.read_csv(file_path)
        return frame
class ExcelLoadStrategy(DataLoadStrategy):
    """Loading strategy for Excel workbooks."""

    def load(self, file_path: str) -> pd.DataFrame:
        """
        Read an Excel file with pandas' default options.

        Args:
            file_path: Path to the Excel file

        Returns:
            DataFrame produced by ``pd.read_excel``
        """
        frame = pd.read_excel(file_path)
        return frame
class DataLoader:
    """
    Context class for data loading using Strategy Pattern.

    Keeps a mapping from lower-cased file extension to the strategy object
    that reads that format.
    """

    def __init__(self):
        """Initialize with default strategies."""
        self._strategies = {
            '.csv': CSVLoadStrategy(),
            '.xlsx': ExcelLoadStrategy(),
            '.xls': ExcelLoadStrategy()
        }

    def load_data(self, file_path: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
        """
        Load data file using appropriate strategy.

        Failures are reported through the returned error message rather
        than raised.

        Args:
            file_path: Path to the data file (a string or an ``os.PathLike``
                object)

        Returns:
            Tuple of (DataFrame, error_message). On success the message is
            None; on any failure the DataFrame is None and the message
            describes the problem.
        """
        # Kept local so this block does not rely on a module-level import.
        import os

        try:
            # os.fspath accepts str and os.PathLike; any other type raises
            # TypeError, which is reported through the except branch below.
            path_text = os.fspath(file_path)

            # The extension is lower-cased so "DATA.CSV" matches '.csv'.
            _, ext = os.path.splitext(path_text.lower())
            if ext not in self._strategies:
                return None, f"Unsupported file format: {ext}"

            df = self._strategies[ext].load(file_path)

            # Validate loaded data
            is_valid, error = validate_dataframe(df)
            if not is_valid:
                return None, error
            return df, None
        except Exception as e:
            # Boundary handler: every loading problem becomes an error
            # message instead of an exception.
            return None, f"Error loading file: {str(e)}"
class FilterStrategy(ABC):
    """
    Abstract base class for filtering strategies.

    Each concrete strategy implements ``apply_filter`` for one kind of
    column.
    """

    # Marked abstract so that instantiating this base class, or a subclass
    # that does not override ``apply_filter``, raises TypeError instead of
    # yielding an object whose filter silently returns None.
    @abstractmethod
    def apply_filter(
        self,
        df: pd.DataFrame,
        column: str,
        filter_value: Any
    ) -> pd.DataFrame:
        """
        Apply filter to DataFrame.

        Args:
            df: Input DataFrame
            column: Column to filter on
            filter_value: Filter value/range

        Returns:
            Filtered DataFrame
        """
class NumericalFilterStrategy(FilterStrategy):
    """Filtering strategy for numerical columns."""

    def apply_filter(
        self,
        df: pd.DataFrame,
        column: str,
        filter_value: Tuple[float, float]
    ) -> pd.DataFrame:
        """
        Keep the rows whose value lies inside an inclusive range.

        Args:
            df: Input DataFrame
            column: Column to filter on
            filter_value: (minimum, maximum) pair; both bounds are inclusive

        Returns:
            Rows of ``df`` where minimum <= value <= maximum
        """
        lower, upper = filter_value
        values = df[column]
        not_below = values >= lower
        not_above = values <= upper
        return df[not_below & not_above]
class CategoricalFilterStrategy(FilterStrategy):
    """Filtering strategy for categorical columns."""

    def apply_filter(
        self,
        df: pd.DataFrame,
        column: str,
        filter_value: List[str]
    ) -> pd.DataFrame:
        """
        Keep the rows whose value is one of the selected categories.

        Args:
            df: Input DataFrame
            column: Column to filter on
            filter_value: Selected values; an empty selection means
                "no filter" and returns ``df`` unchanged

        Returns:
            Rows of ``df`` whose ``column`` value is in ``filter_value``
        """
        if filter_value:
            selected = df[column].isin(filter_value)
            return df[selected]
        return df
class DateFilterStrategy(FilterStrategy):
    """Strategy for filtering date columns."""

    def apply_filter(
        self,
        df: pd.DataFrame,
        column: str,
        filter_value: Tuple[str, str]
    ) -> pd.DataFrame:
        """
        Apply date range filter.

        The caller's DataFrame is never modified: the datetime conversion
        is performed on a copy.

        Args:
            df: Input DataFrame
            column: Column to filter on
            filter_value: (start_date, end_date) pair; both bounds are
                inclusive. If either bound is empty, no filtering is done.

        Returns:
            ``df`` itself when a bound is missing; otherwise a new DataFrame
            holding the rows inside the range, with ``column`` converted to
            datetime (values that cannot be parsed become NaT and are
            dropped by the range comparison).
        """
        start_date, end_date = filter_value
        if not (start_date and end_date):
            return df

        # Convert on a copy: assigning into ``df`` would change the caller's
        # data and may trigger SettingWithCopyWarning when ``df`` is a slice.
        converted = df.copy()
        converted[column] = pd.to_datetime(converted[column], errors='coerce')
        in_range = (converted[column] >= start_date) & (converted[column] <= end_date)
        return converted[in_range]
class DataFilter:
    """Context class that routes each column filter to a filter strategy."""

    def __init__(self):
        """Create one strategy instance per supported column kind."""
        self._strategies = {
            'numerical': NumericalFilterStrategy(),
            'categorical': CategoricalFilterStrategy(),
            'date': DateFilterStrategy()
        }

    def apply_filters(
        self,
        df: pd.DataFrame,
        filters: Dict[str, Any]
    ) -> pd.DataFrame:
        """
        Apply every requested filter, one after another.

        A filter is skipped when its value is None or when its column is in
        none of the three lists returned by ``detect_column_types``. If a
        strategy raises, the error is printed and that filter is skipped.

        Args:
            df: Input DataFrame
            filters: Dictionary of {column: filter_value}

        Returns:
            Filtered DataFrame
        """
        result = df.copy()
        numerical, categorical, date_columns = detect_column_types(df)

        # Checked in this order, so a column listed under several kinds is
        # handled by the first matching one.
        columns_by_kind = (
            ('numerical', numerical),
            ('categorical', categorical),
            ('date', date_columns),
        )

        for column, filter_value in filters.items():
            if filter_value is None:
                continue

            kind = next(
                (name for name, members in columns_by_kind if column in members),
                None,
            )
            if kind is None:
                continue

            handler = self._strategies[kind]
            try:
                result = handler.apply_filter(result, column, filter_value)
            except Exception as e:
                print(f"Error applying filter to {column}: {e}")

        return result
class DataProfiler:
    """
    Class for generating data profiling and statistics.

    All methods are static: they take the DataFrame as their only argument
    and can be called on the class or on an instance.
    """

    @staticmethod
    def get_basic_info(df: pd.DataFrame) -> Dict[str, Any]:
        """
        Get basic dataset information.

        Args:
            df: Input DataFrame

        Returns:
            Dictionary with keys 'shape', 'columns', 'dtypes' and
            'memory_usage' (sum of ``df.memory_usage(deep=True)``)
        """
        return {
            'shape': df.shape,
            'columns': list(df.columns),
            'dtypes': df.dtypes.to_dict(),
            'memory_usage': df.memory_usage(deep=True).sum()
        }

    @staticmethod
    def get_numerical_stats(df: pd.DataFrame) -> pd.DataFrame:
        """
        Get statistics for numerical columns.

        Args:
            df: Input DataFrame

        Returns:
            DataFrame with numerical statistics, with column names as a
            column named 'Column'; an empty DataFrame when no numerical
            columns are detected
        """
        numerical, _, _ = detect_column_types(df)
        if not numerical:
            return pd.DataFrame()

        numeric_frame = df[numerical]
        stats = numeric_frame.describe()
        stats.loc['median'] = numeric_frame.median()
        stats.loc['std'] = numeric_frame.std()

        # Transpose so each source column becomes a row. The index is named
        # before reset_index so the label column is always 'Column', even
        # when df.columns carries its own name.
        stats_table = stats.T.rename_axis('Column').reset_index()

        # Reorder columns for readability (Column first, then statistics),
        # keeping only the ones that exist.
        column_order = ['Column', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'median']
        available_columns = [col for col in column_order if col in stats_table.columns]
        return stats_table[available_columns]

    @staticmethod
    def get_categorical_stats(df: pd.DataFrame) -> pd.DataFrame:
        """
        Get statistics for categorical columns.

        Args:
            df: Input DataFrame

        Returns:
            DataFrame with one row per categorical column and the columns
            'Column', 'Unique_Values', 'Mode', 'Mode_Count', 'Total_Count';
            an empty DataFrame when no categorical columns are detected
        """
        _, categorical, _ = detect_column_types(df)
        if not categorical:
            return pd.DataFrame()

        total_rows = len(df)
        stats = []
        for col in categorical:
            series = df[col]
            counts = series.value_counts()
            if counts.empty:
                # No non-missing values (empty or all-NaN column): there is
                # no mode, and indexing counts would raise IndexError.
                mode_value = None
                mode_count = 0
            else:
                modes = series.mode()
                mode_value = modes.iloc[0] if not modes.empty else None
                # value_counts sorts by frequency, so the first entry is the
                # highest count.
                mode_count = counts.iloc[0]
            stats.append({
                'Column': col,
                'Unique_Values': series.nunique(),
                'Mode': mode_value,
                'Mode_Count': mode_count,
                'Total_Count': total_rows
            })
        return pd.DataFrame(stats)

    @staticmethod
    def get_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
        """
        Get correlation matrix for numerical columns.

        Args:
            df: Input DataFrame

        Returns:
            Correlation matrix DataFrame; an empty DataFrame when fewer than
            MIN_NUMERICAL_COLUMNS_FOR_CORRELATION numerical columns are
            detected
        """
        numerical, _, _ = detect_column_types(df)
        if len(numerical) < MIN_NUMERICAL_COLUMNS_FOR_CORRELATION:
            return pd.DataFrame()
        return df[numerical].corr()