"""
Data Loader Module - InsightGenAI
================================
Handles CSV upload, data validation, missing value analysis,
and automatic column type detection.
Author: InsightGenAI Team
Version: 1.0.0
"""
import pandas as pd
import numpy as np
from typing import Dict, Tuple, Optional, List
import streamlit as st
class DataLoader:
    """
    Handle all data loading and validation operations.

    Attributes:
        df (pd.DataFrame): The loaded dataset (None until load_csv succeeds).
        file_name (str): Name of the uploaded file.
        column_types (Dict[str, str]): Mapping of column name -> detected type.
        missing_summary (Dict): Summary statistics about missing values.
    """

    def __init__(self):
        """Initialize the DataLoader with empty attributes."""
        self.df: Optional[pd.DataFrame] = None
        self.file_name: str = ""
        self.column_types: Dict[str, str] = {}
        self.missing_summary: Dict = {}

    def load_csv(self, uploaded_file) -> Tuple[bool, str]:
        """
        Load and validate a CSV file.

        Args:
            uploaded_file: Streamlit uploaded file object (any file-like
                object with a ``name`` attribute works).

        Returns:
            Tuple[bool, str]: (Success status, Message)
        """
        try:
            self.file_name = uploaded_file.name
            self.df = pd.read_csv(uploaded_file)

            # Basic validation
            if self.df.empty:
                return False, "The uploaded file is empty."
            if len(self.df.columns) < 2:
                return False, "Dataset must have at least 2 columns (features + target)."

            # Detect column types, then summarize missing values.
            self._detect_column_types()
            self._generate_missing_summary()

            return True, (f"Successfully loaded {self.file_name} with "
                          f"{len(self.df)} rows and {len(self.df.columns)} columns.")
        except pd.errors.EmptyDataError:
            return False, "The uploaded file is empty."
        except pd.errors.ParserError:
            return False, "Error parsing CSV file. Please check the file format."
        except Exception as e:
            # Last-resort boundary handler: surface the error to the UI
            # instead of crashing the app.
            return False, f"Error loading file: {str(e)}"

    def _detect_column_types(self) -> None:
        """
        Automatically detect the type of each column.

        Detected types:
            - numeric: integer or float columns
            - categorical: object columns with low cardinality
            - text: object columns with long values (potential free text)
            - datetime: object columns that can be parsed as dates
            - boolean: columns with binary values
        """
        if self.df is None:
            return

        # Reset so a second load_csv() call doesn't keep stale entries
        # from a previously loaded file.
        self.column_types = {}

        for col in self.df.columns:
            series = self.df[col]

            # Datetime: object columns whose values all parse as dates.
            if series.dtype == 'object':
                try:
                    pd.to_datetime(series, errors='raise')
                    self.column_types[col] = 'datetime'
                    continue
                except (ValueError, TypeError):
                    # Not date-like; fall through to the other checks.
                    pass

            if pd.api.types.is_numeric_dtype(series):
                # Binary numeric columns (0/1 or True/False) count as boolean.
                unique_vals = series.dropna().unique()
                if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, True, False}):
                    self.column_types[col] = 'boolean'
                else:
                    self.column_types[col] = 'numeric'
            elif series.dtype == 'object':
                unique_count = series.nunique()
                total_count = len(series)
                # Low cardinality (<10% unique and <50 distinct) -> categorical.
                if total_count and unique_count / total_count < 0.1 and unique_count < 50:
                    self.column_types[col] = 'categorical'
                else:
                    # Long average string length suggests free text.
                    avg_length = series.dropna().astype(str).str.len().mean()
                    if avg_length > 20:
                        self.column_types[col] = 'text'
                    else:
                        self.column_types[col] = 'categorical'
            elif series.dtype == 'bool':
                self.column_types[col] = 'boolean'
            else:
                self.column_types[col] = 'other'

    def _generate_missing_summary(self) -> None:
        """Generate a summary of missing values in the dataset."""
        if self.df is None:
            return
        missing_counts = self.df.isnull().sum()
        missing_percent = (missing_counts / len(self.df)) * 100
        self.missing_summary = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'columns_with_missing': missing_counts[missing_counts > 0].to_dict(),
            'missing_percentages': missing_percent[missing_percent > 0].to_dict(),
            # int() so the value serializes cleanly (numpy integer otherwise).
            'total_missing': int(missing_counts.sum()),
            'complete_rows': len(self.df.dropna())
        }

    def get_dataframe(self) -> Optional[pd.DataFrame]:
        """Return the loaded dataframe (None if nothing is loaded)."""
        return self.df

    def get_column_types(self) -> Dict[str, str]:
        """Return the detected column types."""
        return self.column_types

    def get_missing_summary(self) -> Dict:
        """Return the missing value summary."""
        return self.missing_summary

    def get_numeric_columns(self) -> List[str]:
        """Return list of numeric column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'numeric']

    def get_categorical_columns(self) -> List[str]:
        """Return list of categorical column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'categorical']

    def get_text_columns(self) -> List[str]:
        """Return list of text column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'text']

    def get_datetime_columns(self) -> List[str]:
        """Return list of datetime column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'datetime']

    def get_basic_stats(self) -> Dict:
        """
        Return basic statistics about the dataset.

        Returns:
            Dict containing shape, memory usage (MB), duplicate row count,
            and a count of columns per detected type. Empty dict when no
            data is loaded.
        """
        if self.df is None:
            return {}
        return {
            'shape': self.df.shape,
            'memory_usage': self.df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
            'duplicates': self.df.duplicated().sum(),
            'column_types_count': pd.Series(self.column_types).value_counts().to_dict()
        }

    def suggest_target_column(self) -> Optional[str]:
        """
        Suggest a potential target column based on naming heuristics.

        Returns:
            str: Suggested target column name, or None when no data is loaded.
        """
        if self.df is None:
            return None

        # Common target column names
        target_patterns = ['target', 'label', 'class', 'y', 'output', 'result',
                           'prediction', 'category', 'type', 'grade', 'score']

        lowered = [(col, col.lower()) for col in self.df.columns]

        # Exact name match first (e.g. a column literally called "y").
        for col, col_lower in lowered:
            if col_lower in target_patterns:
                return col

        # Substring match, but only for multi-character patterns: the
        # single-letter pattern 'y' would otherwise match almost any
        # column name (e.g. "salary", "year").
        for col, col_lower in lowered:
            if any(p in col_lower for p in target_patterns if len(p) > 1):
                return col

        # If no pattern match, suggest the last column (common convention).
        return self.df.columns[-1]

    def clean_data(self, handle_missing: str = 'drop',
                   outlier_method: Optional[str] = None) -> pd.DataFrame:
        """
        Return a cleaned copy of the dataset.

        Args:
            handle_missing: How to handle missing values
                ('drop', 'mean', 'median', 'mode').
            outlier_method: Method for outlier removal ('iqr', 'zscore', None).

        Returns:
            pd.DataFrame: Cleaned dataframe (the stored ``df`` is untouched).

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.df is None:
            raise ValueError("No data loaded. Please load data first.")

        df_clean = self.df.copy()

        # --- Missing values ------------------------------------------------
        if handle_missing == 'drop':
            df_clean = df_clean.dropna()
        elif handle_missing in ('mean', 'median'):
            numeric_cols = self.get_numeric_columns()
            if numeric_cols:
                fill_values = (df_clean[numeric_cols].mean()
                               if handle_missing == 'mean'
                               else df_clean[numeric_cols].median())
                df_clean[numeric_cols] = df_clean[numeric_cols].fillna(fill_values)
        elif handle_missing == 'mode':
            modes = df_clean.mode()
            # mode() can yield an empty frame (e.g. all-NaN columns);
            # guard against IndexError on iloc[0].
            if not modes.empty:
                df_clean = df_clean.fillna(modes.iloc[0])

        # --- Outliers ------------------------------------------------------
        if outlier_method == 'iqr':
            # NOTE: bounds are recomputed on the already-filtered frame for
            # each successive column (matches the original behavior), so
            # column order influences which rows survive.
            for col in self.get_numeric_columns():
                q1 = df_clean[col].quantile(0.25)
                q3 = df_clean[col].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                df_clean = df_clean[(df_clean[col] >= lower_bound) &
                                    (df_clean[col] <= upper_bound)]
        elif outlier_method == 'zscore':
            from scipy import stats  # local import keeps scipy optional
            numeric_cols = self.get_numeric_columns()
            if numeric_cols:
                # Rows containing NaN get NaN z-scores, fail the comparison,
                # and are therefore dropped.
                z_scores = np.abs(stats.zscore(df_clean[numeric_cols]))
                df_clean = df_clean[(z_scores < 3).all(axis=1)]

        return df_clean
# Utility functions for Streamlit integration
def display_data_summary(data_loader: DataLoader):
    """
    Display a summary of the loaded data in Streamlit.

    Args:
        data_loader: Instance of DataLoader with loaded data
    """
    if data_loader.df is None:
        st.warning("No data loaded yet.")
        return

    # Headline metrics.
    stats = data_loader.get_basic_stats()
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Rows", stats['shape'][0])
    with col2:
        st.metric("Columns", stats['shape'][1])
    with col3:
        st.metric("Duplicates", stats['duplicates'])
    with col4:
        st.metric("Memory (MB)", f"{stats['memory_usage']:.2f}")

    # Detected column types.
    st.subheader("Column Types")
    type_df = pd.DataFrame(list(data_loader.column_types.items()),
                           columns=['Column', 'Type'])
    st.dataframe(type_df, use_container_width=True)

    # Missing values. Use .get() so a summary that was never generated
    # (e.g. df assigned directly, bypassing load_csv) doesn't raise KeyError.
    missing = data_loader.missing_summary.get('columns_with_missing')
    if missing:
        st.subheader("Missing Values")
        percentages = data_loader.missing_summary['missing_percentages']
        missing_df = pd.DataFrame({
            'Column': list(percentages.keys()),
            'Missing Count': list(missing.values()),
            'Missing %': [f"{v:.2f}%" for v in percentages.values()]
        })
        st.dataframe(missing_df, use_container_width=True)
    else:
        st.success("No missing values found! 🎉")