File size: 11,363 Bytes
e478478
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""
Data Loader Module - InsightGenAI
================================
Handles CSV upload, data validation, missing value analysis,
and automatic column type detection.

Author: InsightGenAI Team
Version: 1.0.0
"""

import pandas as pd
import numpy as np
from typing import Dict, Tuple, Optional, List
import streamlit as st


class DataLoader:
    """
    Handle all data loading and validation operations.

    Attributes:
        df (pd.DataFrame): The loaded dataset, or None before a successful load
        file_name (str): Name of the uploaded file
        column_types (Dict[str, str]): Mapping of column name -> detected type
            ('numeric', 'categorical', 'text', 'datetime', 'boolean', 'other')
        missing_summary (Dict): Summary of missing values for the current dataset
    """

    def __init__(self):
        """Initialize the DataLoader with empty attributes."""
        self.df: Optional[pd.DataFrame] = None
        self.file_name: str = ""
        self.column_types: Dict[str, str] = {}
        self.missing_summary: Dict = {}

    def load_csv(self, uploaded_file) -> Tuple[bool, str]:
        """
        Load and validate a CSV file.

        Args:
            uploaded_file: Streamlit uploaded file object (any file-like
                object with a ``name`` attribute works)

        Returns:
            Tuple[bool, str]: (Success status, Message)
        """
        try:
            self.file_name = uploaded_file.name
            self.df = pd.read_csv(uploaded_file)

            # Basic validation
            if self.df.empty:
                return False, "The uploaded file is empty."

            if len(self.df.columns) < 2:
                return False, "Dataset must have at least 2 columns (features + target)."

            # Detect column types
            self._detect_column_types()

            # Generate missing value summary
            self._generate_missing_summary()

            return True, f"Successfully loaded {self.file_name} with {len(self.df)} rows and {len(self.df.columns)} columns."

        except pd.errors.EmptyDataError:
            return False, "The uploaded file is empty."
        except pd.errors.ParserError:
            return False, "Error parsing CSV file. Please check the file format."
        except Exception as e:
            return False, f"Error loading file: {str(e)}"

    def _detect_column_types(self) -> None:
        """
        Automatically detect the type of each column.

        Detected types:
            - numeric: Integer or float columns
            - categorical: Object/category columns with low cardinality
            - text: Object columns with high cardinality (potential text data)
            - datetime: Columns that can be parsed as dates
            - boolean: Columns with binary values
        """
        if self.df is None:
            return

        # Reset so results from a previously loaded file don't leak into
        # this one (fixes stale entries when the same instance is reused).
        self.column_types = {}

        for col in self.df.columns:
            # Check for datetime: only object columns can hold date strings.
            if self.df[col].dtype == 'object':
                try:
                    pd.to_datetime(self.df[col], errors='raise')
                    self.column_types[col] = 'datetime'
                    continue
                except (ValueError, TypeError, OverflowError):
                    # Not parseable as dates; fall through to the other checks.
                    pass

            # Check for numeric
            if pd.api.types.is_numeric_dtype(self.df[col]):
                # A numeric column whose only values are 0/1 is treated as boolean.
                unique_vals = self.df[col].dropna().unique()
                if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, True, False}):
                    self.column_types[col] = 'boolean'
                else:
                    self.column_types[col] = 'numeric'

            # Check for categorical vs text
            elif self.df[col].dtype == 'object':
                unique_count = self.df[col].nunique()
                total_count = len(self.df[col])

                # If unique values are less than 10% of total, it's categorical.
                # (total_count > 0 guard protects direct calls on empty frames.)
                if total_count > 0 and unique_count / total_count < 0.1 and unique_count < 50:
                    self.column_types[col] = 'categorical'
                else:
                    # Long average string length suggests free text rather
                    # than a high-cardinality categorical column.
                    avg_length = self.df[col].dropna().astype(str).str.len().mean()
                    if avg_length > 20:
                        self.column_types[col] = 'text'
                    else:
                        self.column_types[col] = 'categorical'

            # Check for boolean
            elif self.df[col].dtype == 'bool':
                self.column_types[col] = 'boolean'

            else:
                self.column_types[col] = 'other'

    def _generate_missing_summary(self) -> None:
        """Generate a summary of missing values in the dataset."""
        if self.df is None:
            return

        missing_counts = self.df.isnull().sum()
        missing_percent = (missing_counts / len(self.df)) * 100

        self.missing_summary = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'columns_with_missing': missing_counts[missing_counts > 0].to_dict(),
            'missing_percentages': missing_percent[missing_percent > 0].to_dict(),
            # Cast to a plain int so the summary is JSON-serializable.
            'total_missing': int(missing_counts.sum()),
            'complete_rows': len(self.df.dropna())
        }

    def get_dataframe(self) -> Optional[pd.DataFrame]:
        """Return the loaded dataframe (None if nothing is loaded)."""
        return self.df

    def get_column_types(self) -> Dict[str, str]:
        """Return the detected column types."""
        return self.column_types

    def get_missing_summary(self) -> Dict:
        """Return the missing value summary."""
        return self.missing_summary

    def get_numeric_columns(self) -> List[str]:
        """Return list of numeric column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'numeric']

    def get_categorical_columns(self) -> List[str]:
        """Return list of categorical column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'categorical']

    def get_text_columns(self) -> List[str]:
        """Return list of text column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'text']

    def get_datetime_columns(self) -> List[str]:
        """Return list of datetime column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'datetime']

    def get_basic_stats(self) -> Dict:
        """
        Return basic statistics about the dataset.

        Returns:
            Dict containing dataset statistics (empty dict if no data loaded)
        """
        if self.df is None:
            return {}

        return {
            'shape': self.df.shape,
            'memory_usage': self.df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
            'duplicates': self.df.duplicated().sum(),
            'column_types_count': pd.Series(self.column_types).value_counts().to_dict()
        }

    def suggest_target_column(self) -> Optional[str]:
        """
        Suggest a potential target column based on heuristics.

        Returns:
            str: Suggested target column name or None
        """
        if self.df is None:
            return None

        # Common target column names
        target_patterns = ['target', 'label', 'class', 'y', 'output', 'result',
                          'prediction', 'category', 'type', 'grade', 'score']

        # First, look for columns matching common target patterns.
        # Single-character patterns (e.g. 'y') must match the full column
        # name exactly; a substring test would wrongly match columns such as
        # 'city', 'year' or 'salary'.
        for col in self.df.columns:
            col_lower = col.lower()
            for pattern in target_patterns:
                if (pattern == col_lower) if len(pattern) == 1 else (pattern in col_lower):
                    return col

        # If no pattern match, suggest the last column (common convention)
        return self.df.columns[-1]

    def clean_data(self, handle_missing: str = 'drop',
                   outlier_method: Optional[str] = None) -> pd.DataFrame:
        """
        Clean the dataset based on specified parameters.

        Args:
            handle_missing: How to handle missing values ('drop', 'mean', 'median', 'mode')
            outlier_method: Method for outlier detection ('iqr', 'zscore', None)

        Returns:
            pd.DataFrame: Cleaned dataframe (the stored dataframe is not modified)

        Raises:
            ValueError: If no data has been loaded yet
        """
        if self.df is None:
            raise ValueError("No data loaded. Please load data first.")

        df_clean = self.df.copy()

        # Handle missing values
        if handle_missing == 'drop':
            df_clean = df_clean.dropna()
        elif handle_missing == 'mean':
            numeric_cols = self.get_numeric_columns()
            df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].mean())
        elif handle_missing == 'median':
            numeric_cols = self.get_numeric_columns()
            df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())
        elif handle_missing == 'mode':
            df_clean = df_clean.fillna(df_clean.mode().iloc[0])

        # Handle outliers
        if outlier_method == 'iqr':
            numeric_cols = self.get_numeric_columns()
            for col in numeric_cols:
                Q1 = df_clean[col].quantile(0.25)
                Q3 = df_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

        elif outlier_method == 'zscore':
            from scipy import stats
            numeric_cols = self.get_numeric_columns()
            # Guard against an empty column list: zscore on an empty frame
            # would produce a degenerate mask and break the row filter.
            # Rows with NaN z-scores (missing values) are dropped, matching
            # the comparison semantics (NaN < 3 is False).
            if numeric_cols:
                z_scores = np.abs(stats.zscore(df_clean[numeric_cols]))
                df_clean = df_clean[(z_scores < 3).all(axis=1)]

        return df_clean


# Utility functions for Streamlit integration
def display_data_summary(data_loader: DataLoader):
    """
    Render a Streamlit summary of the dataset held by a DataLoader.

    Shows headline metrics (rows, columns, duplicates, memory), the detected
    column types, and a missing-value table (or a success note if complete).

    Args:
        data_loader: Instance of DataLoader with loaded data
    """
    if data_loader.df is None:
        st.warning("No data loaded yet.")
        return

    stats = data_loader.get_basic_stats()

    # Headline metrics in a four-column row.
    rows_col, cols_col, dupes_col, mem_col = st.columns(4)
    rows_col.metric("Rows", stats['shape'][0])
    cols_col.metric("Columns", stats['shape'][1])
    dupes_col.metric("Duplicates", stats['duplicates'])
    mem_col.metric("Memory (MB)", f"{stats['memory_usage']:.2f}")

    # Detected type for every column.
    st.subheader("Column Types")
    type_table = pd.DataFrame({
        'Column': list(data_loader.column_types.keys()),
        'Type': list(data_loader.column_types.values()),
    })
    st.dataframe(type_table, use_container_width=True)

    # Missing-value breakdown, only when something is actually missing.
    summary = data_loader.missing_summary
    if summary['columns_with_missing']:
        st.subheader("Missing Values")
        missing_table = pd.DataFrame({
            'Column': list(summary['missing_percentages'].keys()),
            'Missing Count': list(summary['columns_with_missing'].values()),
            'Missing %': [f"{v:.2f}%" for v in summary['missing_percentages'].values()],
        })
        st.dataframe(missing_table, use_container_width=True)
    else:
        st.success("No missing values found! 🎉")