# insightgenai/modules/data_loader.py
# Source: InsightGenAI initial commit (e478478) by mohsinbhatti
"""
Data Loader Module - InsightGenAI
================================
Handles CSV upload, data validation, missing value analysis,
and automatic column type detection.
Author: InsightGenAI Team
Version: 1.0.0
"""
import pandas as pd
import numpy as np
from typing import Dict, Tuple, Optional, List
import streamlit as st
class DataLoader:
    """
    Handle all data loading and validation operations.

    Attributes:
        df (pd.DataFrame): The loaded dataset (None until a successful load)
        file_name (str): Name of the uploaded file
        column_types (Dict[str, str]): Mapping of column name -> detected type
        missing_summary (Dict): Summary statistics about missing values
    """

    def __init__(self):
        """Initialize the DataLoader with empty attributes."""
        self.df: Optional[pd.DataFrame] = None
        self.file_name: str = ""
        self.column_types: Dict[str, str] = {}
        self.missing_summary: Dict = {}

    def load_csv(self, uploaded_file) -> Tuple[bool, str]:
        """
        Load and validate a CSV file.

        Args:
            uploaded_file: File-like object with a ``name`` attribute
                (e.g. a Streamlit uploaded-file object).

        Returns:
            Tuple[bool, str]: (success status, human-readable message)
        """
        try:
            self.file_name = uploaded_file.name
            self.df = pd.read_csv(uploaded_file)
            # Basic validation
            if self.df.empty:
                return False, "The uploaded file is empty."
            if len(self.df.columns) < 2:
                return False, "Dataset must have at least 2 columns (features + target)."
            # Detect column types, then summarize missing values
            self._detect_column_types()
            self._generate_missing_summary()
            return True, (
                f"Successfully loaded {self.file_name} with "
                f"{len(self.df)} rows and {len(self.df.columns)} columns."
            )
        except pd.errors.EmptyDataError:
            return False, "The uploaded file is empty."
        except pd.errors.ParserError:
            return False, "Error parsing CSV file. Please check the file format."
        except Exception as e:
            return False, f"Error loading file: {str(e)}"

    def _detect_column_types(self) -> None:
        """
        Automatically detect the type of each column.

        Detected types:
            - numeric: Integer or float columns
            - categorical: Object/category columns with low cardinality
            - text: Object columns with long average string length
            - datetime: Object columns fully parseable as dates
            - boolean: Columns with binary (0/1 or True/False) values
            - other: Anything not covered above
        """
        if self.df is None:
            return
        # Reset first so stale entries from a previously loaded file
        # do not linger after a reload.
        self.column_types = {}
        total_rows = len(self.df)
        for col in self.df.columns:
            series = self.df[col]
            # Datetime: object columns whose values all parse as dates.
            if series.dtype == 'object':
                try:
                    pd.to_datetime(series, errors='raise')
                    self.column_types[col] = 'datetime'
                    continue
                except (ValueError, TypeError):
                    # Not parseable as dates; fall through to other checks.
                    pass
            if pd.api.types.is_numeric_dtype(series):
                # Binary numeric columns (0/1 or True/False) count as boolean.
                unique_vals = series.dropna().unique()
                if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, True, False}):
                    self.column_types[col] = 'boolean'
                else:
                    self.column_types[col] = 'numeric'
            elif series.dtype == 'object':
                unique_count = series.nunique()
                # Low cardinality (<10% unique and fewer than 50 levels)
                # suggests a categorical column.
                if total_rows > 0 and unique_count / total_rows < 0.1 and unique_count < 50:
                    self.column_types[col] = 'categorical'
                else:
                    # Long average string length suggests free text.
                    avg_length = series.dropna().astype(str).str.len().mean()
                    if avg_length > 20:
                        self.column_types[col] = 'text'
                    else:
                        self.column_types[col] = 'categorical'
            elif series.dtype == 'bool':
                self.column_types[col] = 'boolean'
            else:
                self.column_types[col] = 'other'

    def _generate_missing_summary(self) -> None:
        """Generate a summary of missing values in the dataset."""
        if self.df is None:
            return
        missing_counts = self.df.isnull().sum()
        missing_percent = (missing_counts / len(self.df)) * 100
        self.missing_summary = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'columns_with_missing': missing_counts[missing_counts > 0].to_dict(),
            'missing_percentages': missing_percent[missing_percent > 0].to_dict(),
            # Cast numpy scalar to plain int for clean downstream display.
            'total_missing': int(missing_counts.sum()),
            'complete_rows': len(self.df.dropna())
        }

    def get_dataframe(self) -> Optional[pd.DataFrame]:
        """Return the loaded dataframe (None if nothing loaded)."""
        return self.df

    def get_column_types(self) -> Dict[str, str]:
        """Return the detected column types."""
        return self.column_types

    def get_missing_summary(self) -> Dict:
        """Return the missing value summary."""
        return self.missing_summary

    def get_numeric_columns(self) -> List[str]:
        """Return list of numeric column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'numeric']

    def get_categorical_columns(self) -> List[str]:
        """Return list of categorical column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'categorical']

    def get_text_columns(self) -> List[str]:
        """Return list of text column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'text']

    def get_datetime_columns(self) -> List[str]:
        """Return list of datetime column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'datetime']

    def get_basic_stats(self) -> Dict:
        """
        Return basic statistics about the dataset.

        Returns:
            Dict containing shape, memory usage (MB), duplicate row count,
            and a count of columns per detected type. Empty if no data loaded.
        """
        if self.df is None:
            return {}
        return {
            'shape': self.df.shape,
            'memory_usage': self.df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
            'duplicates': int(self.df.duplicated().sum()),
            'column_types_count': pd.Series(self.column_types).value_counts().to_dict()
        }

    def suggest_target_column(self) -> Optional[str]:
        """
        Suggest a potential target column based on naming heuristics.

        Returns:
            str: Suggested target column name, or None if no data is loaded.
        """
        if self.df is None:
            return None
        # Common target column names
        target_patterns = ['target', 'label', 'class', 'y', 'output', 'result',
                           'prediction', 'category', 'type', 'grade', 'score']
        # Look for columns matching common target patterns. Single-character
        # patterns (e.g. 'y') must match the whole name exactly, otherwise
        # columns like 'city' or 'year' false-positive on the substring test.
        for col in self.df.columns:
            col_lower = col.lower()
            if any(pattern == col_lower if len(pattern) == 1 else pattern in col_lower
                   for pattern in target_patterns):
                return col
        # If no pattern match, suggest the last column (common convention).
        return self.df.columns[-1]

    def clean_data(self, handle_missing: str = 'drop',
                   outlier_method: Optional[str] = None) -> pd.DataFrame:
        """
        Clean the dataset based on specified parameters.

        Args:
            handle_missing: How to handle missing values
                ('drop', 'mean', 'median', 'mode')
            outlier_method: Method for outlier removal ('iqr', 'zscore', None)

        Returns:
            pd.DataFrame: Cleaned copy of the dataframe.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.df is None:
            raise ValueError("No data loaded. Please load data first.")
        df_clean = self.df.copy()
        # Handle missing values
        if handle_missing == 'drop':
            df_clean = df_clean.dropna()
        elif handle_missing in ('mean', 'median'):
            numeric_cols = self.get_numeric_columns()
            if numeric_cols:
                fill_values = (df_clean[numeric_cols].mean()
                               if handle_missing == 'mean'
                               else df_clean[numeric_cols].median())
                df_clean[numeric_cols] = df_clean[numeric_cols].fillna(fill_values)
        elif handle_missing == 'mode':
            # Guard: mode().iloc[0] raises IndexError on an empty frame.
            if not df_clean.empty:
                df_clean = df_clean.fillna(df_clean.mode().iloc[0])
        # Handle outliers
        if outlier_method == 'iqr':
            for col in self.get_numeric_columns():
                q1 = df_clean[col].quantile(0.25)
                q3 = df_clean[col].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                df_clean = df_clean[(df_clean[col] >= lower_bound) &
                                    (df_clean[col] <= upper_bound)]
        elif outlier_method == 'zscore':
            from scipy import stats
            numeric_cols = self.get_numeric_columns()
            if numeric_cols:
                # nan_policy='omit' so a single NaN does not turn the whole
                # column's z-scores into NaN (which would drop every row).
                z_scores = np.abs(stats.zscore(df_clean[numeric_cols],
                                               nan_policy='omit'))
                df_clean = df_clean[(z_scores < 3).all(axis=1)]
        return df_clean
# Utility functions for Streamlit integration
def display_data_summary(data_loader: DataLoader):
    """
    Display a summary of the loaded data in Streamlit.

    Args:
        data_loader: Instance of DataLoader with loaded data
    """
    if data_loader.df is None:
        st.warning("No data loaded yet.")
        return
    # Basic info metrics
    stats = data_loader.get_basic_stats()
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Rows", stats['shape'][0])
    with col2:
        st.metric("Columns", stats['shape'][1])
    with col3:
        st.metric("Duplicates", stats['duplicates'])
    with col4:
        st.metric("Memory (MB)", f"{stats['memory_usage']:.2f}")
    # Column types
    st.subheader("Column Types")
    type_df = pd.DataFrame(list(data_loader.column_types.items()),
                           columns=['Column', 'Type'])
    st.dataframe(type_df, use_container_width=True)
    # Missing values — use .get() so a df assigned without going through
    # load_csv (missing_summary still {}) doesn't raise KeyError.
    missing = data_loader.get_missing_summary()
    if missing.get('columns_with_missing'):
        st.subheader("Missing Values")
        missing_df = pd.DataFrame({
            'Column': list(missing['missing_percentages'].keys()),
            'Missing Count': list(missing['columns_with_missing'].values()),
            'Missing %': [f"{v:.2f}%" for v in missing['missing_percentages'].values()]
        })
        st.dataframe(missing_df, use_container_width=True)
    else:
        st.success("No missing values found! 🎉")