Spaces:
Sleeping
Sleeping
| """ | |
| Data Loader Module - InsightGenAI | |
| ================================ | |
| Handles CSV upload, data validation, missing value analysis, | |
| and automatic column type detection. | |
| Author: InsightGenAI Team | |
| Version: 1.0.0 | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from typing import Dict, Tuple, Optional, List | |
| import streamlit as st | |
class DataLoader:
    """Handle all data loading and validation operations.

    Attributes:
        df (pd.DataFrame): The loaded dataset (None until load_csv succeeds)
        file_name (str): Name of the uploaded file
        column_types (Dict): Dictionary mapping columns to their detected
            types: 'numeric', 'categorical', 'text', 'datetime', 'boolean',
            or 'other'
        missing_summary (Dict): Summary statistics about missing values
    """

    def __init__(self):
        """Initialize the DataLoader with empty attributes."""
        self.df: Optional[pd.DataFrame] = None
        self.file_name: str = ""
        self.column_types: Dict[str, str] = {}
        self.missing_summary: Dict = {}

    def load_csv(self, uploaded_file) -> Tuple[bool, str]:
        """
        Load and validate a CSV file.

        Args:
            uploaded_file: File-like object with a ``name`` attribute
                (e.g. a Streamlit uploaded file object)

        Returns:
            Tuple[bool, str]: (Success status, Message)
        """
        try:
            self.file_name = uploaded_file.name
            self.df = pd.read_csv(uploaded_file)

            # Basic validation before running any analysis.
            if self.df.empty:
                return False, "The uploaded file is empty."
            if len(self.df.columns) < 2:
                return False, "Dataset must have at least 2 columns (features + target)."

            # Detect column types, then summarize missing values.
            self._detect_column_types()
            self._generate_missing_summary()

            return True, f"Successfully loaded {self.file_name} with {len(self.df)} rows and {len(self.df.columns)} columns."
        except pd.errors.EmptyDataError:
            return False, "The uploaded file is empty."
        except pd.errors.ParserError:
            return False, "Error parsing CSV file. Please check the file format."
        except Exception as e:
            return False, f"Error loading file: {str(e)}"

    def _detect_column_types(self) -> None:
        """
        Automatically detect the type of each column.

        Detected types:
            - numeric: Integer or float columns
            - categorical: Object/category columns with low cardinality
            - text: Object columns with high cardinality (potential text data)
            - datetime: Columns that can be parsed as dates
            - boolean: Columns with binary values
        """
        if self.df is None:
            return

        for col in self.df.columns:
            series = self.df[col]

            # Datetime check: only attempt parsing on object columns that
            # hold at least one non-null value. An all-NaN column would
            # otherwise parse "successfully" (all NaT) and be misclassified
            # as datetime. Catch only parse failures, not every exception.
            if series.dtype == 'object':
                non_null = series.dropna()
                if not non_null.empty:
                    try:
                        pd.to_datetime(non_null, errors='raise')
                        self.column_types[col] = 'datetime'
                        continue
                    except (ValueError, TypeError):
                        # Not parseable as dates; fall through to other checks.
                        pass

            if pd.api.types.is_numeric_dtype(series):
                # Numeric columns holding only 0/1 (or True/False) are boolean.
                unique_vals = series.dropna().unique()
                if len(unique_vals) <= 2 and set(unique_vals).issubset({0, 1, True, False}):
                    self.column_types[col] = 'boolean'
                else:
                    self.column_types[col] = 'numeric'
            elif series.dtype == 'object':
                unique_count = series.nunique()
                total_count = len(series)
                # Low cardinality (<10% unique and fewer than 50 distinct
                # values) indicates categorical data.
                if unique_count / total_count < 0.1 and unique_count < 50:
                    self.column_types[col] = 'categorical'
                else:
                    # Long average string length suggests free-form text.
                    avg_length = series.dropna().astype(str).str.len().mean()
                    if avg_length > 20:
                        self.column_types[col] = 'text'
                    else:
                        self.column_types[col] = 'categorical'
            elif series.dtype == 'bool':
                self.column_types[col] = 'boolean'
            else:
                self.column_types[col] = 'other'

    def _generate_missing_summary(self) -> None:
        """Generate a summary of missing values in the dataset."""
        if self.df is None:
            return

        missing_counts = self.df.isnull().sum()
        missing_percent = (missing_counts / len(self.df)) * 100

        self.missing_summary = {
            'total_rows': len(self.df),
            'total_columns': len(self.df.columns),
            'columns_with_missing': missing_counts[missing_counts > 0].to_dict(),
            'missing_percentages': missing_percent[missing_percent > 0].to_dict(),
            # Cast numpy scalar to a plain int for clean serialization.
            'total_missing': int(missing_counts.sum()),
            'complete_rows': len(self.df.dropna())
        }

    def get_dataframe(self) -> Optional[pd.DataFrame]:
        """Return the loaded dataframe."""
        return self.df

    def get_column_types(self) -> Dict[str, str]:
        """Return the detected column types."""
        return self.column_types

    def get_missing_summary(self) -> Dict:
        """Return the missing value summary."""
        return self.missing_summary

    def get_numeric_columns(self) -> List[str]:
        """Return list of numeric column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'numeric']

    def get_categorical_columns(self) -> List[str]:
        """Return list of categorical column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'categorical']

    def get_text_columns(self) -> List[str]:
        """Return list of text column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'text']

    def get_datetime_columns(self) -> List[str]:
        """Return list of datetime column names."""
        return [col for col, type_ in self.column_types.items() if type_ == 'datetime']

    def get_basic_stats(self) -> Dict:
        """
        Return basic statistics about the dataset.

        Returns:
            Dict containing dataset statistics (empty dict if no data loaded)
        """
        if self.df is None:
            return {}

        return {
            'shape': self.df.shape,
            'memory_usage': self.df.memory_usage(deep=True).sum() / (1024 * 1024),  # MB
            'duplicates': int(self.df.duplicated().sum()),
            'column_types_count': pd.Series(self.column_types).value_counts().to_dict()
        }

    def suggest_target_column(self) -> Optional[str]:
        """
        Suggest a potential target column based on heuristics.

        Single-character patterns (e.g. 'y') must match the column name
        exactly; longer patterns match as substrings. This avoids false
        positives such as 'y' matching 'salary' or 'year'.

        Returns:
            str: Suggested target column name or None if no data is loaded
        """
        if self.df is None:
            return None

        # Common target column names.
        target_patterns = ['target', 'label', 'class', 'y', 'output', 'result',
                           'prediction', 'category', 'type', 'grade', 'score']

        # First, look for columns matching common target patterns.
        for col in self.df.columns:
            col_lower = col.lower()
            for pattern in target_patterns:
                matched = (pattern == col_lower) if len(pattern) == 1 else (pattern in col_lower)
                if matched:
                    return col

        # If no pattern match, suggest the last column (common convention).
        return self.df.columns[-1]

    def clean_data(self, handle_missing: str = 'drop',
                   outlier_method: Optional[str] = None) -> pd.DataFrame:
        """
        Clean the dataset based on specified parameters.

        Args:
            handle_missing: How to handle missing values
                ('drop', 'mean', 'median', 'mode'). Note that 'mean' and
                'median' fill only numeric columns; other columns keep
                their NaNs.
            outlier_method: Method for outlier detection ('iqr', 'zscore', None)

        Returns:
            pd.DataFrame: Cleaned dataframe (the loaded data is not mutated)

        Raises:
            ValueError: If no data has been loaded yet
        """
        if self.df is None:
            raise ValueError("No data loaded. Please load data first.")

        df_clean = self.df.copy()

        # Handle missing values.
        if handle_missing == 'drop':
            df_clean = df_clean.dropna()
        elif handle_missing == 'mean':
            numeric_cols = self.get_numeric_columns()
            df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].mean())
        elif handle_missing == 'median':
            numeric_cols = self.get_numeric_columns()
            df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())
        elif handle_missing == 'mode':
            df_clean = df_clean.fillna(df_clean.mode().iloc[0])

        # Handle outliers.
        if outlier_method == 'iqr':
            for col in self.get_numeric_columns():
                q1 = df_clean[col].quantile(0.25)
                q3 = df_clean[col].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
        elif outlier_method == 'zscore':
            from scipy import stats
            numeric_cols = self.get_numeric_columns()
            # Guard: zscore over an empty column selection would fail.
            if numeric_cols:
                z_scores = np.abs(stats.zscore(df_clean[numeric_cols]))
                df_clean = df_clean[(z_scores < 3).all(axis=1)]

        return df_clean
| # Utility functions for Streamlit integration | |
def display_data_summary(data_loader: DataLoader):
    """
    Display a summary of the loaded data in Streamlit.

    Shows headline metrics (rows, columns, duplicates, memory), the detected
    column types, and a missing-value table when any values are missing.

    Args:
        data_loader: Instance of DataLoader with loaded data
    """
    if data_loader.get_dataframe() is None:
        st.warning("No data loaded yet.")
        return

    # Basic info
    stats = data_loader.get_basic_stats()
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Rows", stats['shape'][0])
    with col2:
        st.metric("Columns", stats['shape'][1])
    with col3:
        st.metric("Duplicates", stats['duplicates'])
    with col4:
        st.metric("Memory (MB)", f"{stats['memory_usage']:.2f}")

    # Column types
    st.subheader("Column Types")
    type_df = pd.DataFrame(list(data_loader.get_column_types().items()),
                           columns=['Column', 'Type'])
    st.dataframe(type_df, use_container_width=True)

    # Missing values. Use .get() so a partially initialized loader (e.g.
    # load_csv failed before the summary was generated) cannot raise
    # KeyError here. Rows are keyed off one dict so counts and percentages
    # always refer to the same column.
    missing = data_loader.get_missing_summary()
    columns_with_missing = missing.get('columns_with_missing') or {}
    if columns_with_missing:
        st.subheader("Missing Values")
        percentages = missing.get('missing_percentages', {})
        missing_df = pd.DataFrame({
            'Column': list(columns_with_missing.keys()),
            'Missing Count': list(columns_with_missing.values()),
            'Missing %': [f"{percentages.get(c, 0.0):.2f}%" for c in columns_with_missing]
        })
        st.dataframe(missing_df, use_container_width=True)
    else:
        st.success("No missing values found! 🎉")