entropy25 committed on
Commit
1cc5290
·
verified ·
1 Parent(s): ee51cad

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +529 -196
data_handler.py CHANGED
@@ -2,40 +2,101 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
- from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
- # All cached data processing functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  @st.cache_data
11
- def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
12
- """Load CSV with automatic encoding detection - cached"""
13
- import chardet
14
-
15
- detected = chardet.detect(file_content)
16
- encoding = detected['encoding']
17
-
18
  try:
 
 
 
 
19
  from io import BytesIO
20
- return pd.read_csv(BytesIO(file_content), encoding=encoding)
21
- except:
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
23
  for enc in encodings:
24
  try:
25
- return pd.read_csv(BytesIO(file_content), encoding=enc)
 
 
 
26
  except:
27
  continue
28
- raise Exception("Cannot read file with any encoding")
 
29
 
30
  @st.cache_data
31
- def load_excel_file(file_content: bytes) -> pd.DataFrame:
32
- """Load Excel file - cached"""
 
33
  from io import BytesIO
34
- return pd.read_excel(BytesIO(file_content))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  @st.cache_data
 
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
- """Calculate basic statistics - cached"""
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
@@ -48,250 +109,522 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
48
  }
49
 
50
  @st.cache_data
 
51
  def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
52
- """Calculate column cardinality analysis - cached"""
 
 
 
53
  cardinality_data = []
54
 
55
  for col in df.columns:
56
- unique_count = df[col].nunique()
57
- unique_ratio = unique_count / len(df)
58
-
59
- # Determine column type based on cardinality
60
- if unique_count == 1:
61
- col_type = "Constant"
62
- elif unique_count == len(df):
63
- col_type = "Unique Identifier"
64
- elif unique_ratio < 0.05:
65
- col_type = "Low Cardinality"
66
- elif unique_ratio < 0.5:
67
- col_type = "Medium Cardinality"
68
- else:
69
- col_type = "High Cardinality"
70
-
71
- cardinality_data.append({
72
- 'Column': col,
73
- 'Unique Count': unique_count,
74
- 'Unique Ratio': unique_ratio,
75
- 'Type': col_type,
76
- 'Data Type': str(df[col].dtype)
77
- })
 
 
 
 
 
 
 
 
78
 
79
  return pd.DataFrame(cardinality_data)
80
 
81
  @st.cache_data
 
82
  def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
83
- """Calculate memory optimization suggestions - cached"""
 
 
 
84
  suggestions = []
85
  current_memory = df.memory_usage(deep=True).sum() / 1024**2
86
  potential_savings = 0
87
 
88
  for col in df.columns:
89
- if df[col].dtype == 'object':
90
- unique_ratio = df[col].nunique() / len(df)
91
- if unique_ratio < 0.5: # Less than 50% unique values
92
- # Estimate category memory usage
93
- category_memory = df[col].astype('category').memory_usage(deep=True)
94
- object_memory = df[col].memory_usage(deep=True)
95
- savings = (object_memory - category_memory) / 1024**2
96
 
97
- if savings > 0.1: # More than 0.1MB savings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  suggestions.append({
99
- 'column': col,
100
- 'current_type': 'object',
101
- 'suggested_type': 'category',
102
- 'savings_mb': savings
 
103
  })
104
  potential_savings += savings
 
 
 
 
105
 
106
  return {
107
  'suggestions': suggestions,
108
- 'current_memory_mb': current_memory,
109
- 'potential_savings_mb': potential_savings,
110
- 'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
111
  }
112
 
113
  @st.cache_data
 
114
  def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
115
- """Calculate missing data analysis - cached"""
 
 
 
116
  missing_data = df.isnull().sum()
 
117
  if missing_data.sum() > 0:
118
  missing_df = pd.DataFrame({
119
  'Column': missing_data.index,
120
  'Missing Count': missing_data.values,
121
- 'Missing %': (missing_data.values / len(df)) * 100
 
122
  })
123
- return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
 
 
 
124
  return pd.DataFrame()
125
 
126
  @st.cache_data
 
127
  def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
128
- """Calculate correlation matrix - cached"""
 
 
 
129
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
130
- return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  @st.cache_data
 
133
  def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
134
- """Get column types - cached"""
135
- return {
 
 
 
136
  'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
137
- 'categorical': df.select_dtypes(include=['object']).columns.tolist(),
138
- 'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
 
139
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  @st.cache_data
142
- def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
143
- """Calculate enhanced numeric statistics - cached"""
 
 
 
 
 
 
 
144
  series = df[column].dropna()
145
- return {
146
- 'mean': series.mean(),
147
- 'median': series.median(),
148
- 'std': series.std(),
149
- 'skewness': series.skew(),
150
- 'kurtosis': series.kurtosis(),
151
- 'min': series.min(),
152
- 'max': series.max(),
153
- 'q25': series.quantile(0.25),
154
- 'q75': series.quantile(0.75)
155
- }
156
-
157
- @st.cache_data
158
- def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
159
- """Calculate outliers using IQR method - cached"""
160
- Q1 = df[column].quantile(0.25)
161
- Q3 = df[column].quantile(0.75)
162
- IQR = Q3 - Q1
163
- lower_bound = Q1 - 1.5 * IQR
164
- upper_bound = Q3 + 1.5 * IQR
165
-
166
- return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
 
 
 
 
167
 
168
  @st.cache_data
 
169
  def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
170
- """Detect columns with mixed data types - cached"""
 
 
 
171
  mixed_type_issues = []
172
 
173
  for col in df.select_dtypes(include=['object']).columns:
174
- # Try to convert to numeric
175
- numeric_conversion = pd.to_numeric(df[col], errors='coerce')
176
- new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
177
-
178
- if new_nulls > 0:
179
- mixed_type_issues.append({
180
- 'column': col,
181
- 'problematic_values': new_nulls,
182
- 'total_values': len(df[col]),
183
- 'percentage': (new_nulls / len(df[col])) * 100
184
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  return mixed_type_issues
187
 
188
  @st.cache_data
189
- def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
190
- """Get value counts for categorical column - cached"""
 
 
 
 
191
  return df[column].value_counts().head(top_n)
192
 
193
  @st.cache_data
194
- def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
195
- """Calculate crosstab between two categorical columns - cached"""
196
- return pd.crosstab(df[col1], df[col2])
197
-
198
- @st.cache_data
199
- def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
200
- """Calculate group statistics - cached"""
201
- return df.groupby(group_col)[metric_col].agg(['mean', 'median', 'std', 'count'])
 
 
 
 
 
 
 
 
 
 
202
 
203
  @st.cache_data
 
204
  def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
205
- """Calculate overall data quality score - cached"""
 
 
 
206
  score = 100
207
  issues = []
 
208
 
209
- # Missing values penalty
210
- missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
211
- if missing_pct > 0:
212
- penalty = min(30, missing_pct * 2) # Max 30 points penalty
213
- score -= penalty
214
- issues.append(f"Missing values: {missing_pct:.1f}%")
215
-
216
- # Duplicates penalty
217
- duplicate_pct = (df.duplicated().sum() / len(df)) * 100
218
- if duplicate_pct > 0:
219
- penalty = min(20, duplicate_pct * 4) # Max 20 points penalty
220
- score -= penalty
221
- issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
222
-
223
- # Constant columns penalty
224
- constant_cols = [col for col in df.columns if df[col].nunique() == 1]
225
- if constant_cols:
226
- penalty = min(10, len(constant_cols) * 2)
227
- score -= penalty
228
- issues.append(f"Constant columns: {len(constant_cols)}")
229
-
230
- # Mixed types penalty
231
- mixed_types = detect_mixed_types(df)
232
- if mixed_types:
233
- penalty = min(10, len(mixed_types) * 3)
234
- score -= penalty
235
- issues.append(f"Mixed type columns: {len(mixed_types)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- return {
238
- 'score': max(0, score),
239
- 'issues': issues,
240
- 'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
241
- }
 
 
 
242
 
243
- def load_data(uploaded_file):
244
- """Unified data loading function"""
245
- file_content = uploaded_file.read()
246
- uploaded_file.seek(0)
247
-
248
- if uploaded_file.name.endswith('.csv'):
249
- return load_csv_with_encoding(file_content, uploaded_file.name)
250
- else:
251
- return load_excel_file(file_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
254
- """Apply data cleaning operations"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  cleaned_df = df.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- for operation in operations:
258
- if operation['type'] == 'fill_missing':
259
- if operation['method'] == 'mean':
260
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
261
- cleaned_df[operation['column']].mean())
262
- elif operation['method'] == 'median':
263
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
264
- cleaned_df[operation['column']].median())
265
- elif operation['method'] == 'mode':
266
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
267
- cleaned_df[operation['column']].mode().iloc[0] if not cleaned_df[operation['column']].mode().empty else 0)
268
- elif operation['method'] == 'drop':
269
- cleaned_df = cleaned_df.dropna(subset=[operation['column']])
270
-
271
- elif operation['type'] == 'remove_duplicates':
272
- cleaned_df = cleaned_df.drop_duplicates()
273
-
274
- elif operation['type'] == 'remove_outliers':
275
- Q1 = cleaned_df[operation['column']].quantile(0.25)
276
- Q3 = cleaned_df[operation['column']].quantile(0.75)
277
- IQR = Q3 - Q1
278
- lower_bound = Q1 - 1.5 * IQR
279
- upper_bound = Q3 + 1.5 * IQR
280
- cleaned_df = cleaned_df[
281
- (cleaned_df[operation['column']] >= lower_bound) &
282
- (cleaned_df[operation['column']] <= upper_bound)
283
- ]
284
-
285
- elif operation['type'] == 'cap_outliers':
286
- Q1 = cleaned_df[operation['column']].quantile(0.25)
287
- Q3 = cleaned_df[operation['column']].quantile(0.75)
288
- IQR = Q3 - Q1
289
- lower_bound = Q1 - 1.5 * IQR
290
- upper_bound = Q3 + 1.5 * IQR
291
- cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower_bound, upper_bound)
292
-
293
- elif operation['type'] == 'convert_type':
294
- if operation['target_type'] == 'category':
295
- cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')
296
-
297
- return cleaned_df
 
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
+ from typing import Dict, List, Any, Tuple, Optional
6
  from scipy import stats
7
+ import logging
8
+
9
  warnings.filterwarnings('ignore')
10
 
11
+ # Configure logging
12
+ logging.basicConfig(level=logging.INFO)
13
+ logger = logging.getLogger(__name__)
14
+
15
# Enhanced error handling decorator
def handle_errors(func):
    """Decorator that logs and surfaces exceptions instead of propagating them.

    A wrapped callable returns None when an exception escapes, so callers
    must tolerate a None result. functools.wraps preserves the wrapped
    function's __name__/__doc__ — without it every cached function reported
    errors as "wrapper", and st.cache_data keying is less reliable.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Log for operators and surface in the Streamlit UI for users.
            logger.error(f"Error in {func.__name__}: {str(e)}")
            st.error(f"Error in {func.__name__}: {str(e)}")
            return None
    return wrapper
26
+
27
  @st.cache_data
28
+ @handle_errors
29
def load_csv_with_encoding(file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
    """Load CSV with automatic encoding detection and enhanced error handling.

    Tries chardet-based detection first, then falls back to a fixed list of
    common encodings. Raises when no strategy succeeds (the handle_errors
    decorator converts that into a None return for callers).
    """
    # Hoisted: the except-branch below also needs BytesIO. Previously the
    # import sat inside the try, so a failure before it (e.g. chardet not
    # installed) made the fallback loop die on a NameError instead.
    from io import BytesIO

    try:
        import chardet
        detected = chardet.detect(file_content)
        # chardet can report {'encoding': None}; dict.get's default does not
        # cover an explicit None value, so coalesce it here.
        encoding = detected.get('encoding') or 'utf-8'

        df = pd.read_csv(BytesIO(file_content), encoding=encoding)

        # Validate loaded data
        if df.empty:
            raise ValueError("The uploaded file is empty")

        # A single parsed column whose header still contains ';' strongly
        # suggests a semicolon-separated file (a ',' could not survive the
        # comma parse, so the previous check on ',' never fired).
        if df.shape[1] == 1 and df.columns[0].count(';') > 0:
            # Might be semicolon separated
            df = pd.read_csv(BytesIO(file_content), encoding=encoding, sep=';')

        logger.info(f"Successfully loaded CSV: {df.shape}")
        return df

    except Exception as e:
        # Try alternative encodings before giving up entirely.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for enc in encodings:
            try:
                df = pd.read_csv(BytesIO(file_content), encoding=enc)
                if not df.empty:
                    logger.info(f"Loaded CSV with encoding {enc}: {df.shape}")
                    return df
            except Exception:
                continue

        raise Exception(f"Cannot read file with any standard encoding. Original error: {str(e)}")
63
 
64
  @st.cache_data
65
+ @handle_errors
66
def load_excel_file(file_content: bytes) -> Optional[pd.DataFrame]:
    """Load Excel file with enhanced error handling"""
    from io import BytesIO

    try:
        # First attempt: pandas' default engine, first sheet only.
        frame = pd.read_excel(BytesIO(file_content))

        if frame.empty:
            raise ValueError("The Excel file is empty")

        logger.info(f"Successfully loaded Excel: {frame.shape}")
        return frame

    except Exception as e:
        # Second attempt: walk through explicit engines before giving up.
        for engine in ('openpyxl', 'xlrd'):
            try:
                frame = pd.read_excel(BytesIO(file_content), engine=engine)
                if not frame.empty:
                    logger.info(f"Loaded Excel with engine {engine}: {frame.shape}")
                    return frame
            except:
                continue

        raise Exception(f"Cannot read Excel file. Error: {str(e)}")
92
 
93
  @st.cache_data
94
+ @handle_errors
95
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
96
+ """Calculate basic statistics with error handling"""
97
+ if df is None or df.empty:
98
+ return {}
99
+
100
  dtype_counts = df.dtypes.value_counts()
101
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
102
 
 
109
  }
110
 
111
  @st.cache_data
112
+ @handle_errors
113
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate column cardinality analysis with improved categorization"""
    if df is None or df.empty:
        return pd.DataFrame()

    records = []

    for name in df.columns:
        try:
            n_unique = df[name].nunique()
            n_total = len(df)
            ratio = n_unique / n_total if n_total > 0 else 0

            # Classify by how distinct the column's values are.
            if n_unique == 1:
                label = "Constant"
            elif n_unique == n_total:
                label = "Unique Identifier"
            elif ratio < 0.01:
                label = "Very Low Cardinality"
            elif ratio < 0.05:
                label = "Low Cardinality"
            elif ratio < 0.5:
                label = "Medium Cardinality"
            else:
                label = "High Cardinality"

            records.append({
                'Column': name,
                'Unique Count': n_unique,
                'Total Count': n_total,
                'Unique Ratio': round(ratio, 4),
                'Type': label,
                'Data Type': str(df[name].dtype)
            })
        except Exception as e:
            # A single bad column should not sink the whole report.
            logger.warning(f"Error processing column {name}: {str(e)}")
            continue

    return pd.DataFrame(records)
153
 
154
  @st.cache_data
155
+ @handle_errors
156
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate memory optimization suggestions with validation.

    Scans each column and suggests cheaper dtypes: object -> category for
    repetitive text, int64 -> int32 when the value range fits. Returns a dict
    with 'suggestions' (list of per-column dicts), 'current_memory_mb',
    'potential_savings_mb' and 'potential_savings_pct'.
    """
    if df is None or df.empty:
        return {'suggestions': [], 'current_memory_mb': 0, 'potential_savings_mb': 0, 'potential_savings_pct': 0}

    suggestions = []
    # deep=True accounts for the actual Python-object payload of object columns.
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0

    for col in df.columns:
        try:
            if df[col].dtype == 'object' and not df[col].isnull().all():
                unique_ratio = df[col].nunique() / len(df)

                if unique_ratio < 0.5:  # Less than 50% unique values
                    # Calculate potential savings
                    # Estimate from a sample rather than converting the full
                    # column, then scale the ratio up to the whole column.
                    test_series = df[col].dropna().head(1000)  # Sample for estimation
                    if len(test_series) > 0:
                        category_memory = test_series.astype('category').memory_usage(deep=True)
                        object_memory = test_series.memory_usage(deep=True)
                        savings_ratio = (object_memory - category_memory) / object_memory

                        if savings_ratio > 0.1:  # More than 10% savings
                            estimated_savings = (df[col].memory_usage(deep=True) * savings_ratio) / 1024**2
                            suggestions.append({
                                'Column': col,
                                'Current Type': 'object',
                                'Suggested Type': 'category',
                                'Estimated Savings (MB)': round(estimated_savings, 2),
                                'Unique Ratio': round(unique_ratio, 3)
                            })
                            potential_savings += estimated_savings

            # Check for int64 that could be int32
            elif df[col].dtype == 'int64':
                # int32 range check; halving the footprint is exact for the
                # data buffer (8 -> 4 bytes per value).
                if df[col].min() >= -2147483648 and df[col].max() <= 2147483647:
                    savings = df[col].memory_usage(deep=True) * 0.5 / 1024**2
                    suggestions.append({
                        'Column': col,
                        'Current Type': 'int64',
                        'Suggested Type': 'int32',
                        'Estimated Savings (MB)': round(savings, 2),
                        'Unique Ratio': 'N/A'
                    })
                    potential_savings += savings

        except Exception as e:
            # Skip columns that fail analysis instead of aborting the report.
            logger.warning(f"Error analyzing memory for column {col}: {str(e)}")
            continue

    return {
        'suggestions': suggestions,
        'current_memory_mb': round(current_memory, 2),
        'potential_savings_mb': round(potential_savings, 2),
        'potential_savings_pct': round((potential_savings / current_memory) * 100, 1) if current_memory > 0 else 0
    }
212
 
213
  @st.cache_data
214
+ @handle_errors
215
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate missing data analysis with enhanced insights.

    Returns a DataFrame (Column, Missing Count, Missing %, Data Type) for the
    columns that have at least one null, sorted by Missing % descending, or an
    empty DataFrame when nothing is missing / input is unusable.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    missing_data = df.isnull().sum()

    if missing_data.sum() > 0:
        missing_df = pd.DataFrame({
            'Column': missing_data.index,
            'Missing Count': missing_data.values,
            # ndarray.round(2): the builtin round() does not accept an
            # ndarray (no __round__) and raised a TypeError here.
            'Missing %': ((missing_data.values / len(df)) * 100).round(2),
            'Data Type': [str(df[col].dtype) for col in missing_data.index]
        })

        result = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
        return result.reset_index(drop=True)

    return pd.DataFrame()
234
 
235
  @st.cache_data
236
+ @handle_errors
237
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate correlation matrix with validation"""
    if df is None or df.empty:
        return pd.DataFrame()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) < 2:
        return pd.DataFrame()

    # Drop columns that cannot correlate: all-NaN or constant values.
    usable = [
        c for c in numeric_cols
        if not df[c].isnull().all() and df[c].nunique() > 1
    ]

    if len(usable) < 2:
        return pd.DataFrame()

    return df[usable].corr()
257
 
258
  @st.cache_data
259
+ @handle_errors
260
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Enhanced column type detection"""
    if df is None or df.empty:
        return {'numeric': [], 'categorical': [], 'datetime': [], 'boolean': []}

    buckets = {
        'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
        'categorical': df.select_dtypes(include=['object', 'category']).columns.tolist(),
        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist(),
        'boolean': df.select_dtypes(include=['bool']).columns.tolist()
    }

    # Heuristic: an object column whose first non-null value parses as a
    # date is flagged as a candidate for datetime conversion.
    date_candidates = []
    for name in buckets['categorical']:
        if df[name].dtype != 'object':
            continue
        sample = df[name].dropna().head(100)
        if len(sample) == 0:
            continue
        try:
            pd.to_datetime(sample.iloc[0])
            date_candidates.append(name)
        except:
            pass

    if date_candidates:
        buckets['potential_datetime'] = date_candidates

    return buckets
288
 
289
  @st.cache_data
290
+ @handle_errors
291
def calculate_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.DataFrame:
    """Enhanced outlier detection with multiple methods"""
    # Guard clauses: unusable frame, unknown column, or non-numeric data.
    if df is None or df.empty or column not in df.columns:
        return pd.DataFrame()
    if not pd.api.types.is_numeric_dtype(df[column]):
        return pd.DataFrame()

    values = df[column].dropna()
    if len(values) == 0:
        return pd.DataFrame()

    if method == 'zscore':
        # |z| > 3 marks an outlier.
        flagged = values.index[np.abs(stats.zscore(values)) > 3]
        mask = df.index.isin(flagged)
    elif method == 'iqr':
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        if spread == 0:  # degenerate distribution: nothing can be an outlier
            return pd.DataFrame()
        low = q1 - 1.5 * spread
        high = q3 + 1.5 * spread
        mask = (df[column] < low) | (df[column] > high)
    else:  # percentile: outside the 1st-99th percentile band
        low = values.quantile(0.01)
        high = values.quantile(0.99)
        mask = (df[column] < low) | (df[column] > high)

    return df[mask]
326
 
327
  @st.cache_data
328
+ @handle_errors
329
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Detect columns with mixed data types and provide detailed analysis.

    An object column is "mixed" when some (but not nearly all) of its values
    fail numeric conversion — i.e. it looks like a numeric column polluted by
    stray text. Returns one dict per offending column with counts, percentage
    and up to five sample offending values.
    """
    if df is None or df.empty:
        return []

    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        try:
            # Skip if all values are null
            if df[col].isnull().all():
                continue

            # Try numeric conversion
            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
            # New nulls = values that existed but did not convert.
            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()

            # The 90% ceiling excludes genuinely textual columns where almost
            # everything fails conversion — those are not "mixed".
            if new_nulls > 0 and new_nulls < len(df[col]) * 0.9:  # Not too many conversion failures
                # Find problematic values
                original_not_null = df[col].notnull()
                converted_null = numeric_conversion.isnull()
                problematic_mask = original_not_null & converted_null

                if problematic_mask.sum() > 0:
                    # Keep only the five most frequent offenders as examples.
                    sample_problems = df[col][problematic_mask].value_counts().head(5)

                    mixed_type_issues.append({
                        'column': col,
                        'problematic_values': int(new_nulls),
                        'total_values': int(len(df[col])),
                        'percentage': round((new_nulls / len(df[col])) * 100, 2),
                        'sample_issues': sample_problems.to_dict()
                    })

        except Exception as e:
            # One broken column should not abort the scan.
            logger.warning(f"Error analyzing mixed types for column {col}: {str(e)}")
            continue

    return mixed_type_issues
368
 
369
  @st.cache_data
370
+ @handle_errors
371
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> Optional[pd.Series]:
    """Return the top_n most frequent values of a column.

    Parameters: df (may be None/empty), column (unknown names yield an empty
    Series), top_n (number of most-common values kept).

    The empty-return dtype is pinned explicitly: a bare pd.Series() is
    deprecated and its default dtype changed across pandas versions.
    """
    if df is None or df.empty or column not in df.columns:
        return pd.Series(dtype=object)

    return df[column].value_counts().head(top_n)
377
 
378
  @st.cache_data
379
+ @handle_errors
380
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> Optional[pd.DataFrame]:
    """Calculate group statistics with validation"""
    # Both columns must exist and the metric must be numeric.
    if df is None or df.empty or group_col not in df.columns or metric_col not in df.columns:
        return pd.DataFrame()
    if not pd.api.types.is_numeric_dtype(df[metric_col]):
        return pd.DataFrame()

    # Restrict to the 20 most frequent groups to keep the output manageable.
    keep = df[group_col].value_counts().head(20).index
    subset = df[df[group_col].isin(keep)]

    summary = (
        subset.groupby(group_col)[metric_col]
        .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
        .round(3)
    )

    return summary.reset_index()
397
 
398
  @st.cache_data
399
+ @handle_errors
400
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Enhanced data quality scoring with detailed feedback.

    Starts from 100 and subtracts capped penalties for missing values (max
    25), duplicate rows (max 20), constant columns (max 15), mixed-type
    columns (max 15) and tiny datasets (10). Returns score, issue strings,
    recommendation strings and a letter grade.
    """
    if df is None or df.empty:
        return {'score': 0, 'issues': ['Dataset is empty'], 'grade': 'F'}

    score = 100
    issues = []
    recommendations = []

    try:
        # Missing values assessment
        total_cells = len(df) * len(df.columns)
        missing_count = df.isnull().sum().sum()
        missing_pct = (missing_count / total_cells) * 100 if total_cells > 0 else 0

        if missing_pct > 0:
            # 2 points per percent missing, capped at 25.
            penalty = min(25, missing_pct * 2)
            score -= penalty
            issues.append(f"Missing values: {missing_pct:.1f}% of total data")

            if missing_pct < 5:
                recommendations.append("Low missing data - consider simple imputation")
            elif missing_pct < 20:
                recommendations.append("Moderate missing data - analyze patterns before imputation")
            else:
                recommendations.append("High missing data - investigate data collection process")

        # Duplicates assessment
        duplicate_count = df.duplicated().sum()
        duplicate_pct = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0

        if duplicate_pct > 0:
            penalty = min(20, duplicate_pct * 3)
            score -= penalty
            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
            recommendations.append("Remove or investigate duplicate records")

        # Constant columns (nunique <= 1 also catches all-NaN columns)
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            penalty = min(15, len(constant_cols) * 2)
            score -= penalty
            issues.append(f"Constant columns: {len(constant_cols)} columns have no variation")
            recommendations.append("Consider removing constant columns")

        # Mixed types (delegates to the sibling detect_mixed_types helper)
        mixed_types = detect_mixed_types(df)
        if mixed_types:
            penalty = min(15, len(mixed_types) * 3)
            score -= penalty
            issues.append(f"Mixed data types: {len(mixed_types)} columns need type conversion")
            recommendations.append("Fix data type inconsistencies")

        # Data size assessment
        if len(df) < 10:
            score -= 10
            issues.append("Very small dataset - statistical power may be limited")

        # Grade assignment (simple banded mapping from the final score)
        if score >= 95:
            grade = 'A+'
        elif score >= 90:
            grade = 'A'
        elif score >= 85:
            grade = 'B+'
        elif score >= 80:
            grade = 'B'
        elif score >= 75:
            grade = 'C+'
        elif score >= 70:
            grade = 'C'
        elif score >= 60:
            grade = 'D'
        else:
            grade = 'F'

        return {
            'score': max(0, round(score, 1)),
            'issues': issues,
            'recommendations': recommendations,
            'grade': grade
        }

    except Exception as e:
        # Degrade gracefully: report the failure as the quality result.
        logger.error(f"Error calculating data quality: {str(e)}")
        return {
            'score': 0,
            'issues': [f"Error calculating quality score: {str(e)}"],
            'recommendations': ['Please check your data format'],
            'grade': 'Error'
        }
491
 
492
def load_data(uploaded_file) -> Optional[pd.DataFrame]:
    """Unified data loading with comprehensive error handling.

    Accepts a Streamlit uploaded-file object (needs .read(), .seek(), .name),
    dispatches to the CSV/Excel loaders by extension, then validates and
    lightly cleans the result. Returns None (after showing an error in the
    UI) on any failure.
    """
    if uploaded_file is None:
        return None

    try:
        file_content = uploaded_file.read()
        # Rewind so other consumers of the upload can read it again.
        uploaded_file.seek(0)

        file_size_mb = len(file_content) / 1024**2

        # File size warning
        if file_size_mb > 100:
            st.warning(f"⚠️ Large file detected ({file_size_mb:.1f} MB). Processing may take longer.")

        # Load based on file extension
        if uploaded_file.name.lower().endswith('.csv'):
            df = load_csv_with_encoding(file_content, uploaded_file.name)
        elif uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
            df = load_excel_file(file_content)
        else:
            raise ValueError(f"Unsupported file format: {uploaded_file.name}")

        # Loaders are wrapped in handle_errors and return None on failure.
        if df is None:
            raise ValueError("Failed to load data from file")

        # Additional validations
        if df.empty:
            raise ValueError("The uploaded file contains no data")

        if len(df.columns) == 0:
            raise ValueError("No columns found in the dataset")

        # Clean column names (force string labels, strip stray whitespace)
        df.columns = df.columns.astype(str).str.strip()

        # Remove completely empty rows/columns
        df = df.dropna(how='all').dropna(axis=1, how='all')

        if df.empty:
            raise ValueError("No valid data remaining after cleaning empty rows/columns")

        logger.info(f"Successfully loaded and validated data: {df.shape}")
        return df

    except Exception as e:
        error_msg = f"Failed to load data: {str(e)}"
        logger.error(error_msg)
        st.error(error_msg)
        st.info("💡 **Tips for successful upload:**\n"
                "- Ensure file is not corrupted\n"
                "- Check file encoding (UTF-8 recommended)\n"
                "- Verify file has proper headers\n"
                "- File size should be under 200MB for optimal performance")
        return None
547
 
548
+ @handle_errors
549
def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, List[str]]:
    """Validate dataframe for analysis readiness"""
    if df is None:
        return False, ["No dataframe provided"]

    problems: List[str] = []

    if df.empty:
        problems.append("Dataset is empty")
    if len(df.columns) == 0:
        problems.append("No columns found")
    if len(df) < 2:
        problems.append("Insufficient data for analysis (minimum 2 rows required)")

    # Column labels must be non-blank strings.
    bad_names = [c for c in df.columns if not isinstance(c, str) or c.strip() == '']
    if bad_names:
        problems.append(f"Problematic column names detected: {len(bad_names)} columns")

    return len(problems) == 0, problems
571
+
572
+ @handle_errors
573
def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Optional[pd.DataFrame]:
    """Apply data cleaning operations with validation and rollback capability.

    operations is a list of dicts, each with a 'type' key:
      - 'fill_missing': needs 'column' and 'method' (mean/median/mode/drop,
        anything else uses the literal 'value' entry, default 0)
      - 'remove_duplicates': drops exact duplicate rows
      - 'remove_outliers': needs 'column'; delegates to calculate_outliers
    Operations apply sequentially on a copy; on any error the ORIGINAL df is
    returned unchanged (the rollback).
    """
    if df is None or df.empty:
        return df

    # Work on a copy so failure can roll back to the caller's frame.
    cleaned_df = df.copy()
    applied_operations = []

    try:
        for operation in operations:
            operation_type = operation.get('type')
            column = operation.get('column')

            # Validate operation
            if operation_type == 'fill_missing' and column in cleaned_df.columns:
                method = operation.get('method', 'mean')

                # mean/median silently fall through to the literal-value
                # branch when the column is not numeric.
                if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
                    fill_value = cleaned_df[column].mean()
                elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
                    fill_value = cleaned_df[column].median()
                elif method == 'mode':
                    mode_values = cleaned_df[column].mode()
                    fill_value = mode_values.iloc[0] if not mode_values.empty else 'Unknown'
                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[column])
                    applied_operations.append(f"Dropped {original_len - len(cleaned_df)} rows with missing {column}")
                    continue
                else:
                    fill_value = operation.get('value', 0)

                original_missing = cleaned_df[column].isnull().sum()
                cleaned_df[column] = cleaned_df[column].fillna(fill_value)
                applied_operations.append(f"Filled {original_missing} missing values in {column} using {method}")

            elif operation_type == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                applied_operations.append(f"Removed {removed} duplicate rows")

            elif operation_type == 'remove_outliers' and column in cleaned_df.columns:
                original_len = len(cleaned_df)
                # Default IQR method of the sibling helper.
                outliers = calculate_outliers(cleaned_df, column)
                if outliers is not None and not outliers.empty:
                    # Drop by index membership so prior row removals are safe.
                    cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
                    removed = original_len - len(cleaned_df)
                    applied_operations.append(f"Removed {removed} outliers from {column}")

        logger.info(f"Applied {len(applied_operations)} cleaning operations")
        return cleaned_df

    except Exception as e:
        error_msg = f"Error during data cleaning: {str(e)}"
        logger.error(error_msg)
        st.error(error_msg)
        return df  # Return original data if cleaning fails