entropy25 committed on
Commit
ed4ea1f
·
verified ·
1 Parent(s): 2fad68d

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +196 -529
data_handler.py CHANGED
@@ -2,101 +2,40 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
- from typing import Dict, List, Any, Tuple, Optional
6
  from scipy import stats
7
- import logging
8
-
9
  warnings.filterwarnings('ignore')
10
 
11
- # Configure logging
12
- logging.basicConfig(level=logging.INFO)
13
- logger = logging.getLogger(__name__)
14
-
15
- # Enhanced error handling decorator
16
- def handle_errors(func):
17
- """Decorator for consistent error handling"""
18
- def wrapper(*args, **kwargs):
19
- try:
20
- return func(*args, **kwargs)
21
- except Exception as e:
22
- logger.error(f"Error in {func.__name__}: {str(e)}")
23
- st.error(f"Error in {func.__name__}: {str(e)}")
24
- return None
25
- return wrapper
26
-
27
  @st.cache_data
28
- @handle_errors
29
- def load_csv_with_encoding(file_content: bytes, filename: str) -> Optional[pd.DataFrame]:
30
- """Load CSV with automatic encoding detection and enhanced error handling"""
 
 
 
 
31
  try:
32
- import chardet
33
- detected = chardet.detect(file_content)
34
- encoding = detected.get('encoding', 'utf-8')
35
-
36
  from io import BytesIO
37
- df = pd.read_csv(BytesIO(file_content), encoding=encoding)
38
-
39
- # Validate loaded data
40
- if df.empty:
41
- raise ValueError("The uploaded file is empty")
42
-
43
- if df.shape[1] == 1 and df.columns[0].count(',') > 0:
44
- # Might be semicolon separated
45
- df = pd.read_csv(BytesIO(file_content), encoding=encoding, sep=';')
46
-
47
- logger.info(f"Successfully loaded CSV: {df.shape}")
48
- return df
49
-
50
- except Exception as e:
51
- # Try alternative encodings
52
  encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
53
  for enc in encodings:
54
  try:
55
- df = pd.read_csv(BytesIO(file_content), encoding=enc)
56
- if not df.empty:
57
- logger.info(f"Loaded CSV with encoding {enc}: {df.shape}")
58
- return df
59
  except:
60
  continue
61
-
62
- raise Exception(f"Cannot read file with any standard encoding. Original error: {str(e)}")
63
 
64
  @st.cache_data
65
- @handle_errors
66
- def load_excel_file(file_content: bytes) -> Optional[pd.DataFrame]:
67
- """Load Excel file with enhanced error handling"""
68
  from io import BytesIO
69
-
70
- try:
71
- # Try loading first sheet
72
- df = pd.read_excel(BytesIO(file_content))
73
-
74
- if df.empty:
75
- raise ValueError("The Excel file is empty")
76
-
77
- logger.info(f"Successfully loaded Excel: {df.shape}")
78
- return df
79
-
80
- except Exception as e:
81
- # Try with different engines
82
- for engine in ['openpyxl', 'xlrd']:
83
- try:
84
- df = pd.read_excel(BytesIO(file_content), engine=engine)
85
- if not df.empty:
86
- logger.info(f"Loaded Excel with engine {engine}: {df.shape}")
87
- return df
88
- except:
89
- continue
90
-
91
- raise Exception(f"Cannot read Excel file. Error: {str(e)}")
92
 
93
  @st.cache_data
94
- @handle_errors
95
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
96
- """Calculate basic statistics with error handling"""
97
- if df is None or df.empty:
98
- return {}
99
-
100
  dtype_counts = df.dtypes.value_counts()
101
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
102
 
@@ -109,522 +48,250 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
109
  }
110
 
111
  @st.cache_data
112
- @handle_errors
113
  def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
114
- """Calculate column cardinality analysis with improved categorization"""
115
- if df is None or df.empty:
116
- return pd.DataFrame()
117
-
118
  cardinality_data = []
119
 
120
  for col in df.columns:
121
- try:
122
- unique_count = df[col].nunique()
123
- total_count = len(df)
124
- unique_ratio = unique_count / total_count if total_count > 0 else 0
125
-
126
- # Enhanced type classification
127
- if unique_count == 1:
128
- col_type = "Constant"
129
- elif unique_count == total_count:
130
- col_type = "Unique Identifier"
131
- elif unique_ratio < 0.01:
132
- col_type = "Very Low Cardinality"
133
- elif unique_ratio < 0.05:
134
- col_type = "Low Cardinality"
135
- elif unique_ratio < 0.5:
136
- col_type = "Medium Cardinality"
137
- else:
138
- col_type = "High Cardinality"
139
-
140
- cardinality_data.append({
141
- 'Column': col,
142
- 'Unique Count': unique_count,
143
- 'Total Count': total_count,
144
- 'Unique Ratio': round(unique_ratio, 4),
145
- 'Type': col_type,
146
- 'Data Type': str(df[col].dtype)
147
- })
148
- except Exception as e:
149
- logger.warning(f"Error processing column {col}: {str(e)}")
150
- continue
151
 
152
  return pd.DataFrame(cardinality_data)
153
 
154
  @st.cache_data
155
- @handle_errors
156
  def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
157
- """Calculate memory optimization suggestions with validation"""
158
- if df is None or df.empty:
159
- return {'suggestions': [], 'current_memory_mb': 0, 'potential_savings_mb': 0, 'potential_savings_pct': 0}
160
-
161
  suggestions = []
162
  current_memory = df.memory_usage(deep=True).sum() / 1024**2
163
  potential_savings = 0
164
 
165
  for col in df.columns:
166
- try:
167
- if df[col].dtype == 'object' and not df[col].isnull().all():
168
- unique_ratio = df[col].nunique() / len(df)
 
 
 
 
169
 
170
- if unique_ratio < 0.5: # Less than 50% unique values
171
- # Calculate potential savings
172
- test_series = df[col].dropna().head(1000) # Sample for estimation
173
- if len(test_series) > 0:
174
- category_memory = test_series.astype('category').memory_usage(deep=True)
175
- object_memory = test_series.memory_usage(deep=True)
176
- savings_ratio = (object_memory - category_memory) / object_memory
177
-
178
- if savings_ratio > 0.1: # More than 10% savings
179
- estimated_savings = (df[col].memory_usage(deep=True) * savings_ratio) / 1024**2
180
- suggestions.append({
181
- 'Column': col,
182
- 'Current Type': 'object',
183
- 'Suggested Type': 'category',
184
- 'Estimated Savings (MB)': round(estimated_savings, 2),
185
- 'Unique Ratio': round(unique_ratio, 3)
186
- })
187
- potential_savings += estimated_savings
188
-
189
- # Check for int64 that could be int32
190
- elif df[col].dtype == 'int64':
191
- if df[col].min() >= -2147483648 and df[col].max() <= 2147483647:
192
- savings = df[col].memory_usage(deep=True) * 0.5 / 1024**2
193
  suggestions.append({
194
- 'Column': col,
195
- 'Current Type': 'int64',
196
- 'Suggested Type': 'int32',
197
- 'Estimated Savings (MB)': round(savings, 2),
198
- 'Unique Ratio': 'N/A'
199
  })
200
  potential_savings += savings
201
-
202
- except Exception as e:
203
- logger.warning(f"Error analyzing memory for column {col}: {str(e)}")
204
- continue
205
 
206
  return {
207
  'suggestions': suggestions,
208
- 'current_memory_mb': round(current_memory, 2),
209
- 'potential_savings_mb': round(potential_savings, 2),
210
- 'potential_savings_pct': round((potential_savings / current_memory) * 100, 1) if current_memory > 0 else 0
211
  }
212
 
213
  @st.cache_data
214
- @handle_errors
215
  def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
216
- """Calculate missing data analysis with enhanced insights"""
217
- if df is None or df.empty:
218
- return pd.DataFrame()
219
-
220
  missing_data = df.isnull().sum()
221
-
222
  if missing_data.sum() > 0:
223
  missing_df = pd.DataFrame({
224
  'Column': missing_data.index,
225
  'Missing Count': missing_data.values,
226
- 'Missing %': round((missing_data.values / len(df)) * 100, 2),
227
- 'Data Type': [str(df[col].dtype) for col in missing_data.index]
228
  })
229
-
230
- result = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
231
- return result.reset_index(drop=True)
232
-
233
  return pd.DataFrame()
234
 
235
  @st.cache_data
236
- @handle_errors
237
  def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
238
- """Calculate correlation matrix with validation"""
239
- if df is None or df.empty:
240
- return pd.DataFrame()
241
-
242
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
243
-
244
- if len(numeric_cols) < 2:
245
- return pd.DataFrame()
246
-
247
- # Remove columns with all NaN or constant values
248
- valid_cols = []
249
- for col in numeric_cols:
250
- if not df[col].isnull().all() and df[col].nunique() > 1:
251
- valid_cols.append(col)
252
-
253
- if len(valid_cols) < 2:
254
- return pd.DataFrame()
255
-
256
- return df[valid_cols].corr()
257
 
258
  @st.cache_data
259
- @handle_errors
260
  def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
261
- """Enhanced column type detection"""
262
- if df is None or df.empty:
263
- return {'numeric': [], 'categorical': [], 'datetime': [], 'boolean': []}
264
-
265
- result = {
266
  'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
267
- 'categorical': df.select_dtypes(include=['object', 'category']).columns.tolist(),
268
- 'datetime': df.select_dtypes(include=['datetime64']).columns.tolist(),
269
- 'boolean': df.select_dtypes(include=['bool']).columns.tolist()
270
  }
271
-
272
- # Auto-detect potential datetime columns in object type
273
- potential_datetime = []
274
- for col in result['categorical']:
275
- if df[col].dtype == 'object':
276
- sample = df[col].dropna().head(100)
277
- if len(sample) > 0:
278
- try:
279
- pd.to_datetime(sample.iloc[0])
280
- potential_datetime.append(col)
281
- except:
282
- pass
283
-
284
- if potential_datetime:
285
- result['potential_datetime'] = potential_datetime
286
-
287
- return result
288
 
289
  @st.cache_data
290
- @handle_errors
291
- def calculate_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> pd.DataFrame:
292
- """Enhanced outlier detection with multiple methods"""
293
- if df is None or df.empty or column not in df.columns:
294
- return pd.DataFrame()
295
-
296
- if not pd.api.types.is_numeric_dtype(df[column]):
297
- return pd.DataFrame()
298
-
299
  series = df[column].dropna()
300
- if len(series) == 0:
301
- return pd.DataFrame()
302
-
303
- if method == 'iqr':
304
- Q1 = series.quantile(0.25)
305
- Q3 = series.quantile(0.75)
306
- IQR = Q3 - Q1
307
-
308
- if IQR == 0: # All values are the same
309
- return pd.DataFrame()
310
-
311
- lower_bound = Q1 - 1.5 * IQR
312
- upper_bound = Q3 + 1.5 * IQR
313
- outlier_mask = (df[column] < lower_bound) | (df[column] > upper_bound)
314
-
315
- elif method == 'zscore':
316
- z_scores = np.abs(stats.zscore(series))
317
- outlier_indices = series.index[z_scores > 3]
318
- outlier_mask = df.index.isin(outlier_indices)
319
-
320
- else: # percentile
321
- lower_bound = series.quantile(0.01)
322
- upper_bound = series.quantile(0.99)
323
- outlier_mask = (df[column] < lower_bound) | (df[column] > upper_bound)
324
-
325
- return df[outlier_mask]
326
 
327
  @st.cache_data
328
- @handle_errors
329
  def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
330
- """Detect columns with mixed data types and provide detailed analysis"""
331
- if df is None or df.empty:
332
- return []
333
-
334
  mixed_type_issues = []
335
 
336
  for col in df.select_dtypes(include=['object']).columns:
337
- try:
338
- # Skip if all values are null
339
- if df[col].isnull().all():
340
- continue
341
-
342
- # Try numeric conversion
343
- numeric_conversion = pd.to_numeric(df[col], errors='coerce')
344
- new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
345
-
346
- if new_nulls > 0 and new_nulls < len(df[col]) * 0.9: # Not too many conversion failures
347
- # Find problematic values
348
- original_not_null = df[col].notnull()
349
- converted_null = numeric_conversion.isnull()
350
- problematic_mask = original_not_null & converted_null
351
-
352
- if problematic_mask.sum() > 0:
353
- sample_problems = df[col][problematic_mask].value_counts().head(5)
354
-
355
- mixed_type_issues.append({
356
- 'column': col,
357
- 'problematic_values': int(new_nulls),
358
- 'total_values': int(len(df[col])),
359
- 'percentage': round((new_nulls / len(df[col])) * 100, 2),
360
- 'sample_issues': sample_problems.to_dict()
361
- })
362
-
363
- except Exception as e:
364
- logger.warning(f"Error analyzing mixed types for column {col}: {str(e)}")
365
- continue
366
 
367
  return mixed_type_issues
368
 
369
  @st.cache_data
370
- @handle_errors
371
- def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> Optional[pd.Series]:
372
- """Get value counts with validation"""
373
- if df is None or df.empty or column not in df.columns:
374
- return pd.Series()
375
-
376
  return df[column].value_counts().head(top_n)
377
 
378
  @st.cache_data
379
- @handle_errors
380
- def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> Optional[pd.DataFrame]:
381
- """Calculate group statistics with validation"""
382
- if df is None or df.empty or group_col not in df.columns or metric_col not in df.columns:
383
- return pd.DataFrame()
384
-
385
- if not pd.api.types.is_numeric_dtype(df[metric_col]):
386
- return pd.DataFrame()
387
-
388
- # Limit to top groups for performance
389
- top_groups = df[group_col].value_counts().head(20).index
390
- filtered_df = df[df[group_col].isin(top_groups)]
391
-
392
- stats_df = filtered_df.groupby(group_col)[metric_col].agg([
393
- 'count', 'mean', 'median', 'std', 'min', 'max'
394
- ]).round(3)
395
-
396
- return stats_df.reset_index()
397
 
398
  @st.cache_data
399
- @handle_errors
400
  def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
401
- """Enhanced data quality scoring with detailed feedback"""
402
- if df is None or df.empty:
403
- return {'score': 0, 'issues': ['Dataset is empty'], 'grade': 'F'}
404
-
405
  score = 100
406
  issues = []
407
- recommendations = []
408
-
409
- try:
410
- # Missing values assessment
411
- total_cells = len(df) * len(df.columns)
412
- missing_count = df.isnull().sum().sum()
413
- missing_pct = (missing_count / total_cells) * 100 if total_cells > 0 else 0
414
-
415
- if missing_pct > 0:
416
- penalty = min(25, missing_pct * 2)
417
- score -= penalty
418
- issues.append(f"Missing values: {missing_pct:.1f}% of total data")
419
-
420
- if missing_pct < 5:
421
- recommendations.append("Low missing data - consider simple imputation")
422
- elif missing_pct < 20:
423
- recommendations.append("Moderate missing data - analyze patterns before imputation")
424
- else:
425
- recommendations.append("High missing data - investigate data collection process")
426
-
427
- # Duplicates assessment
428
- duplicate_count = df.duplicated().sum()
429
- duplicate_pct = (duplicate_count / len(df)) * 100 if len(df) > 0 else 0
430
-
431
- if duplicate_pct > 0:
432
- penalty = min(20, duplicate_pct * 3)
433
- score -= penalty
434
- issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
435
- recommendations.append("Remove or investigate duplicate records")
436
-
437
- # Constant columns
438
- constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
439
- if constant_cols:
440
- penalty = min(15, len(constant_cols) * 2)
441
- score -= penalty
442
- issues.append(f"Constant columns: {len(constant_cols)} columns have no variation")
443
- recommendations.append("Consider removing constant columns")
444
-
445
- # Mixed types
446
- mixed_types = detect_mixed_types(df)
447
- if mixed_types:
448
- penalty = min(15, len(mixed_types) * 3)
449
- score -= penalty
450
- issues.append(f"Mixed data types: {len(mixed_types)} columns need type conversion")
451
- recommendations.append("Fix data type inconsistencies")
452
-
453
- # Data size assessment
454
- if len(df) < 10:
455
- score -= 10
456
- issues.append("Very small dataset - statistical power may be limited")
457
-
458
- # Grade assignment
459
- if score >= 95:
460
- grade = 'A+'
461
- elif score >= 90:
462
- grade = 'A'
463
- elif score >= 85:
464
- grade = 'B+'
465
- elif score >= 80:
466
- grade = 'B'
467
- elif score >= 75:
468
- grade = 'C+'
469
- elif score >= 70:
470
- grade = 'C'
471
- elif score >= 60:
472
- grade = 'D'
473
- else:
474
- grade = 'F'
475
-
476
- return {
477
- 'score': max(0, round(score, 1)),
478
- 'issues': issues,
479
- 'recommendations': recommendations,
480
- 'grade': grade
481
- }
482
 
483
- except Exception as e:
484
- logger.error(f"Error calculating data quality: {str(e)}")
485
- return {
486
- 'score': 0,
487
- 'issues': [f"Error calculating quality score: {str(e)}"],
488
- 'recommendations': ['Please check your data format'],
489
- 'grade': 'Error'
490
- }
491
-
492
- def load_data(uploaded_file) -> Optional[pd.DataFrame]:
493
- """Unified data loading with comprehensive error handling"""
494
- if uploaded_file is None:
495
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
- try:
498
- file_content = uploaded_file.read()
499
- uploaded_file.seek(0)
500
-
501
- file_size_mb = len(file_content) / 1024**2
502
-
503
- # File size warning
504
- if file_size_mb > 100:
505
- st.warning(f"⚠️ Large file detected ({file_size_mb:.1f} MB). Processing may take longer.")
506
-
507
- # Load based on file extension
508
- if uploaded_file.name.lower().endswith('.csv'):
509
- df = load_csv_with_encoding(file_content, uploaded_file.name)
510
- elif uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
511
- df = load_excel_file(file_content)
512
- else:
513
- raise ValueError(f"Unsupported file format: {uploaded_file.name}")
514
-
515
- if df is None:
516
- raise ValueError("Failed to load data from file")
517
-
518
- # Additional validations
519
- if df.empty:
520
- raise ValueError("The uploaded file contains no data")
521
-
522
- if len(df.columns) == 0:
523
- raise ValueError("No columns found in the dataset")
524
-
525
- # Clean column names
526
- df.columns = df.columns.astype(str).str.strip()
527
-
528
- # Remove completely empty rows/columns
529
- df = df.dropna(how='all').dropna(axis=1, how='all')
530
-
531
- if df.empty:
532
- raise ValueError("No valid data remaining after cleaning empty rows/columns")
533
-
534
- logger.info(f"Successfully loaded and validated data: {df.shape}")
535
- return df
536
-
537
- except Exception as e:
538
- error_msg = f"Failed to load data: {str(e)}"
539
- logger.error(error_msg)
540
- st.error(error_msg)
541
- st.info("💡 **Tips for successful upload:**\n"
542
- "- Ensure file is not corrupted\n"
543
- "- Check file encoding (UTF-8 recommended)\n"
544
- "- Verify file has proper headers\n"
545
- "- File size should be under 200MB for optimal performance")
546
- return None
547
 
548
- @handle_errors
549
- def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, List[str]]:
550
- """Validate dataframe for analysis readiness"""
551
- if df is None:
552
- return False, ["No dataframe provided"]
553
-
554
- issues = []
555
-
556
- if df.empty:
557
- issues.append("Dataset is empty")
558
-
559
- if len(df.columns) == 0:
560
- issues.append("No columns found")
561
-
562
- if len(df) < 2:
563
- issues.append("Insufficient data for analysis (minimum 2 rows required)")
564
-
565
- # Check for problematic column names
566
- problematic_cols = [col for col in df.columns if not isinstance(col, str) or col.strip() == '']
567
- if problematic_cols:
568
- issues.append(f"Problematic column names detected: {len(problematic_cols)} columns")
569
-
570
- return len(issues) == 0, issues
571
 
572
- @handle_errors
573
- def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Optional[pd.DataFrame]:
574
- """Apply data cleaning operations with validation and rollback capability"""
575
- if df is None or df.empty:
576
- return df
577
-
578
  cleaned_df = df.copy()
579
- applied_operations = []
580
-
581
- try:
582
- for operation in operations:
583
- operation_type = operation.get('type')
584
- column = operation.get('column')
585
-
586
- # Validate operation
587
- if operation_type == 'fill_missing' and column in cleaned_df.columns:
588
- method = operation.get('method', 'mean')
589
-
590
- if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
591
- fill_value = cleaned_df[column].mean()
592
- elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[column]):
593
- fill_value = cleaned_df[column].median()
594
- elif method == 'mode':
595
- mode_values = cleaned_df[column].mode()
596
- fill_value = mode_values.iloc[0] if not mode_values.empty else 'Unknown'
597
- elif method == 'drop':
598
- original_len = len(cleaned_df)
599
- cleaned_df = cleaned_df.dropna(subset=[column])
600
- applied_operations.append(f"Dropped {original_len - len(cleaned_df)} rows with missing {column}")
601
- continue
602
- else:
603
- fill_value = operation.get('value', 0)
604
-
605
- original_missing = cleaned_df[column].isnull().sum()
606
- cleaned_df[column] = cleaned_df[column].fillna(fill_value)
607
- applied_operations.append(f"Filled {original_missing} missing values in {column} using {method}")
608
-
609
- elif operation_type == 'remove_duplicates':
610
- original_len = len(cleaned_df)
611
- cleaned_df = cleaned_df.drop_duplicates()
612
- removed = original_len - len(cleaned_df)
613
- applied_operations.append(f"Removed {removed} duplicate rows")
614
-
615
- elif operation_type == 'remove_outliers' and column in cleaned_df.columns:
616
- original_len = len(cleaned_df)
617
- outliers = calculate_outliers(cleaned_df, column)
618
- if outliers is not None and not outliers.empty:
619
- cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
620
- removed = original_len - len(cleaned_df)
621
- applied_operations.append(f"Removed {removed} outliers from {column}")
622
-
623
- logger.info(f"Applied {len(applied_operations)} cleaning operations")
624
- return cleaned_df
625
 
626
- except Exception as e:
627
- error_msg = f"Error during data cleaning: {str(e)}"
628
- logger.error(error_msg)
629
- st.error(error_msg)
630
- return df # Return original data if cleaning fails
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
+ from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
+ # All cached data processing functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Parse CSV bytes into a DataFrame, guessing the text encoding.

    The encoding reported by ``chardet`` is tried first (when it reports
    one), followed by a fixed list of common encodings, in order.

    Args:
        file_content: Raw bytes of the CSV file.
        filename: Name of the source file. Not used in the body.

    Returns:
        The DataFrame from the first ``pd.read_csv`` attempt that succeeds.

    Raises:
        Exception: If every candidate encoding fails. The last underlying
            error is chained as ``__cause__`` so the real reason is visible.
    """
    from io import BytesIO

    import chardet

    fallback_encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    detected = chardet.detect(file_content).get('encoding')
    candidates = ([detected] if detected else []) + fallback_encodings

    last_error = None
    already_tried = set()
    for enc in candidates:
        normalized = enc.lower()
        if normalized in already_tried:
            # Re-reading with an encoding that already failed cannot succeed.
            continue
        already_tried.add(normalized)
        try:
            return pd.read_csv(BytesIO(file_content), encoding=enc)
        except Exception as exc:
            # Best-effort by design: any decode/parse failure moves on to the
            # next candidate. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            last_error = exc

    raise Exception("Cannot read file with any encoding") from last_error
 
29
 
@st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Read an Excel workbook from raw bytes using ``pd.read_excel``.

    Args:
        file_content: Raw bytes of the Excel file.

    Returns:
        Whatever ``pd.read_excel`` produces with its default arguments.
    """
    from io import BytesIO

    workbook_stream = BytesIO(file_content)
    return pd.read_excel(workbook_stream)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  @st.cache_data
 
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
+ """Calculate basic statistics - cached"""
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
 
48
  }
49
 
@st.cache_data
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise how many distinct values each column holds.

    Each column is labelled from its distinct-value count and its ratio of
    distinct values to rows: "Constant" (one distinct value), "Unique
    Identifier" (as many distinct values as rows), then "Low" (< 5%),
    "Medium" (< 50%) or "High Cardinality".

    Args:
        df: Frame to analyse.

    Returns:
        One row per column with the keys 'Column', 'Unique Count',
        'Unique Ratio', 'Type' and 'Data Type'. A frame with zero rows yields
        an empty result carrying those column names, instead of raising
        ZeroDivisionError when computing the ratio.
    """
    total_rows = len(df)
    if total_rows == 0:
        return pd.DataFrame(
            columns=['Column', 'Unique Count', 'Unique Ratio', 'Type', 'Data Type']
        )

    cardinality_data = []
    for col in df.columns:
        series = df[col]
        unique_count = series.nunique()
        unique_ratio = unique_count / total_rows

        # Order matters: a one-row frame is "Constant", not an identifier.
        if unique_count == 1:
            col_type = "Constant"
        elif unique_count == total_rows:
            col_type = "Unique Identifier"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
        else:
            col_type = "High Cardinality"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Type': col_type,
            'Data Type': str(series.dtype),
        })

    return pd.DataFrame(cardinality_data)
80
 
@st.cache_data
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest object columns that would be smaller as ``category`` dtype.

    For every object column whose distinct values make up less than half of
    the rows, the deep memory usage is measured before and after an
    ``astype('category')`` conversion. Savings above 0.1 MB are reported.

    Args:
        df: Frame to analyse.

    Returns:
        A dict with:
          - 'suggestions': list of dicts with 'column', 'current_type',
            'suggested_type' and 'savings_mb'
          - 'current_memory_mb': deep memory usage of ``df`` in MB
          - 'potential_savings_mb': sum of the reported savings
          - 'potential_savings_pct': savings as a percentage of current
            memory (0 when current memory is 0)
    """
    bytes_per_mb = 1024**2
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / bytes_per_mb
    potential_savings = 0
    row_count = len(df)

    # With zero rows there is nothing to convert, and the unique ratio below
    # would otherwise raise ZeroDivisionError.
    columns_to_check = df.columns if row_count > 0 else []

    for col in columns_to_check:
        series = df[col]
        if series.dtype != 'object':
            continue

        unique_ratio = series.nunique() / row_count
        if unique_ratio >= 0.5:
            # Mostly distinct values: not treated as a category candidate.
            continue

        category_memory = series.astype('category').memory_usage(deep=True)
        object_memory = series.memory_usage(deep=True)
        savings = (object_memory - category_memory) / bytes_per_mb

        if savings > 0.1:  # only report savings larger than 0.1 MB
            suggestions.append({
                'column': col,
                'current_type': 'object',
                'suggested_type': 'category',
                'savings_mb': savings,
            })
            potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
    }
112
 
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Report the columns that contain null values.

    Args:
        df: Frame to analyse.

    Returns:
        A frame with 'Column', 'Missing Count' and 'Missing %' for every
        column that has at least one null, sorted by 'Missing %' descending.
        An empty DataFrame when nothing is missing.
    """
    null_counts = df.isnull().sum()
    if not (null_counts.sum() > 0):
        return pd.DataFrame()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100
    })
    has_missing = report['Missing Count'] > 0
    return report[has_missing].sort_values('Missing %', ascending=False)
125
 
@st.cache_data
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return the correlation matrix of the numeric columns.

    Args:
        df: Frame to analyse.

    Returns:
        ``DataFrame.corr()`` over the numeric columns, or an empty DataFrame
        when fewer than two numeric columns exist.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_names) <= 1:
        return pd.DataFrame()
    return df[numeric_names].corr()
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
@st.cache_data
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Group column names by dtype family.

    Args:
        df: Frame to inspect.

    Returns:
        A dict with the keys 'numeric', 'categorical' (object dtype) and
        'datetime' (datetime64 dtype), each mapping to a list of column names.
    """
    def names_of(dtype_selectors):
        # Column labels whose dtype matches any of the given selectors.
        return df.select_dtypes(include=dtype_selectors).columns.tolist()

    grouped = {}
    grouped['numeric'] = names_of([np.number])
    grouped['categorical'] = names_of(['object'])
    grouped['datetime'] = names_of(['datetime64'])
    return grouped
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
@st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Compute descriptive statistics for one column, ignoring nulls.

    Args:
        df: Frame holding the data.
        column: Name of the column to summarise.

    Returns:
        A dict with 'mean', 'median', 'std', 'skewness', 'kurtosis', 'min',
        'max', 'q25' and 'q75'.
    """
    values = df[column].dropna()

    # (result key, Series method) pairs, in the order the keys are emitted.
    reducers = (
        ('mean', 'mean'),
        ('median', 'median'),
        ('std', 'std'),
        ('skewness', 'skew'),
        ('kurtosis', 'kurtosis'),
        ('min', 'min'),
        ('max', 'max'),
    )
    summary = {key: getattr(values, method)() for key, method in reducers}
    summary['q25'] = values.quantile(0.25)
    summary['q75'] = values.quantile(0.75)
    return summary
156
+
@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Select the rows whose value in ``column`` is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile.

    Args:
        df: Frame holding the data.
        column: Name of the column to test.

    Returns:
        The rows of ``df`` whose ``column`` value falls outside those bounds.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    below = values < first_quartile - 1.5 * spread
    above = values > third_quartile + 1.5 * spread
    return df[below | above]
 
 
 
 
167
 
@st.cache_data
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns that mix numeric and non-numeric values.

    Every object column is coerced with ``pd.to_numeric(errors='coerce')``.
    Values that were present but became null are the non-numeric ones. A
    column is reported only when it has such values AND at least one value
    that did convert; a purely textual column is not "mixed" and is skipped.

    Args:
        df: Frame to inspect.

    Returns:
        A list of dicts, one per mixed column, with 'column',
        'problematic_values' (count of non-numeric values), 'total_values'
        (row count) and 'percentage' (problematic share of all rows). Counts
        are plain Python numbers.
    """
    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        original = df[col]
        as_numeric = pd.to_numeric(original, errors='coerce')
        failed_conversions = int(as_numeric.isnull().sum() - original.isnull().sum())

        # Without at least one successful conversion the column is simply
        # text, not a mix of types.
        if failed_conversions > 0 and as_numeric.notnull().any():
            total_values = len(original)
            mixed_type_issues.append({
                'column': col,
                'problematic_values': failed_conversions,
                'total_values': total_values,
                'percentage': (failed_conversions / total_values) * 100,
            })

    return mixed_type_issues
187
 
@st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Return the most frequent values of a column.

    Args:
        df: Frame holding the data.
        column: Name of the column to count.
        top_n: Maximum number of entries to return.

    Returns:
        The first ``top_n`` entries of ``value_counts()`` for the column.
    """
    frequencies = df[column].value_counts()
    return frequencies.head(top_n)
192
 
@st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Cross-tabulate two columns.

    Args:
        df: Frame holding the data.
        col1: Column whose values form the rows of the table.
        col2: Column whose values form the columns of the table.

    Returns:
        The result of ``pd.crosstab`` for the two columns.
    """
    row_values = df[col1]
    column_values = df[col2]
    return pd.crosstab(row_values, column_values)
197
+
@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Aggregate a metric column per group.

    Args:
        df: Frame holding the data.
        group_col: Column to group by.
        metric_col: Column to aggregate.

    Returns:
        A frame indexed by group with 'mean', 'median', 'std' and 'count'.
    """
    aggregations = ['mean', 'median', 'std', 'count']
    metric_by_group = df.groupby(group_col)[metric_col]
    return metric_by_group.agg(aggregations)
 
 
 
 
 
 
 
 
 
 
202
 
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score a frame's quality from 0 to 100 and assign a letter grade.

    The score starts at 100 and loses capped penalties for:
      - missing values (2 points per percent, at most 30)
      - duplicate rows (4 points per percent, at most 20)
      - constant columns (2 points each, at most 10)
      - mixed-type columns per ``detect_mixed_types`` (3 points each, at most 10)

    Args:
        df: Frame to assess.

    Returns:
        A dict with 'score', 'issues' (list of messages) and 'grade'
        ('A' >= 90, 'B' >= 80, 'C' >= 70, 'D' >= 60, otherwise 'F').
        A frame with no rows or no columns scores 0 with grade 'F'.
    """
    row_count = len(df)
    total_cells = row_count * len(df.columns)
    if total_cells == 0:
        # The percentages below would divide by zero and come out as NaN,
        # which previously let an empty dataset pass as a perfect 100 / 'A'.
        return {'score': 0, 'issues': ['Dataset is empty'], 'grade': 'F'}

    score = 100
    issues = []

    # Missing values penalty
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100
    if missing_pct > 0:
        score -= min(30, missing_pct * 2)
        issues.append(f"Missing values: {missing_pct:.1f}%")

    # Duplicates penalty
    duplicate_pct = (df.duplicated().sum() / row_count) * 100
    if duplicate_pct > 0:
        score -= min(20, duplicate_pct * 4)
        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

    # Constant columns penalty
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 2)
        issues.append(f"Constant columns: {len(constant_cols)}")

    # Mixed types penalty
    mixed_types = detect_mixed_types(df)
    if mixed_types:
        score -= min(10, len(mixed_types) * 3)
        issues.append(f"Mixed type columns: {len(mixed_types)}")

    grade_thresholds = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    grade = next(
        (letter for threshold, letter in grade_thresholds if score >= threshold),
        'F',
    )

    return {
        'score': max(0, score),
        'issues': issues,
        'grade': grade,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
def load_data(uploaded_file):
    """Load an uploaded file into a DataFrame.

    Args:
        uploaded_file: Object exposing ``read()``, ``seek()`` and ``name``.
            The whole content is read, then the stream is rewound to the
            start with ``seek(0)``.

    Returns:
        The result of ``load_csv_with_encoding`` when the name ends in
        '.csv' (case-insensitive, so 'DATA.CSV' is handled too); otherwise
        the result of ``load_excel_file``.
    """
    file_content = uploaded_file.read()
    uploaded_file.seek(0)

    # Compare on the lower-cased name: an upper-case extension used to fall
    # through to the Excel reader.
    if uploaded_file.name.lower().endswith('.csv'):
        return load_csv_with_encoding(file_content, uploaded_file.name)
    return load_excel_file(file_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
def _iqr_bounds(series: pd.Series, factor: float = 1.5):
    """Return the (lower, upper) IQR fences of ``series``."""
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    return first_quartile - factor * spread, third_quartile + factor * spread


def _fill_missing(frame: pd.DataFrame, column: str, method: str) -> pd.DataFrame:
    """Apply one 'fill_missing' method to ``column`` and return the frame."""
    values = frame[column]
    if method == 'mean':
        frame[column] = values.fillna(values.mean())
    elif method == 'median':
        frame[column] = values.fillna(values.median())
    elif method == 'mode':
        modes = values.mode()
        # A column with no non-null values has no mode; fall back to 0.
        frame[column] = values.fillna(modes.iloc[0] if not modes.empty else 0)
    elif method == 'drop':
        frame = frame.dropna(subset=[column])
    return frame


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations to a copy of ``df``.

    Each operation is a dict with a 'type' key. Supported types:
      - 'fill_missing': needs 'column' and 'method' ('mean', 'median',
        'mode' or 'drop')
      - 'remove_duplicates': drops duplicated rows
      - 'remove_outliers': needs 'column'; drops rows outside the 1.5 * IQR
        fences. Rows whose value is missing are kept, because a missing
        value is not an outlier
      - 'cap_outliers': needs 'column'; clips values to the IQR fences
      - 'convert_type': needs 'column' and 'target_type'; only 'category'
        is handled
    Unknown types and unknown methods are ignored.

    Args:
        df: Input frame; it is not modified.
        operations: Operations to apply, in order.

    Returns:
        The cleaned copy.

    Raises:
        KeyError: If an operation lacks a key its type requires, or names a
            column that does not exist.
    """
    cleaned_df = df.copy()

    for operation in operations:
        op_type = operation['type']

        if op_type == 'fill_missing':
            cleaned_df = _fill_missing(cleaned_df, operation['column'], operation['method'])

        elif op_type == 'remove_duplicates':
            cleaned_df = cleaned_df.drop_duplicates()

        elif op_type == 'remove_outliers':
            values = cleaned_df[operation['column']]
            lower_bound, upper_bound = _iqr_bounds(values)
            # NaN compares False against both fences; without the isna()
            # term every row with a missing value would be dropped too.
            keep = values.between(lower_bound, upper_bound) | values.isna()
            cleaned_df = cleaned_df[keep]

        elif op_type == 'cap_outliers':
            column = operation['column']
            lower_bound, upper_bound = _iqr_bounds(cleaned_df[column])
            cleaned_df[column] = cleaned_df[column].clip(lower_bound, upper_bound)

        elif op_type == 'convert_type':
            if operation['target_type'] == 'category':
                column = operation['column']
                cleaned_df[column] = cleaned_df[column].astype('category')

    return cleaned_df