entropy25 committed on
Commit
ee51cad
·
verified ·
1 Parent(s): 78b8458

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +202 -673
data_handler.py CHANGED
@@ -4,53 +4,38 @@ import numpy as np
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
7
- import chardet
8
- from io import BytesIO
9
  warnings.filterwarnings('ignore')
10
 
11
- # HuggingFace optimized data processing functions with enhanced caching
12
-
13
@st.cache_data(show_spinner=False)
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load a CSV from raw bytes, auto-detecting the text encoding.

    Tries chardet's guess first (sampling the first 10KB for speed), then a
    short list of common fallbacks.

    Args:
        file_content: Raw bytes of the uploaded CSV file.
        filename: Original file name, used only in the error message.

    Returns:
        Parsed DataFrame.

    Raises:
        Exception: If no candidate encoding can parse the content.
    """
    # Detect encoding from a 10KB sample; fall back to utf-8 on low
    # confidence, a None guess, or any chardet failure.
    try:
        detected = chardet.detect(file_content[:10000])
        # BUG FIX: chardet can return encoding=None; the old code would then
        # pass None to read_csv. Treat that like a low-confidence result.
        if detected['encoding'] and detected['confidence'] > 0.7:
            encoding = detected['encoding']
        else:
            encoding = 'utf-8'
    except Exception:
        encoding = 'utf-8'

    # Detected encoding first, then common fallbacks — deduplicated while
    # preserving order so utf-8 is not attempted twice.
    encodings_to_try = list(dict.fromkeys(
        [encoding, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    ))

    for enc in encodings_to_try:
        try:
            return pd.read_csv(BytesIO(file_content), encoding=enc)
        except Exception:
            continue

    # BUG FIX: report the actual file name instead of the literal '(unknown)'
    # (the filename parameter was previously accepted but never used).
    raise Exception(f"Cannot read CSV file '{filename}' with any supported encoding")
34
 
35
@st.cache_data(show_spinner=False)
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an uploaded Excel workbook from its raw bytes.

    Wraps any parser failure in a plain Exception with a readable message.
    """
    try:
        buffer = BytesIO(file_content)
        return pd.read_excel(buffer)
    except Exception as exc:
        raise Exception(f"Cannot read Excel file: {str(exc)}")
42
 
43
- @st.cache_data(show_spinner=False)
44
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
45
- """Calculate basic statistics with performance optimization"""
46
-
47
- # Optimize for large datasets
48
- if len(df) > 100000:
49
- sample_df = df.sample(n=50000, random_state=42)
50
- st.info("📊 Using statistical sample for large dataset analysis")
51
- else:
52
- sample_df = df
53
-
54
  dtype_counts = df.dtypes.value_counts()
55
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
56
 
@@ -59,710 +44,254 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
59
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
60
  'missing_values': int(df.isnull().sum().sum()),
61
  'dtypes': dtype_dict,
62
- 'duplicates': int(df.duplicated().sum()),
63
- 'sample_used': len(sample_df) != len(df)
64
- }
65
-
66
@st.cache_data(show_spinner=False)
def calculate_enhanced_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score data quality (0-100) with a letter grade, issues and advice.

    Penalties: missing values (max -30), duplicates (max -25), IQR outliers
    (max -20), mixed-type columns (max -15), constant columns (max -10).

    Returns:
        Dict with 'score', 'grade', 'color', 'issues', 'recommendations',
        'critical_issues', plus the underlying percentages and column lists.
    """
    score = 100
    issues = []
    recommendations = []
    critical_issues = []

    n_rows = len(df)
    total_cells = n_rows * len(df.columns)

    # Missing values analysis (max -30 points)
    # BUG FIX: guard the division so a zero-row/zero-column frame cannot raise.
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
    if missing_pct > 0:
        score -= min(30, missing_pct * 1.5)
        issues.append(f"Missing values: {missing_pct:.1f}%")

        if missing_pct > 20:
            critical_issues.append("High missing value rate")
            recommendations.append("🚨 Critical: Review data collection processes")
        elif missing_pct > 5:
            recommendations.append("🔧 Apply intelligent filling strategies")
        else:
            recommendations.append("✅ Missing values within acceptable limits")

    # Duplicates analysis (max -25 points)
    # BUG FIX: same zero-row guard.
    duplicate_pct = (df.duplicated().sum() / n_rows) * 100 if n_rows else 0.0
    if duplicate_pct > 0:
        score -= min(25, duplicate_pct * 3)
        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

        if duplicate_pct > 5:
            critical_issues.append("High duplication rate")
            recommendations.append("🚨 Investigate data collection pipeline")
        else:
            recommendations.append("🗑️ Remove duplicates before analysis")

    # Outliers analysis (max -20 points), 1.5*IQR fences per numeric column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    total_outliers = 0
    problematic_cols = []

    for col in numeric_cols:
        try:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1

            if IQR > 0:  # Avoid degenerate (constant) columns
                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
                outlier_pct = (len(outliers) / n_rows) * 100
                total_outliers += len(outliers)

                if outlier_pct > 5:
                    problematic_cols.append(col)
        except Exception:  # narrowed from a bare except
            continue

    if total_outliers > 0:
        outlier_overall_pct = (total_outliers / n_rows) * 100
        score -= min(20, outlier_overall_pct * 2)
        issues.append(f"Statistical outliers: {outlier_overall_pct:.1f}%")

        if problematic_cols:
            recommendations.append(f"📊 Investigate outliers in: {', '.join(problematic_cols[:3])}")

    # Type consistency analysis (max -15 points)
    mixed_type_issues = detect_mixed_types(df)
    if mixed_type_issues:
        score -= min(15, len(mixed_type_issues) * 5)
        issues.append(f"Type inconsistencies: {len(mixed_type_issues)} columns")
        recommendations.append("🔧 Standardize data types")

    # Constant columns analysis (max -10 points)
    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 3)
        issues.append(f"Constant columns: {len(constant_cols)}")
        recommendations.append("🗑️ Remove uninformative columns")

    # Grade assignment (color is the UI hex for the grade badge)
    if score >= 90:
        grade, color = "A", "#22c55e"
    elif score >= 80:
        grade, color = "B", "#3b82f6"
    elif score >= 70:
        grade, color = "C", "#f59e0b"
    elif score >= 60:
        grade, color = "D", "#f97316"
    else:
        grade, color = "F", "#ef4444"

    return {
        'score': max(0, score),
        'grade': grade,
        'color': color,
        'issues': issues,
        'recommendations': recommendations,
        'critical_issues': critical_issues,
        'missing_pct': missing_pct,
        'duplicate_pct': duplicate_pct,
        'outlier_pct': (total_outliers / n_rows) * 100 if n_rows > 0 else 0,
        'constant_cols': constant_cols,
        'mixed_type_cols': len(mixed_type_issues)
    }
174
 
175
@st.cache_data(show_spinner=False)
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column cardinality profile with business-value hints.

    For each column: unique count/ratio, missing %, a cardinality class,
    a business-value note, and a memory note for object columns that
    would shrink meaningfully as 'category'.
    """
    n_rows = len(df)
    records = []

    for name in df.columns:
        n_unique = df[name].nunique()
        ratio = n_unique / n_rows if n_rows > 0 else 0
        n_missing = df[name].isnull().sum()
        pct_missing = (n_missing / n_rows) * 100 if n_rows > 0 else 0

        # Classify by uniqueness; order matters (most specific first).
        if n_unique == 1:
            kind, value = "Constant", "None - Consider removal"
        elif n_unique == n_rows - n_missing:
            kind, value = "Unique Identifier", "High - Key for joins"
        elif ratio < 0.01:
            kind, value = "Very Low Cardinality", "Medium - Good for flags"
        elif ratio < 0.05:
            kind, value = "Low Cardinality", "High - Perfect for grouping"
        elif ratio < 0.5:
            kind, value = "Medium Cardinality", "Medium - Use for segmentation"
        else:
            kind, value = "High Cardinality", "Low - Avoid in group analysis"

        # Estimate savings from an object -> category conversion.
        note = "Optimized"
        if df[name].dtype == 'object' and ratio < 0.5:
            as_category = df[name].astype('category').memory_usage(deep=True)
            as_object = df[name].memory_usage(deep=True)
            saved_mb = (as_object - as_category) / 1024**2
            if saved_mb > 0.1:
                note = f"Save {saved_mb:.1f}MB with category type"

        records.append({
            'Column': name,
            'Unique Count': n_unique,
            'Unique Ratio': ratio,
            'Missing %': pct_missing,
            'Type': kind,
            'Business Value': value,
            'Data Type': str(df[name].dtype),
            'Memory Note': note
        })

    return pd.DataFrame(records)
228
 
229
@st.cache_data(show_spinner=False)
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing values per column with severity and fill advice.

    Returns an empty DataFrame when nothing is missing; otherwise one row
    per affected column, sorted by 'Missing %' descending.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        return pd.DataFrame()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100,
        'Data Type': [str(df[c].dtype) for c in null_counts.index]
    })

    def _severity(pct):
        # Severity buckets mirror the quality-score thresholds.
        if pct > 50:
            return "🚨 Critical"
        if pct > 20:
            return "⚠️ High"
        if pct > 5:
            return "🔸 Medium"
        return "🔹 Low"

    report['Severity'] = report['Missing %'].apply(_severity)

    def _suggestion(row):
        # Simple heuristic: drop very sparse columns; otherwise fill
        # numerics with median, text with mode.
        if row['Missing %'] > 50:
            return "Drop column - too many missing values"
        if 'int' in row['Data Type'] or 'float' in row['Data Type']:
            return "Fill with median (robust to outliers)"
        if 'object' in row['Data Type']:
            return "Fill with mode (most frequent value)"
        return "Manual review recommended"

    report['AI Suggestion'] = report.apply(_suggestion, axis=1)

    return report[report['Missing Count'] > 0].sort_values('Missing %', ascending=False)
275
 
276
@st.cache_data(show_spinner=False)
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlation matrix over numeric columns.

    Samples 25k rows (seeded) when the frame exceeds 50k rows to keep the
    computation fast; returns an empty frame with fewer than two numeric
    columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) <= 1:
        return pd.DataFrame()

    subset = df[numeric_cols]
    if len(df) > 50000:
        subset = subset.sample(n=25000, random_state=42)
    return subset.corr()
292
 
293
@st.cache_data(show_spinner=False)
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Bucket column names by broad dtype family.

    Returns keys 'numeric', 'categorical', 'datetime' and 'boolean', each
    mapping to a list of matching column names (possibly empty).
    """
    buckets = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
        'boolean': ['bool'],
    }
    return {
        label: df.select_dtypes(include=kinds).columns.tolist()
        for label, kinds in buckets.items()
    }
303
 
304
@st.cache_data(show_spinner=False)
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return the rows of *df* falling outside the 1.5*IQR fences of *column*.

    The returned copy gains two annotation columns: 'outlier_type'
    ('extreme_high'/'extreme_low') and 'severity' (distance from the median
    in standard deviations). Returns an empty frame for constant columns
    or on failure (after a Streamlit warning).
    """
    try:
        q_low = df[column].quantile(0.25)
        q_high = df[column].quantile(0.75)
        spread = q_high - q_low

        if spread == 0:  # constant column: no meaningful fences
            return pd.DataFrame()

        fence_low = q_low - 1.5 * spread
        fence_high = q_high + 1.5 * spread

        mask = (df[column] < fence_low) | (df[column] > fence_high)
        flagged = df[mask]

        if flagged.empty:
            return flagged

        flagged = flagged.copy()
        flagged['outlier_type'] = flagged[column].apply(
            lambda v: 'extreme_high' if v > fence_high else 'extreme_low'
        )
        center = df[column].median()
        scale = df[column].std()
        flagged['severity'] = flagged[column].apply(
            lambda v: abs(v - center) / scale if scale > 0 else 0
        )
        return flagged

    except Exception as e:
        st.warning(f"Could not calculate outliers for '{column}': {str(e)}")
        return pd.DataFrame()
336
 
337
@st.cache_data(show_spinner=False)
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns that are numeric except for a few stray values.

    A column is flagged when coercing it with pd.to_numeric introduces new
    nulls (i.e. some non-null values are not parseable as numbers).

    Returns:
        List of dicts with 'column', 'problematic_values' (count),
        'total_values', 'percentage', 'examples' (up to 5 offending values)
        and a conversion 'suggestion'.
    """
    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        try:
            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()

            if new_nulls > 0:
                # PERF FIX: reuse the conversion computed above instead of
                # calling pd.to_numeric a second time for the mask.
                problematic_mask = numeric_conversion.isnull() & df[col].notnull()
                problematic_values = df.loc[problematic_mask, col].unique()[:5]

                mixed_type_issues.append({
                    'column': col,
                    'problematic_values': new_nulls,
                    'total_values': len(df[col]),
                    'percentage': (new_nulls / len(df[col])) * 100,
                    'examples': problematic_values.tolist(),
                    # Suggest conversion only when <10% of values would be lost.
                    'suggestion': 'Convert to numeric with error handling' if new_nulls < len(df[col]) * 0.1 else 'Keep as text'
                })
        except Exception:  # narrowed from a bare except
            continue

    return mixed_type_issues
366
 
367
@st.cache_data(show_spinner=False)
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest dtype changes (object->category, int64 downcasts) that save memory.

    Only suggestions saving more than 0.1MB are reported.

    Returns:
        Dict with 'suggestions' (per-column detail dicts),
        'current_memory_mb', 'potential_savings_mb',
        'potential_savings_pct' and 'optimization_available'.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0

    for col in df.columns:
        col_memory = df[col].memory_usage(deep=True) / 1024**2

        if df[col].dtype == 'object':
            unique_ratio = df[col].nunique() / len(df)

            # Category pays off only when values repeat a lot.
            if unique_ratio < 0.5:
                try:
                    category_memory = df[col].astype('category').memory_usage(deep=True) / 1024**2
                    savings = col_memory - category_memory

                    if savings > 0.1:  # only surface meaningful wins
                        suggestions.append({
                            'column': col,
                            'current_type': 'object',
                            'suggested_type': 'category',
                            'current_memory_mb': col_memory,
                            'optimized_memory_mb': category_memory,
                            'savings_mb': savings,
                            'savings_pct': (savings / col_memory) * 100
                        })
                        potential_savings += savings
                except Exception:  # narrowed from a bare except
                    continue

        elif df[col].dtype == 'int64':
            col_min = df[col].min()
            col_max = df[col].max()

            # BUG FIX: the unsigned bounds used strict '<', which wrongly
            # rejected values equal to the exact type maxima (255, 65535,
            # 4294967295). Use '<=' to match np.iinfo limits, consistent
            # with the signed branch below.
            if col_min >= 0:  # Unsigned integers
                if col_max <= 255:
                    new_type = 'uint8'
                elif col_max <= 65535:
                    new_type = 'uint16'
                elif col_max <= 4294967295:
                    new_type = 'uint32'
                else:
                    new_type = 'int64'
            else:  # Signed integers
                if col_min >= -128 and col_max <= 127:
                    new_type = 'int8'
                elif col_min >= -32768 and col_max <= 32767:
                    new_type = 'int16'
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    new_type = 'int32'
                else:
                    new_type = 'int64'

            if new_type != 'int64':
                try:
                    optimized_memory = df[col].astype(new_type).memory_usage(deep=True) / 1024**2
                    savings = col_memory - optimized_memory

                    if savings > 0.1:
                        suggestions.append({
                            'column': col,
                            'current_type': 'int64',
                            'suggested_type': new_type,
                            'current_memory_mb': col_memory,
                            'optimized_memory_mb': optimized_memory,
                            'savings_mb': savings,
                            'savings_pct': (savings / col_memory) * 100
                        })
                        potential_savings += savings
                except Exception:
                    continue

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
        'optimization_available': len(suggestions) > 0
    }
451
-
452
@st.cache_data(show_spinner=False)
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Top-N value counts for *column*; empty Series if the column is unusable.

    Args:
        df: Source frame.
        column: Column name to count.
        top_n: Number of most frequent values to return (default 10).
    """
    try:
        # DEAD CODE FIX: the old implementation also computed a percentage
        # series that was never used or returned; it has been removed.
        return df[column].value_counts().head(top_n)
    except Exception:  # narrowed from a bare except; keep best-effort return
        return pd.Series()
465
 
466
@st.cache_data(show_spinner=False)
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Aggregate *metric_col* per *group_col* with dispersion extras.

    Adds 'cv' (coefficient of variation) and 'range' to the standard
    count/mean/median/std/min/max aggregates, sorted by mean descending.
    Shows a Streamlit error and returns an empty frame on failure.
    """
    try:
        summary = (
            df.groupby(group_col)[metric_col]
            .agg(['count', 'mean', 'median', 'std', 'min', 'max'])
            .round(3)
        )

        # Dispersion context: relative spread and absolute span per group.
        summary['cv'] = (summary['std'] / summary['mean']).round(3)
        summary['range'] = summary['max'] - summary['min']

        # Highest-mean groups first for quicker insight.
        return summary.sort_values('mean', ascending=False)

    except Exception as e:
        st.error(f"Error calculating group statistics: {str(e)}")
        return pd.DataFrame()
488
 
489
@st.cache_data(show_spinner=False)
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Backward compatibility wrapper.

    Older callers used this name; it delegates to
    calculate_enhanced_quality_score and returns its dict unchanged.
    """
    return calculate_enhanced_quality_score(df)
493
-
494
def load_data(uploaded_file) -> pd.DataFrame:
    """Load an uploaded CSV/Excel file into a DataFrame with validation.

    Enforces a 200MB upload cap, dispatches by extension, validates the
    result, and warns about very large frames. On any problem a Streamlit
    error is shown and None is returned.
    """
    if uploaded_file is None:
        return None

    try:
        # HuggingFace Spaces has practical upload limits; refuse oversize files.
        size_mb = len(uploaded_file.getvalue()) / 1024**2
        if size_mb > 200:
            st.error(f"File too large ({size_mb:.1f}MB). Please upload files under 200MB.")
            return None

        raw_bytes = uploaded_file.read()
        uploaded_file.seek(0)  # rewind so the caller can re-read if needed

        # Dispatch on extension.
        name = uploaded_file.name
        if name.endswith('.csv'):
            df = load_csv_with_encoding(raw_bytes, name)
        elif name.endswith(('.xlsx', '.xls')):
            df = load_excel_file(raw_bytes)
        else:
            st.error("Unsupported file format. Please upload CSV or Excel files.")
            return None

        # Basic sanity checks before handing the frame to the app.
        if df.empty:
            st.error("The uploaded file appears to be empty.")
            return None
        if len(df.columns) == 0:
            st.error("No columns detected in the file.")
            return None

        if len(df) > 100000:
            st.warning(f"⚡ Large dataset detected ({len(df):,} rows). Some operations will use sampling for performance.")

        return df

    except Exception as e:
        st.error(f"Error loading file: {str(e)}")
        st.info("💡 **Troubleshooting Tips:**\n- Ensure CSV files are properly formatted\n- Check for special characters in Excel files\n- Try saving Excel as CSV first")
        return None
540
-
541
def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Tuple[pd.DataFrame, List[str]]:
    """Apply a sequence of cleaning operations and log what was done.

    Supported operation dicts (keyed by 'type'):
      - fill_missing: {'column', 'method': 'mean'|'median'|'mode'|'drop'}
      - remove_duplicates
      - remove_outliers / cap_outliers: {'column'} (1.5*IQR fences)
      - convert_type: {'column', 'target_type': 'category'|'numeric'}
      - drop_column: {'column'}

    Each failed operation is recorded in the log instead of aborting.

    Returns:
        (cleaned copy of df, list of human-readable log messages)
    """
    cleaned_df = df.copy()
    operation_log = []

    for operation in operations:
        try:
            if operation['type'] == 'fill_missing':
                col = operation['column']
                method = operation['method']

                # GENERALIZATION: the old check compared dtype against the
                # literal names ['int64', 'float64'], silently skipping
                # int32/float32/etc. columns. is_numeric_dtype accepts any
                # numeric dtype and remains true for the original two.
                if method == 'mean' and pd.api.types.is_numeric_dtype(cleaned_df[col]):
                    fill_value = cleaned_df[col].mean()
                    cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                    operation_log.append(f"Filled missing values in '{col}' with mean ({fill_value:.2f})")

                elif method == 'median' and pd.api.types.is_numeric_dtype(cleaned_df[col]):
                    fill_value = cleaned_df[col].median()
                    cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                    operation_log.append(f"Filled missing values in '{col}' with median ({fill_value:.2f})")

                elif method == 'mode':
                    mode_values = cleaned_df[col].mode()
                    if not mode_values.empty:
                        fill_value = mode_values.iloc[0]
                        cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                        operation_log.append(f"Filled missing values in '{col}' with mode ('{fill_value}')")

                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[col])
                    removed = original_len - len(cleaned_df)
                    operation_log.append(f"Dropped {removed} rows with missing values in '{col}'")

            elif operation['type'] == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                if removed > 0:
                    operation_log.append(f"Removed {removed} duplicate rows")

            elif operation['type'] == 'remove_outliers':
                col = operation['column']
                Q1 = cleaned_df[col].quantile(0.25)
                Q3 = cleaned_df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = cleaned_df[(cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)]
                cleaned_df = cleaned_df[~cleaned_df.index.isin(outliers.index)]
                operation_log.append(f"Removed {len(outliers)} outliers from '{col}'")

            elif operation['type'] == 'cap_outliers':
                col = operation['column']
                Q1 = cleaned_df[col].quantile(0.25)
                Q3 = cleaned_df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                original_outliers = len(cleaned_df[(cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)])
                cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
                operation_log.append(f"Capped {original_outliers} outliers in '{col}' to statistical bounds")

            elif operation['type'] == 'convert_type':
                col = operation['column']
                target_type = operation['target_type']

                if target_type == 'category':
                    cleaned_df[col] = cleaned_df[col].astype('category')
                    operation_log.append(f"Converted '{col}' to category type")
                elif target_type == 'numeric':
                    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')
                    operation_log.append(f"Converted '{col}' to numeric type")

            elif operation['type'] == 'drop_column':
                col = operation['column']
                cleaned_df = cleaned_df.drop(columns=[col])
                operation_log.append(f"Dropped column '{col}'")

        except Exception as e:
            # Best-effort: record the failure and continue with the rest.
            operation_log.append(f"Failed to apply {operation['type']}: {str(e)}")

    return cleaned_df, operation_log
627
-
628
- # HuggingFace specific optimizations
629
-
630
- def optimize_dataframe_for_hf(df: pd.DataFrame) -> pd.DataFrame:
631
- """Apply HuggingFace specific optimizations"""
632
-
633
- optimized_df = df.copy()
634
-
635
- # Convert high-cardinality object columns to category
636
- for col in optimized_df.select_dtypes(include=['object']).columns:
637
- if optimized_df[col].nunique() / len(optimized_df) < 0.5:
638
- try:
639
- optimized_df[col] = optimized_df[col].astype('category')
640
- except:
641
- continue
642
 
643
- # Downcast numeric types for memory efficiency
644
- for col in optimized_df.select_dtypes(include=['int64']).columns:
645
- try:
646
- optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='integer')
647
- except:
648
- continue
649
 
650
- for col in optimized_df.select_dtypes(include=['float64']).columns:
651
- try:
652
- optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='float')
653
- except:
654
- continue
 
655
 
656
- return optimized_df
657
-
658
@st.cache_data(show_spinner=False)
def generate_sample_data() -> pd.DataFrame:
    """Generate sample dataset for demonstration.

    Builds a 1000-row synthetic customer table (seeded with 42 for
    reproducibility), then deliberately injects quality problems --
    missing values, outliers, duplicates, negative balances and mixed
    types -- so the app's quality checks have realistic issues to find.
    NOTE: the injected 'PENDING' strings make 'credit_score' an object
    column, and the duplicate concat grows the frame past 1000 rows.
    """

    np.random.seed(42)
    n_samples = 1000

    # Create realistic business dataset
    data = {
        'customer_id': [f"CUST_{i:06d}" for i in range(1, n_samples + 1)],
        'age': np.random.normal(35, 12, n_samples),
        'annual_income': np.random.lognormal(10.5, 0.5, n_samples),
        'credit_score': np.random.normal(650, 100, n_samples),
        'account_balance': np.random.normal(5000, 3000, n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n_samples),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_samples, p=[0.2, 0.5, 0.3]),
        'is_active': np.random.choice([True, False], n_samples, p=[0.8, 0.2]),
        'signup_date': pd.date_range('2020-01-01', periods=n_samples, freq='D')[:n_samples]
    }

    df = pd.DataFrame(data)

    # Inject realistic quality issues for demonstration

    # 1. Missing values in income (realistic - some customers don't disclose)
    missing_income_idx = np.random.choice(df.index, size=int(n_samples * 0.15), replace=False)
    df.loc[missing_income_idx, 'annual_income'] = np.nan

    # 2. Missing values in credit score (realistic - new customers)
    missing_credit_idx = np.random.choice(df.index, size=int(n_samples * 0.08), replace=False)
    df.loc[missing_credit_idx, 'credit_score'] = np.nan

    # 3. Outliers in age (data entry errors)
    outlier_age_idx = np.random.choice(df.index, size=25, replace=False)
    df.loc[outlier_age_idx, 'age'] = np.random.uniform(150, 999, 25)  # Obvious errors

    # 4. Outliers in income (legitimate high earners + errors)
    outlier_income_idx = np.random.choice(df.index, size=30, replace=False)
    df.loc[outlier_income_idx, 'annual_income'] = np.random.uniform(500000, 2000000, 30)

    # 5. Negative account balances (overdrafts - realistic)
    negative_balance_idx = np.random.choice(df.index, size=50, replace=False)
    df.loc[negative_balance_idx, 'account_balance'] = np.random.uniform(-5000, -100, 50)

    # 6. Duplicate records (system errors)
    duplicate_records = df.sample(n=35).copy()
    df = pd.concat([df, duplicate_records], ignore_index=True)

    # 7. Mixed types in a column (add some text to numeric column)
    mixed_type_idx = np.random.choice(df.index, size=15, replace=False)
    df.loc[mixed_type_idx, 'credit_score'] = 'PENDING'

    return df
711
-
712
- # Additional utility functions for HuggingFace deployment
713
 
714
def check_dataset_compatibility(df: pd.DataFrame) -> Dict[str, Any]:
    """Check a frame against HuggingFace processing limits.

    Flags >1M rows, >500MB in-memory size, and >100 columns, returning
    boolean status flags plus warning/recommendation strings.
    """
    report = {
        'size_ok': True,
        'memory_ok': True,
        'columns_ok': True,
        'warnings': [],
        'recommendations': []
    }

    # Row-count check (1M row ceiling for interactive work).
    if len(df) > 1000000:
        report['size_ok'] = False
        report['warnings'].append(f"Large dataset: {len(df):,} rows")
        report['recommendations'].append("Consider sampling for interactive analysis")

    # In-memory footprint check (500MB ceiling).
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    if memory_mb > 500:
        report['memory_ok'] = False
        report['warnings'].append(f"High memory usage: {memory_mb:.1f}MB")
        report['recommendations'].append("Apply memory optimization techniques")

    # Column-count check (wide frames slow the UI down).
    if len(df.columns) > 100:
        report['columns_ok'] = False
        report['warnings'].append(f"Many columns: {len(df.columns)}")
        report['recommendations'].append("Focus analysis on key business columns")

    return report
745
 
746
def get_smart_sample(df: pd.DataFrame, target_size: int = 10000) -> pd.DataFrame:
    """Sample about *target_size* rows while preserving data characteristics.

    Small frames are returned unchanged. When a categorical column exists,
    the sample is stratified proportionally on the first one; otherwise (or
    on any stratification failure) a seeded random sample is taken.
    """
    total = len(df)
    if total <= target_size:
        return df

    # Prefer stratified sampling on the first categorical column.
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        strata = cat_cols[0]
        try:
            stratified = df.groupby(strata, group_keys=False).apply(
                lambda x: x.sample(min(len(x), max(1, int(target_size * len(x) / len(df)))))
            )
            return stratified.reset_index(drop=True)
        except:
            pass  # fall through to plain random sampling

    # Seeded random sample as the fallback.
    return df.sample(n=target_size, random_state=42).reset_index(drop=True)
 
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
+ # All cached data processing functions
10
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load CSV with automatic encoding detection - cached.

    Args:
        file_content: Raw bytes of the uploaded CSV.
        filename: Original file name (kept for interface compatibility).

    Raises:
        Exception: If no candidate encoding can parse the content.
    """
    import chardet
    from io import BytesIO

    # PERF FIX: detect from a 10KB sample instead of the whole upload;
    # chardet is O(n) in input size and large files made this very slow.
    detected = chardet.detect(file_content[:10000])
    # BUG FIX: chardet may return encoding=None; fall back to utf-8 rather
    # than passing None to read_csv.
    encoding = detected['encoding'] or 'utf-8'

    try:
        return pd.read_csv(BytesIO(file_content), encoding=encoding)
    except Exception:  # narrowed from a bare except
        for enc in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
            try:
                return pd.read_csv(BytesIO(file_content), encoding=enc)
            except Exception:
                continue
        raise Exception("Cannot read file with any encoding")
 
 
 
 
 
29
 
30
@st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an uploaded Excel workbook from its raw bytes (cached)."""
    from io import BytesIO
    buffer = BytesIO(file_content)
    return pd.read_excel(buffer)
 
 
35
 
36
+ @st.cache_data
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
+ """Calculate basic statistics - cached"""
 
 
 
 
 
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
 
44
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
45
  'missing_values': int(df.isnull().sum().sum()),
46
  'dtypes': dtype_dict,
47
+ 'duplicates': int(df.duplicated().sum())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  }
49
 
50
@st.cache_data
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Calculate column cardinality analysis - cached.

    Classifies each column as Constant / Unique Identifier / Low /
    Medium / High cardinality from its unique-value ratio.
    """
    cardinality_data = []
    n_rows = len(df)

    for col in df.columns:
        unique_count = df[col].nunique()
        # BUG FIX: guard against ZeroDivisionError when the frame has
        # columns but zero rows.
        unique_ratio = unique_count / n_rows if n_rows else 0.0

        # Determine column type based on cardinality
        if unique_count == 1:
            col_type = "Constant"
        elif unique_count == n_rows and n_rows > 0:
            col_type = "Unique Identifier"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
        else:
            col_type = "High Cardinality"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Type': col_type,
            'Data Type': str(df[col].dtype)
        })

    return pd.DataFrame(cardinality_data)
80
 
81
@st.cache_data
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate memory optimization suggestions - cached.

    Suggests object->category conversions for columns with <50% unique
    values when the change would save more than 0.1MB.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0
    n_rows = len(df)

    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        # BUG FIX: guard against ZeroDivisionError on a zero-row frame
        # (ratio of 1.0 skips the column, matching "no savings possible").
        unique_ratio = df[col].nunique() / n_rows if n_rows else 1.0
        if unique_ratio >= 0.5:  # category only pays off for repetitive data
            continue

        category_memory = df[col].astype('category').memory_usage(deep=True)
        object_memory = df[col].memory_usage(deep=True)
        savings = (object_memory - category_memory) / 1024**2

        if savings > 0.1:  # More than 0.1MB savings
            suggestions.append({
                'column': col,
                'current_type': 'object',
                'suggested_type': 'category',
                'savings_mb': savings
            })
            potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
    }
112
+
113
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column - cached.

    Returns a DataFrame (Column, Missing Count, Missing %) restricted to
    columns with at least one null, sorted by percentage descending, or an
    empty DataFrame when nothing is missing at all.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        # Early exit: no missing data anywhere.
        return pd.DataFrame()

    summary = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100
    })
    summary = summary[summary['Missing Count'] > 0]
    return summary.sort_values('Missing %', ascending=False)
125
 
126
@st.cache_data
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlation matrix over the numeric columns - cached.

    Returns an empty DataFrame when fewer than two numeric columns exist.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) <= 1:
        return pd.DataFrame()
    return df[numeric_cols].corr()
 
 
 
 
 
 
 
 
 
 
131
 
132
@st.cache_data
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Group column names by broad dtype family - cached."""
    families = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
    }
    return {
        name: df.select_dtypes(include=kinds).columns.tolist()
        for name, kinds in families.items()
    }
140
+
141
@st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Descriptive statistics for one numeric column, ignoring NaNs - cached."""
    values = df[column].dropna()
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    return {
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'skewness': values.skew(),
        'kurtosis': values.kurtosis(),
        'min': values.min(),
        'max': values.max(),
        'q25': lower_quartile,
        'q75': upper_quartile,
    }
156
 
157
@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Rows whose `column` value falls outside the 1.5*IQR fences - cached."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Keep only the rows strictly outside the Tukey fences.
    outlier_mask = (df[column] < low) | (df[column] > high)
    return df[outlier_mask]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
@st.cache_data
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns containing values that fail numeric coercion - cached.

    For each object column, coerces to numeric; values that become NaN
    only after coercion are non-numeric strings. Columns with at least
    one such value are reported with counts and a percentage.
    """
    issues = []
    for col in df.select_dtypes(include=['object']).columns:
        coerced = pd.to_numeric(df[col], errors='coerce')
        # Nulls introduced by coercion (beyond pre-existing ones) mark
        # values that could not be parsed as numbers.
        newly_null = coerced.isnull().sum() - df[col].isnull().sum()
        if newly_null > 0:
            total = len(df[col])
            issues.append({
                'column': col,
                'problematic_values': newly_null,
                'total_values': total,
                'percentage': (newly_null / total) * 100
            })
    return issues
187
 
188
@st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Top `top_n` most frequent values of a categorical column - cached."""
    counts = df[column].value_counts()
    return counts.head(top_n)
 
 
 
 
 
 
 
 
 
192
 
193
@st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Contingency table between two categorical columns - cached."""
    rows, cols = df[col1], df[col2]
    return pd.crosstab(rows, cols)
197
+
198
@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Per-group mean/median/std/count of `metric_col` - cached."""
    grouped = df.groupby(group_col)[metric_col]
    return grouped.agg(['mean', 'median', 'std', 'count'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate an overall 0-100 data-quality score with a letter grade - cached.

    Penalties: missing values (max 30), duplicate rows (max 20),
    constant columns (max 10), mixed-type object columns (max 10).
    Returns {'score', 'issues', 'grade'}.
    """
    score = 100
    issues = []
    total_cells = len(df) * len(df.columns)

    # Missing values penalty — guard total_cells == 0 so an empty frame
    # does not raise ZeroDivisionError.
    if total_cells:
        missing_pct = (df.isnull().sum().sum() / total_cells) * 100
        if missing_pct > 0:
            score -= min(30, missing_pct * 2)  # Max 30 points penalty
            issues.append(f"Missing values: {missing_pct:.1f}%")

    # Duplicates penalty — guard len(df) == 0 for the same reason.
    if len(df):
        duplicate_pct = (df.duplicated().sum() / len(df)) * 100
        if duplicate_pct > 0:
            score -= min(20, duplicate_pct * 4)  # Max 20 points penalty
            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

    # Constant columns penalty
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 2)
        issues.append(f"Constant columns: {len(constant_cols)}")

    # Mixed types penalty
    mixed_types = detect_mixed_types(df)
    if mixed_types:
        score -= min(10, len(mixed_types) * 3)
        issues.append(f"Mixed type columns: {len(mixed_types)}")

    return {
        'score': max(0, score),
        'issues': issues,
        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
def load_data(uploaded_file):
    """Unified data loading: dispatch an uploaded file to the CSV or Excel loader.

    Reads the raw bytes once, then rewinds the buffer so the caller can
    reuse the file object. The extension check is case-insensitive so
    'DATA.CSV' is not mis-routed to the Excel loader; anything that is
    not .csv is treated as Excel.
    """
    file_content = uploaded_file.read()
    uploaded_file.seek(0)  # rewind so the upload can be read again later

    if uploaded_file.name.lower().endswith('.csv'):
        return load_csv_with_encoding(file_content, uploaded_file.name)
    return load_excel_file(file_content)
 
 
 
252
 
253
def _iqr_bounds(series: pd.Series) -> Tuple[float, float]:
    """Return the (lower, upper) Tukey fences (1.5 * IQR) for a numeric series."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _fill_missing(df: pd.DataFrame, column: str, method: str) -> pd.DataFrame:
    """Fill (or drop) missing values in one column of `df` using `method`."""
    if method == 'mean':
        df[column] = df[column].fillna(df[column].mean())
    elif method == 'median':
        df[column] = df[column].fillna(df[column].median())
    elif method == 'mode':
        modes = df[column].mode()
        # Fall back to 0 when the column has no mode (e.g. all values missing).
        df[column] = df[column].fillna(modes.iloc[0] if not modes.empty else 0)
    elif method == 'drop':
        df = df.dropna(subset=[column])
    return df


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations to a copy of `df`.

    Each operation is a dict keyed by 'type':
      - 'fill_missing':      {'column', 'method': 'mean'|'median'|'mode'|'drop'}
      - 'remove_duplicates': drop fully duplicated rows
      - 'remove_outliers':   {'column'} drop rows outside the 1.5*IQR fences
      - 'cap_outliers':      {'column'} clip values to the 1.5*IQR fences
      - 'convert_type':      {'column', 'target_type': 'category'}

    The input DataFrame is never mutated; a cleaned copy is returned.
    """
    cleaned_df = df.copy()

    for operation in operations:
        op_type = operation['type']

        if op_type == 'fill_missing':
            cleaned_df = _fill_missing(cleaned_df, operation['column'], operation['method'])

        elif op_type == 'remove_duplicates':
            cleaned_df = cleaned_df.drop_duplicates()

        elif op_type == 'remove_outliers':
            lower, upper = _iqr_bounds(cleaned_df[operation['column']])
            values = cleaned_df[operation['column']]
            cleaned_df = cleaned_df[(values >= lower) & (values <= upper)]

        elif op_type == 'cap_outliers':
            lower, upper = _iqr_bounds(cleaned_df[operation['column']])
            cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower, upper)

        elif op_type == 'convert_type':
            if operation['target_type'] == 'category':
                cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')

    return cleaned_df