entropy25 committed on
Commit
9cba783
·
verified ·
1 Parent(s): 7583e80

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +673 -202
data_handler.py CHANGED
@@ -4,38 +4,53 @@ import numpy as np
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
 
 
7
  warnings.filterwarnings('ignore')
8
 
9
- # All cached data processing functions
10
- @st.cache_data
 
11
  def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
12
- """Load CSV with automatic encoding detection - cached"""
13
- import chardet
14
-
15
- detected = chardet.detect(file_content)
16
- encoding = detected['encoding']
17
 
 
18
  try:
19
- from io import BytesIO
20
- return pd.read_csv(BytesIO(file_content), encoding=encoding)
21
  except:
22
- encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
23
- for enc in encodings:
24
- try:
25
- return pd.read_csv(BytesIO(file_content), encoding=enc)
26
- except:
27
- continue
28
- raise Exception("Cannot read file with any encoding")
 
 
 
 
 
29
 
30
- @st.cache_data
31
  def load_excel_file(file_content: bytes) -> pd.DataFrame:
32
- """Load Excel file - cached"""
33
- from io import BytesIO
34
- return pd.read_excel(BytesIO(file_content))
 
 
35
 
36
- @st.cache_data
37
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
38
- """Calculate basic statistics - cached"""
 
 
 
 
 
 
 
 
39
  dtype_counts = df.dtypes.value_counts()
40
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
41
 
@@ -44,254 +59,710 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
44
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
45
  'missing_values': int(df.isnull().sum().sum()),
46
  'dtypes': dtype_dict,
47
- 'duplicates': int(df.duplicated().sum())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  }
49
 
50
- @st.cache_data
51
  def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
52
- """Calculate column cardinality analysis - cached"""
 
53
  cardinality_data = []
54
 
55
  for col in df.columns:
56
  unique_count = df[col].nunique()
57
- unique_ratio = unique_count / len(df)
 
 
58
 
59
- # Determine column type based on cardinality
60
  if unique_count == 1:
61
  col_type = "Constant"
62
- elif unique_count == len(df):
 
63
  col_type = "Unique Identifier"
 
 
 
 
64
  elif unique_ratio < 0.05:
65
  col_type = "Low Cardinality"
 
66
  elif unique_ratio < 0.5:
67
  col_type = "Medium Cardinality"
 
68
  else:
69
  col_type = "High Cardinality"
 
 
 
 
 
 
 
 
 
 
70
 
71
  cardinality_data.append({
72
  'Column': col,
73
  'Unique Count': unique_count,
74
  'Unique Ratio': unique_ratio,
 
75
  'Type': col_type,
76
- 'Data Type': str(df[col].dtype)
 
 
77
  })
78
 
79
  return pd.DataFrame(cardinality_data)
80
 
81
- @st.cache_data
82
- def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
83
- """Calculate memory optimization suggestions - cached"""
84
- suggestions = []
85
- current_memory = df.memory_usage(deep=True).sum() / 1024**2
86
- potential_savings = 0
87
-
88
- for col in df.columns:
89
- if df[col].dtype == 'object':
90
- unique_ratio = df[col].nunique() / len(df)
91
- if unique_ratio < 0.5: # Less than 50% unique values
92
- # Estimate category memory usage
93
- category_memory = df[col].astype('category').memory_usage(deep=True)
94
- object_memory = df[col].memory_usage(deep=True)
95
- savings = (object_memory - category_memory) / 1024**2
96
-
97
- if savings > 0.1: # More than 0.1MB savings
98
- suggestions.append({
99
- 'column': col,
100
- 'current_type': 'object',
101
- 'suggested_type': 'category',
102
- 'savings_mb': savings
103
- })
104
- potential_savings += savings
105
-
106
- return {
107
- 'suggestions': suggestions,
108
- 'current_memory_mb': current_memory,
109
- 'potential_savings_mb': potential_savings,
110
- 'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
111
- }
112
-
113
- @st.cache_data
114
  def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
115
- """Calculate missing data analysis - cached"""
 
116
  missing_data = df.isnull().sum()
117
  if missing_data.sum() > 0:
118
  missing_df = pd.DataFrame({
119
  'Column': missing_data.index,
120
  'Missing Count': missing_data.values,
121
- 'Missing %': (missing_data.values / len(df)) * 100
 
122
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
 
124
  return pd.DataFrame()
125
 
126
- @st.cache_data
127
  def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
128
- """Calculate correlation matrix - cached"""
 
129
  numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
130
- return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
131
 
132
- @st.cache_data
133
  def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
134
- """Get column types - cached"""
 
135
  return {
136
  'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
137
  'categorical': df.select_dtypes(include=['object']).columns.tolist(),
138
- 'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
139
- }
140
-
141
- @st.cache_data
142
- def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
143
- """Calculate enhanced numeric statistics - cached"""
144
- series = df[column].dropna()
145
- return {
146
- 'mean': series.mean(),
147
- 'median': series.median(),
148
- 'std': series.std(),
149
- 'skewness': series.skew(),
150
- 'kurtosis': series.kurtosis(),
151
- 'min': series.min(),
152
- 'max': series.max(),
153
- 'q25': series.quantile(0.25),
154
- 'q75': series.quantile(0.75)
155
  }
156
 
157
- @st.cache_data
158
  def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
159
- """Calculate outliers using IQR method - cached"""
160
- Q1 = df[column].quantile(0.25)
161
- Q3 = df[column].quantile(0.75)
162
- IQR = Q3 - Q1
163
- lower_bound = Q1 - 1.5 * IQR
164
- upper_bound = Q3 + 1.5 * IQR
165
-
166
- return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- @st.cache_data
169
  def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
170
- """Detect columns with mixed data types - cached"""
 
171
  mixed_type_issues = []
172
 
173
  for col in df.select_dtypes(include=['object']).columns:
174
- # Try to convert to numeric
175
- numeric_conversion = pd.to_numeric(df[col], errors='coerce')
176
- new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
177
-
178
- if new_nulls > 0:
179
- mixed_type_issues.append({
180
- 'column': col,
181
- 'problematic_values': new_nulls,
182
- 'total_values': len(df[col]),
183
- 'percentage': (new_nulls / len(df[col])) * 100
184
- })
 
 
 
 
 
 
 
 
 
185
 
186
  return mixed_type_issues
187
 
188
- @st.cache_data
189
- def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
190
- """Get value counts for categorical column - cached"""
191
- return df[column].value_counts().head(top_n)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
- @st.cache_data
194
- def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
195
- """Calculate crosstab between two categorical columns - cached"""
196
- return pd.crosstab(df[col1], df[col2])
 
 
 
 
 
 
 
 
 
197
 
198
- @st.cache_data
199
  def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
200
- """Calculate group statistics - cached"""
201
- return df.groupby(group_col)[metric_col].agg(['mean', 'median', 'std', 'count'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- @st.cache_data
204
  def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
205
- """Calculate overall data quality score - cached"""
206
- score = 100
207
- issues = []
 
 
208
 
209
- # Missing values penalty
210
- missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
211
- if missing_pct > 0:
212
- penalty = min(30, missing_pct * 2) # Max 30 points penalty
213
- score -= penalty
214
- issues.append(f"Missing values: {missing_pct:.1f}%")
215
 
216
- # Duplicates penalty
217
- duplicate_pct = (df.duplicated().sum() / len(df)) * 100
218
- if duplicate_pct > 0:
219
- penalty = min(20, duplicate_pct * 4) # Max 20 points penalty
220
- score -= penalty
221
- issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- # Constant columns penalty
224
- constant_cols = [col for col in df.columns if df[col].nunique() == 1]
225
- if constant_cols:
226
- penalty = min(10, len(constant_cols) * 2)
227
- score -= penalty
228
- issues.append(f"Constant columns: {len(constant_cols)}")
229
 
230
- # Mixed types penalty
231
- mixed_types = detect_mixed_types(df)
232
- if mixed_types:
233
- penalty = min(10, len(mixed_types) * 3)
234
- score -= penalty
235
- issues.append(f"Mixed type columns: {len(mixed_types)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- return {
238
- 'score': max(0, score),
239
- 'issues': issues,
240
- 'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
- def load_data(uploaded_file):
244
- """Unified data loading function"""
245
- file_content = uploaded_file.read()
246
- uploaded_file.seek(0)
247
 
248
- if uploaded_file.name.endswith('.csv'):
249
- return load_csv_with_encoding(file_content, uploaded_file.name)
250
- else:
251
- return load_excel_file(file_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
254
- """Apply data cleaning operations"""
255
- cleaned_df = df.copy()
256
 
257
- for operation in operations:
258
- if operation['type'] == 'fill_missing':
259
- if operation['method'] == 'mean':
260
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
261
- cleaned_df[operation['column']].mean())
262
- elif operation['method'] == 'median':
263
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
264
- cleaned_df[operation['column']].median())
265
- elif operation['method'] == 'mode':
266
- cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
267
- cleaned_df[operation['column']].mode().iloc[0] if not cleaned_df[operation['column']].mode().empty else 0)
268
- elif operation['method'] == 'drop':
269
- cleaned_df = cleaned_df.dropna(subset=[operation['column']])
270
-
271
- elif operation['type'] == 'remove_duplicates':
272
- cleaned_df = cleaned_df.drop_duplicates()
273
-
274
- elif operation['type'] == 'remove_outliers':
275
- Q1 = cleaned_df[operation['column']].quantile(0.25)
276
- Q3 = cleaned_df[operation['column']].quantile(0.75)
277
- IQR = Q3 - Q1
278
- lower_bound = Q1 - 1.5 * IQR
279
- upper_bound = Q3 + 1.5 * IQR
280
- cleaned_df = cleaned_df[
281
- (cleaned_df[operation['column']] >= lower_bound) &
282
- (cleaned_df[operation['column']] <= upper_bound)
283
- ]
284
-
285
- elif operation['type'] == 'cap_outliers':
286
- Q1 = cleaned_df[operation['column']].quantile(0.25)
287
- Q3 = cleaned_df[operation['column']].quantile(0.75)
288
- IQR = Q3 - Q1
289
- lower_bound = Q1 - 1.5 * IQR
290
- upper_bound = Q3 + 1.5 * IQR
291
- cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower_bound, upper_bound)
292
-
293
- elif operation['type'] == 'convert_type':
294
- if operation['target_type'] == 'category':
295
- cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')
296
 
297
- return cleaned_df
 
 
4
  import warnings
5
  from typing import Dict, List, Any, Tuple
6
  from scipy import stats
7
+ import chardet
8
+ from io import BytesIO
9
  warnings.filterwarnings('ignore')
10
 
11
+ # HuggingFace optimized data processing functions with enhanced caching
12
+
13
@st.cache_data(show_spinner=False)
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Parse CSV bytes into a DataFrame, trying several text encodings in turn.

    The encoding guessed by chardet is tried first (only when chardet reports
    a confidence above 0.7), followed by a fixed list of fallbacks.

    Args:
        file_content: Raw bytes of the CSV file.
        filename: Name of the file; used only in the error message.

    Returns:
        The DataFrame produced by the first encoding that parses successfully.

    Raises:
        Exception: If every candidate encoding fails. The last underlying
            error is attached as the cause.
    """
    # Only the first 10 KB is sampled so detection stays fast on large uploads.
    try:
        detected = chardet.detect(file_content[:10000]) or {}
    except Exception:
        detected = {}

    guess = detected.get('encoding')
    confidence = detected.get('confidence') or 0
    primary = guess if guess and confidence > 0.7 else 'utf-8'

    # Drop repeated names (case-insensitively) so the same decode is not retried.
    candidates = []
    seen = set()
    for enc in (primary, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1'):
        key = enc.lower()
        if key not in seen:
            seen.add(key)
            candidates.append(enc)

    last_error = None
    for enc in candidates:
        try:
            return pd.read_csv(BytesIO(file_content), encoding=enc)
        except Exception as exc:
            # Best effort: remember the failure and move on to the next encoding.
            last_error = exc

    raise Exception(
        f"Cannot read CSV file '{filename}' with any supported encoding"
    ) from last_error
34
 
35
@st.cache_data(show_spinner=False)
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Read an Excel workbook supplied as raw bytes.

    Args:
        file_content: Raw bytes of the workbook.

    Returns:
        The DataFrame returned by ``pd.read_excel`` with default arguments.

    Raises:
        Exception: Wrapping any error raised while reading the workbook.
    """
    try:
        workbook_stream = BytesIO(file_content)
        frame = pd.read_excel(workbook_stream)
    except Exception as err:
        raise Exception(f"Cannot read Excel file: {str(err)}")
    return frame
42
 
43
+ @st.cache_data(show_spinner=False)
44
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
45
+ """Calculate basic statistics with performance optimization"""
46
+
47
+ # Optimize for large datasets
48
+ if len(df) > 100000:
49
+ sample_df = df.sample(n=50000, random_state=42)
50
+ st.info("📊 Using statistical sample for large dataset analysis")
51
+ else:
52
+ sample_df = df
53
+
54
  dtype_counts = df.dtypes.value_counts()
55
  dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
56
 
 
59
  'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
60
  'missing_values': int(df.isnull().sum().sum()),
61
  'dtypes': dtype_dict,
62
+ 'duplicates': int(df.duplicated().sum()),
63
+ 'sample_used': len(sample_df) != len(df)
64
+ }
65
+
66
@st.cache_data(show_spinner=False)
def calculate_enhanced_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score data quality from 100 downwards and explain each deduction.

    Deductions (each capped): missing cells (30), duplicate rows (25),
    IQR outliers (20), object columns with non-numeric values as reported by
    ``detect_mixed_types`` (15), and columns with at most one distinct
    value (10).

    The outlier percentage is the share of rows that hold an outlier in at
    least one numeric column, so it can never exceed 100%.

    Args:
        df: Frame to assess. A frame with no rows or no columns yields 0% for
            every row/cell based metric instead of dividing by zero.

    Returns:
        Dict with keys ``score`` (clamped at 0), ``grade``, ``color``,
        ``issues``, ``recommendations``, ``critical_issues``, ``missing_pct``,
        ``duplicate_pct``, ``outlier_pct``, ``constant_cols`` and
        ``mixed_type_cols``.
    """
    n_rows = len(df)
    n_cells = n_rows * len(df.columns)

    score = 100
    issues = []
    recommendations = []
    critical_issues = []

    # Missing values analysis (max -30 points)
    missing_pct = float(df.isnull().sum().sum() / n_cells * 100) if n_cells else 0.0
    if missing_pct > 0:
        score -= min(30, missing_pct * 1.5)
        issues.append(f"Missing values: {missing_pct:.1f}%")

        if missing_pct > 20:
            critical_issues.append("High missing value rate")
            recommendations.append("🚨 Critical: Review data collection processes")
        elif missing_pct > 5:
            recommendations.append("🔧 Apply intelligent filling strategies")
        else:
            recommendations.append("✅ Missing values within acceptable limits")

    # Duplicates analysis (max -25 points)
    duplicate_pct = float(df.duplicated().sum() / n_rows * 100) if n_rows else 0.0
    if duplicate_pct > 0:
        score -= min(25, duplicate_pct * 3)
        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

        if duplicate_pct > 5:
            critical_issues.append("High duplication rate")
            recommendations.append("🚨 Investigate data collection pipeline")
        else:
            recommendations.append("🗑️ Remove duplicates before analysis")

    # Outliers analysis (max -20 points)
    # Track flagged rows as one boolean mask so a row that is an outlier in
    # several columns is counted once.
    outlier_rows = np.zeros(n_rows, dtype=bool)
    problematic_cols = []

    for col in df.select_dtypes(include=[np.number]).columns:
        try:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1

            # Skip columns with no spread (also covers a NaN IQR).
            if not iqr > 0:
                continue

            col_mask = (
                (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
            ).to_numpy()
        except Exception:
            # Best effort: a column that cannot be analysed is left out.
            continue

        if col_mask.sum() / n_rows * 100 > 5:
            problematic_cols.append(col)
        outlier_rows |= col_mask

    outlier_pct = float(outlier_rows.sum() / n_rows * 100) if n_rows else 0.0
    if outlier_pct > 0:
        score -= min(20, outlier_pct * 2)
        issues.append(f"Statistical outliers: {outlier_pct:.1f}%")

        if problematic_cols:
            shown = ', '.join(str(name) for name in problematic_cols[:3])
            recommendations.append(f"📊 Investigate outliers in: {shown}")

    # Type consistency analysis (max -15 points)
    mixed_type_issues = detect_mixed_types(df)
    if mixed_type_issues:
        score -= min(15, len(mixed_type_issues) * 5)
        issues.append(f"Type inconsistencies: {len(mixed_type_issues)} columns")
        recommendations.append("🔧 Standardize data types")

    # Constant columns analysis (max -10 points)
    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 3)
        issues.append(f"Constant columns: {len(constant_cols)}")
        recommendations.append("🗑️ Remove uninformative columns")

    # Grade assignment (based on the score before clamping at 0)
    if score >= 90:
        grade, color = "A", "#22c55e"
    elif score >= 80:
        grade, color = "B", "#3b82f6"
    elif score >= 70:
        grade, color = "C", "#f59e0b"
    elif score >= 60:
        grade, color = "D", "#f97316"
    else:
        grade, color = "F", "#ef4444"

    return {
        'score': max(0, score),
        'grade': grade,
        'color': color,
        'issues': issues,
        'recommendations': recommendations,
        'critical_issues': critical_issues,
        'missing_pct': missing_pct,
        'duplicate_pct': duplicate_pct,
        'outlier_pct': outlier_pct,
        'constant_cols': constant_cols,
        'mixed_type_cols': len(mixed_type_issues)
    }
174
 
175
@st.cache_data(show_spinner=False)
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Classify every column by how many distinct values it holds.

    Args:
        df: Frame to analyse.

    Returns:
        One row per column with ``Column``, ``Unique Count``, ``Unique Ratio``
        (distinct non-null values / rows), ``Missing %``, ``Type``,
        ``Business Value``, ``Data Type`` and ``Memory Note``.
    """
    n_rows = len(df)
    cardinality_data = []

    for col in df.columns:
        series = df[col]
        unique_count = series.nunique()
        unique_ratio = unique_count / n_rows if n_rows > 0 else 0
        missing_count = series.isnull().sum()
        missing_pct = (missing_count / n_rows) * 100 if n_rows > 0 else 0

        # `<= 1` rather than `== 1`: an all-null column has zero distinct
        # values and would otherwise satisfy the identifier test below
        # (0 == rows - missing).
        if unique_count <= 1:
            col_type = "Constant"
            business_value = "None - Consider removal"
        elif unique_count == n_rows - missing_count:
            col_type = "Unique Identifier"
            business_value = "High - Key for joins"
        elif unique_ratio < 0.01:
            col_type = "Very Low Cardinality"
            business_value = "Medium - Good for flags"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
            business_value = "High - Perfect for grouping"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
            business_value = "Medium - Use for segmentation"
        else:
            col_type = "High Cardinality"
            business_value = "Low - Avoid in group analysis"

        # Memory impact estimation: only object columns with fewer than 50%
        # distinct values are compared against a category encoding.
        memory_note = "Optimized"
        if series.dtype == 'object' and unique_ratio < 0.5:
            category_memory = series.astype('category').memory_usage(deep=True)
            object_memory = series.memory_usage(deep=True)
            memory_savings = (object_memory - category_memory) / 1024**2
            if memory_savings > 0.1:
                memory_note = f"Save {memory_savings:.1f}MB with category type"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Missing %': missing_pct,
            'Type': col_type,
            'Business Value': business_value,
            'Data Type': str(series.dtype),
            'Memory Note': memory_note
        })

    return pd.DataFrame(cardinality_data)
228
 
229
@st.cache_data(show_spinner=False)
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column with a severity and a suggestion.

    Args:
        df: Frame to inspect.

    Returns:
        An empty DataFrame when nothing is missing. Otherwise one row per
        column that has missing values, sorted by ``Missing %`` descending,
        with columns ``Column``, ``Missing Count``, ``Missing %``,
        ``Data Type``, ``Severity`` and ``AI Suggestion``.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() <= 0:
        return pd.DataFrame()

    def severity_label(pct):
        # Thresholds are checked from most to least severe.
        if pct > 50:
            return "🚨 Critical"
        if pct > 20:
            return "⚠️ High"
        if pct > 5:
            return "🔸 Medium"
        return "🔹 Low"

    def suggestion_for(row):
        pct = row['Missing %']
        dtype_name = row['Data Type']

        if pct > 50:
            return "Drop column - too many missing values"
        if 'int' in dtype_name or 'float' in dtype_name:
            return "Fill with median (robust to outliers)"
        if 'object' in dtype_name:
            return "Fill with mode (most frequent value)"
        return "Manual review recommended"

    dtype_names = [str(df[name].dtype) for name in null_counts.index]
    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100,
        'Data Type': dtype_names
    })
    report['Severity'] = report['Missing %'].apply(severity_label)
    report['AI Suggestion'] = report.apply(suggestion_for, axis=1)

    has_missing = report['Missing Count'] > 0
    return report[has_missing].sort_values('Missing %', ascending=False)
275
 
276
@st.cache_data(show_spinner=False)
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlate the numeric columns of ``df``.

    Frames with more than 50,000 rows are reduced to a fixed-seed sample of
    25,000 rows before the correlation is computed.

    Args:
        df: Frame to analyse.

    Returns:
        The result of ``DataFrame.corr()`` over the numeric columns, or an
        empty DataFrame when fewer than two numeric columns exist.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_names) <= 1:
        return pd.DataFrame()

    numeric_frame = df[numeric_names]
    if len(df) > 50000:
        # Fixed seed keeps the sampled result reproducible.
        numeric_frame = numeric_frame.sample(n=25000, random_state=42)

    return numeric_frame.corr()
292
 
293
@st.cache_data(show_spinner=False)
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Group column names by dtype family.

    Args:
        df: Frame whose columns are classified.

    Returns:
        Dict with keys ``numeric``, ``categorical`` (object dtype),
        ``datetime`` and ``boolean``, each mapping to a list of column names.
    """
    dtype_selectors = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
        'boolean': ['bool'],
    }
    return {
        family: df.select_dtypes(include=selector).columns.tolist()
        for family, selector in dtype_selectors.items()
    }
303
 
304
@st.cache_data(show_spinner=False)
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile.

    Args:
        df: Frame to search.
        column: Name of the column to test.

    Returns:
        A copy of the outlier rows with two extra columns: ``outlier_type``
        (``'extreme_high'`` or ``'extreme_low'``) and ``severity`` (absolute
        distance from the column median divided by the column standard
        deviation, or 0 when the deviation is not positive). An empty
        DataFrame is returned when the IQR is 0 or when any error occurs;
        errors are reported through ``st.warning``.
    """
    try:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        if iqr == 0:  # No variation in data
            return pd.DataFrame()

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        outliers = df[(values < lower_bound) | (values > upper_bound)]

        # Add outlier context
        if not outliers.empty:
            outliers = outliers.copy()
            flagged = outliers[column]

            # Column-wide statistics are computed once, not once per row.
            center = values.median()
            spread = values.std()

            outliers['outlier_type'] = np.where(
                flagged > upper_bound, 'extreme_high', 'extreme_low'
            )
            if spread > 0:
                outliers['severity'] = (flagged - center).abs() / spread
            else:
                outliers['severity'] = 0

        return outliers

    except Exception as e:
        st.warning(f"Could not calculate outliers for '{column}': {str(e)}")
        return pd.DataFrame()
336
 
337
@st.cache_data(show_spinner=False)
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Report object columns that contain values which are not numeric.

    Each object-dtype column is converted with
    ``pd.to_numeric(errors='coerce')``; non-null values that become null are
    counted as problematic. Note that a column holding only text is reported
    as well, since every one of its values fails the conversion.

    Args:
        df: Frame to inspect.

    Returns:
        One dict per affected column with keys ``column``,
        ``problematic_values`` (count), ``total_values``, ``percentage``,
        ``examples`` (up to five distinct offending values) and
        ``suggestion``. Columns that raise during analysis are skipped.
    """
    mixed_type_issues = []

    for col in df.select_dtypes(include=['object']).columns:
        try:
            series = df[col]
            total_values = len(series)

            # Convert once and reuse the result for both the count and the
            # example lookup.
            numeric_conversion = pd.to_numeric(series, errors='coerce')
            problematic_mask = numeric_conversion.isnull() & series.notnull()
            new_nulls = problematic_mask.sum()

            if new_nulls > 0:
                examples = df.loc[problematic_mask, col].unique()[:5]

                if new_nulls < total_values * 0.1:
                    suggestion = 'Convert to numeric with error handling'
                else:
                    suggestion = 'Keep as text'

                mixed_type_issues.append({
                    'column': col,
                    'problematic_values': new_nulls,
                    'total_values': total_values,
                    'percentage': (new_nulls / total_values) * 100,
                    'examples': examples.tolist(),
                    'suggestion': suggestion
                })
        except Exception:
            # Best effort: a column that cannot be analysed is left out.
            continue

    return mixed_type_issues
366
 
367
def _smallest_int_dtype(col_min, col_max) -> str:
    """Return the narrowest integer dtype name that can hold [col_min, col_max]."""
    if col_min >= 0:
        candidates = ('uint8', 'uint16', 'uint32')
    else:
        candidates = ('int8', 'int16', 'int32')

    for dtype_name in candidates:
        limits = np.iinfo(dtype_name)
        # Inclusive bounds: the dtype's own min/max are representable.
        if limits.min <= col_min and col_max <= limits.max:
            return dtype_name
    return 'int64'


@st.cache_data(show_spinner=False)
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest cheaper dtypes for object and int64 columns.

    Object columns with fewer than 50% distinct values are tested against a
    ``category`` encoding; int64 columns are tested against the narrowest
    integer dtype that holds their value range. A suggestion is recorded
    only when it saves more than 0.1 MB.

    Args:
        df: Frame to analyse. An empty frame produces no suggestions.

    Returns:
        Dict with ``suggestions`` (list of dicts holding ``column``,
        ``current_type``, ``suggested_type``, ``current_memory_mb``,
        ``optimized_memory_mb``, ``savings_mb``, ``savings_pct``),
        ``current_memory_mb``, ``potential_savings_mb``,
        ``potential_savings_pct`` and ``optimization_available``.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0
    n_rows = len(df)

    for col in df.columns:
        series = df[col]
        col_dtype = series.dtype

        if col_dtype == 'object':
            # Guard against dividing by zero rows.
            if not n_rows or series.nunique() / n_rows >= 0.5:
                continue
            suggested_type = 'category'
        elif col_dtype == 'int64':
            suggested_type = _smallest_int_dtype(series.min(), series.max())
            if suggested_type == 'int64':
                continue
        else:
            continue

        try:
            col_memory = series.memory_usage(deep=True) / 1024**2
            optimized_memory = (
                series.astype(suggested_type).memory_usage(deep=True) / 1024**2
            )
        except Exception:
            # Best effort: skip columns that cannot be converted.
            continue

        savings = col_memory - optimized_memory
        if savings > 0.1:  # Significant savings
            suggestions.append({
                'column': col,
                'current_type': str(col_dtype),
                'suggested_type': suggested_type,
                'current_memory_mb': col_memory,
                'optimized_memory_mb': optimized_memory,
                'savings_mb': savings,
                'savings_pct': (savings / col_memory) * 100
            })
            potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
        'optimization_available': len(suggestions) > 0
    }
451
 
452
@st.cache_data(show_spinner=False)
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Return the ``top_n`` most frequent values of ``column``.

    Args:
        df: Frame to read from.
        column: Name of the column to count.
        top_n: Maximum number of entries to return.

    Returns:
        The head of ``df[column].value_counts()``, or an empty integer Series
        when the counts cannot be computed (for example, an unknown column).
    """
    try:
        return df[column].value_counts().head(top_n)
    except Exception:
        # An explicit dtype matches what value_counts() itself returns.
        return pd.Series(dtype='int64')
465
 
466
@st.cache_data(show_spinner=False)
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Aggregate ``metric_col`` per value of ``group_col``.

    Args:
        df: Frame to aggregate.
        group_col: Column whose values define the groups.
        metric_col: Column to summarise within each group.

    Returns:
        A frame indexed by group with ``count``, ``mean``, ``median``,
        ``std``, ``min``, ``max`` (rounded to 3 decimals), plus ``cv``
        (std / mean, rounded to 3 decimals) and ``range`` (max - min),
        sorted by ``mean`` descending. On any error the message is shown via
        ``st.error`` and an empty DataFrame is returned.
    """
    aggregations = ['count', 'mean', 'median', 'std', 'min', 'max']

    try:
        per_group = df.groupby(group_col)[metric_col]
        summary = per_group.agg(aggregations).round(3)

        # Derived columns are computed from the already-rounded statistics.
        variation = summary['std'] / summary['mean']
        summary['cv'] = variation.round(3)
        summary['range'] = summary['max'] - summary['min']

        return summary.sort_values('mean', ascending=False)

    except Exception as e:
        st.error(f"Error calculating group statistics: {str(e)}")
        return pd.DataFrame()
488
 
489
@st.cache_data(show_spinner=False)
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Legacy entry point that delegates to ``calculate_enhanced_quality_score``.

    Args:
        df: Frame to assess.

    Returns:
        Whatever ``calculate_enhanced_quality_score(df)`` returns.
    """
    quality_report = calculate_enhanced_quality_score(df)
    return quality_report
493
+
494
def load_data(uploaded_file) -> pd.DataFrame:
    """Load an uploaded CSV or Excel file into a DataFrame.

    Args:
        uploaded_file: Upload object exposing ``getvalue()``, ``seek()`` and
            ``name`` (presumably a Streamlit ``UploadedFile`` - confirm
            against the caller), or ``None``.

    Returns:
        The loaded DataFrame, or ``None`` when no file was given, the file is
        over 200 MB, the extension is unsupported, the data is empty, or
        loading failed. Every failure is reported through ``st.error``.
    """
    if uploaded_file is None:
        return None

    try:
        # Take the payload once; getvalue() does not depend on where the
        # file pointer currently is.
        file_content = uploaded_file.getvalue()
        file_size_mb = len(file_content) / 1024**2

        if file_size_mb > 200:  # 200MB limit for HF
            st.error(f"File too large ({file_size_mb:.1f}MB). Please upload files under 200MB.")
            return None

        uploaded_file.seek(0)  # Leave the pointer at the start for later readers

        # Load based on file extension (case-insensitive, so "DATA.CSV" works)
        lowered_name = uploaded_file.name.lower()
        if lowered_name.endswith('.csv'):
            df = load_csv_with_encoding(file_content, uploaded_file.name)
        elif lowered_name.endswith(('.xlsx', '.xls')):
            df = load_excel_file(file_content)
        else:
            st.error("Unsupported file format. Please upload CSV or Excel files.")
            return None

        # Basic validation. Columns are checked first because df.empty is
        # also true for a frame without columns.
        if len(df.columns) == 0:
            st.error("No columns detected in the file.")
            return None

        if df.empty:
            st.error("The uploaded file appears to be empty.")
            return None

        # Performance warning for large datasets
        if len(df) > 100000:
            st.warning(f"⚡ Large dataset detected ({len(df):,} rows). Some operations will use sampling for performance.")

        return df

    except Exception as e:
        st.error(f"Error loading file: {str(e)}")
        st.info("💡 **Troubleshooting Tips:**\n- Ensure CSV files are properly formatted\n- Check for special characters in Excel files\n- Try saving Excel as CSV first")
        return None
540
+
541
def _is_summary_numeric(series: pd.Series) -> bool:
    """Return True when mean/median imputation is meaningful for *series*."""
    # Any numeric width (int32, float32, nullable Int64, ...) qualifies; booleans
    # are excluded because a fractional mean is not a valid boolean fill value.
    return (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )


def _iqr_outlier_mask(series: pd.Series) -> Tuple[pd.Series, float, float]:
    """Return (outlier mask, lower bound, upper bound) using the 1.5 * IQR rule."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing values are never flagged.
    mask = (series < lower_bound) | (series > upper_bound)
    return mask, lower_bound, upper_bound


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Tuple[pd.DataFrame, List[str]]:
    """Apply a sequence of cleaning operations to a copy of *df*.

    Each operation is a dict with a ``'type'`` key and, depending on the type,
    further keys:

    * ``fill_missing``: ``'column'`` and ``'method'``
      (``'mean'``, ``'median'``, ``'mode'`` or ``'drop'``)
    * ``remove_duplicates``: no further keys
    * ``remove_outliers`` / ``cap_outliers``: ``'column'`` (1.5 * IQR rule)
    * ``convert_type``: ``'column'`` and ``'target_type'``
      (``'category'`` or ``'numeric'``)
    * ``drop_column``: ``'column'``

    Operations are applied in order. A failing operation is recorded in the
    log and does not stop the remaining operations.

    Args:
        df: Input frame; it is never modified.
        operations: Operation descriptors as described above.

    Returns:
        A ``(cleaned_df, operation_log)`` tuple, where ``operation_log`` holds
        one human-readable message per applied, skipped or failed operation.
    """
    cleaned_df = df.copy()
    operation_log: List[str] = []

    for operation in operations:
        # Resolve the type up front so the error handler below can never raise
        # a second KeyError while building its message.
        op_type = operation.get('type', 'unknown')

        try:
            if op_type == 'fill_missing':
                col = operation['column']
                method = operation['method']

                if method in ('mean', 'median'):
                    series = cleaned_df[col]
                    if not _is_summary_numeric(series):
                        operation_log.append(
                            f"Skipped {method} fill for '{col}': column is not numeric"
                        )
                        continue

                    fill_value = series.mean() if method == 'mean' else series.median()
                    if pd.isna(fill_value):
                        operation_log.append(
                            f"Skipped {method} fill for '{col}': no non-missing values"
                        )
                        continue

                    cleaned_df[col] = series.fillna(fill_value)
                    operation_log.append(
                        f"Filled missing values in '{col}' with {method} ({fill_value:.2f})"
                    )

                elif method == 'mode':
                    mode_values = cleaned_df[col].mode()
                    if mode_values.empty:
                        operation_log.append(
                            f"Skipped mode fill for '{col}': no non-missing values"
                        )
                    else:
                        fill_value = mode_values.iloc[0]
                        cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                        operation_log.append(
                            f"Filled missing values in '{col}' with mode ('{fill_value}')"
                        )

                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[col])
                    removed = original_len - len(cleaned_df)
                    operation_log.append(
                        f"Dropped {removed} rows with missing values in '{col}'"
                    )

                else:
                    operation_log.append(
                        f"Skipped fill_missing for '{col}': unknown method '{method}'"
                    )

            elif op_type == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                if removed > 0:
                    operation_log.append(f"Removed {removed} duplicate rows")

            elif op_type == 'remove_outliers':
                col = operation['column']
                mask, _, _ = _iqr_outlier_mask(cleaned_df[col])
                # Filter positionally with the boolean mask: matching on index
                # labels would also drop non-outlier rows that share a label
                # with an outlier when the index is not unique.
                cleaned_df = cleaned_df[~mask]
                operation_log.append(f"Removed {int(mask.sum())} outliers from '{col}'")

            elif op_type == 'cap_outliers':
                col = operation['column']
                mask, lower_bound, upper_bound = _iqr_outlier_mask(cleaned_df[col])
                cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
                operation_log.append(
                    f"Capped {int(mask.sum())} outliers in '{col}' to statistical bounds"
                )

            elif op_type == 'convert_type':
                col = operation['column']
                target_type = operation['target_type']

                if target_type == 'category':
                    cleaned_df[col] = cleaned_df[col].astype('category')
                    operation_log.append(f"Converted '{col}' to category type")
                elif target_type == 'numeric':
                    missing_before = int(cleaned_df[col].isna().sum())
                    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')
                    coerced = int(cleaned_df[col].isna().sum()) - missing_before
                    message = f"Converted '{col}' to numeric type"
                    if coerced > 0:
                        # errors='coerce' replaces unparseable values with NaN;
                        # report it so the data loss is visible to the user.
                        message += f" ({coerced} unparseable values set to missing)"
                    operation_log.append(message)
                else:
                    operation_log.append(
                        f"Skipped convert_type for '{col}': unknown target type '{target_type}'"
                    )

            elif op_type == 'drop_column':
                col = operation['column']
                cleaned_df = cleaned_df.drop(columns=[col])
                operation_log.append(f"Dropped column '{col}'")

            else:
                operation_log.append(f"Skipped unknown operation type '{op_type}'")

        except Exception as e:
            # Best-effort pipeline: record the failure and keep going.
            operation_log.append(f"Failed to apply {op_type}: {str(e)}")

    return cleaned_df, operation_log
627
+
628
+ # HuggingFace specific optimizations
629
+
630
def optimize_dataframe_for_hf(df: pd.DataFrame) -> pd.DataFrame:
    """Return a memory-optimized copy of *df*.

    Two best-effort passes are applied; a column that cannot be converted is
    left unchanged:

    1. ``object`` columns where fewer than half of the rows hold a distinct
       value (low cardinality) are converted to ``category``.
    2. ``int64`` and ``float64`` columns are passed through
       ``pd.to_numeric(..., downcast=...)`` to request a smaller dtype.

    Args:
        df: Input frame; it is never modified.

    Returns:
        The optimized copy.
    """
    optimized_df = df.copy()
    n_rows = len(optimized_df)

    # Convert low-cardinality object columns to category.
    # With zero rows the cardinality ratio is undefined (0 / 0), so skip the pass.
    if n_rows > 0:
        for col in optimized_df.select_dtypes(include=['object']).columns:
            try:
                # nunique() itself raises on unhashable cells (lists, dicts),
                # so it must live inside the guarded region too.
                if optimized_df[col].nunique() / n_rows < 0.5:
                    optimized_df[col] = optimized_df[col].astype('category')
            except Exception:
                # Best effort: leave columns that cannot be categorised as-is.
                continue

    # Downcast numeric types for memory efficiency
    for source_dtype, downcast in (('int64', 'integer'), ('float64', 'float')):
        for col in optimized_df.select_dtypes(include=[source_dtype]).columns:
            try:
                optimized_df[col] = pd.to_numeric(optimized_df[col], downcast=downcast)
            except Exception:
                # Best effort: keep the original dtype when downcasting fails.
                continue

    return optimized_df
657
+
658
@st.cache_data(show_spinner=False)
def generate_sample_data() -> pd.DataFrame:
    """Generate a synthetic customer dataset with deliberate quality issues.

    The frame starts with 1000 customers and has 35 duplicated rows appended,
    so it contains 1035 rows. The injected issues are: missing incomes and
    credit scores, implausible ages, extreme incomes, negative balances,
    duplicate rows, and text values in the otherwise numeric ``credit_score``
    column.

    Note:
        The function seeds NumPy's global random generator with 42, so the
        output is reproducible and the global random state is reset as a
        side effect whenever the body actually runs.

    Returns:
        The generated demonstration DataFrame.
    """
    np.random.seed(42)
    n_samples = 1000

    # Create realistic business dataset
    data = {
        'customer_id': [f"CUST_{i:06d}" for i in range(1, n_samples + 1)],
        'age': np.random.normal(35, 12, n_samples),
        'annual_income': np.random.lognormal(10.5, 0.5, n_samples),
        'credit_score': np.random.normal(650, 100, n_samples),
        'account_balance': np.random.normal(5000, 3000, n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n_samples),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_samples, p=[0.2, 0.5, 0.3]),
        'is_active': np.random.choice([True, False], n_samples, p=[0.8, 0.2]),
        # periods=n_samples already yields exactly n_samples dates.
        'signup_date': pd.date_range('2020-01-01', periods=n_samples, freq='D'),
    }

    df = pd.DataFrame(data)

    # Inject realistic quality issues for demonstration

    # 1. Missing values in income (realistic - some customers don't disclose)
    missing_income_idx = np.random.choice(df.index, size=int(n_samples * 0.15), replace=False)
    df.loc[missing_income_idx, 'annual_income'] = np.nan

    # 2. Missing values in credit score (realistic - new customers)
    missing_credit_idx = np.random.choice(df.index, size=int(n_samples * 0.08), replace=False)
    df.loc[missing_credit_idx, 'credit_score'] = np.nan

    # 3. Outliers in age (data entry errors)
    outlier_age_idx = np.random.choice(df.index, size=25, replace=False)
    df.loc[outlier_age_idx, 'age'] = np.random.uniform(150, 999, 25)  # Obvious errors

    # 4. Outliers in income (legitimate high earners + errors)
    outlier_income_idx = np.random.choice(df.index, size=30, replace=False)
    df.loc[outlier_income_idx, 'annual_income'] = np.random.uniform(500000, 2000000, 30)

    # 5. Negative account balances (overdrafts - realistic)
    negative_balance_idx = np.random.choice(df.index, size=50, replace=False)
    df.loc[negative_balance_idx, 'account_balance'] = np.random.uniform(-5000, -100, 50)

    # 6. Duplicate records (system errors)
    duplicate_records = df.sample(n=35).copy()
    df = pd.concat([df, duplicate_records], ignore_index=True)

    # 7. Mixed types in a column (add some text to numeric column)
    mixed_type_idx = np.random.choice(df.index, size=15, replace=False)
    # Cast to object explicitly before writing text: silently upcasting a
    # float64 column on assignment is deprecated in pandas (PDEP-6) and is
    # slated to become an error.
    df['credit_score'] = df['credit_score'].astype(object)
    df.loc[mixed_type_idx, 'credit_score'] = 'PENDING'

    return df
711
+
712
+ # Additional utility functions for HuggingFace deployment
713
 
714
def check_dataset_compatibility(
    df: pd.DataFrame,
    *,
    max_rows: int = 1_000_000,
    max_memory_mb: float = 500,
    max_columns: int = 100,
) -> Dict[str, Any]:
    """Check *df* against row, memory and column-count limits.

    Args:
        df: Dataset to inspect.
        max_rows: Row count above which ``size_ok`` becomes False.
        max_memory_mb: Deep memory usage in MB above which ``memory_ok``
            becomes False.
        max_columns: Column count above which ``columns_ok`` becomes False.

    Returns:
        A dict with the boolean flags ``size_ok``, ``memory_ok`` and
        ``columns_ok`` plus two lists, ``warnings`` and ``recommendations``,
        which each receive one entry per exceeded limit (in the order size,
        memory, columns).
    """
    compatibility: Dict[str, Any] = {
        'size_ok': True,
        'memory_ok': True,
        'columns_ok': True,
        'warnings': [],
        'recommendations': []
    }

    # Size checks
    if len(df) > max_rows:
        compatibility['size_ok'] = False
        compatibility['warnings'].append(f"Large dataset: {len(df):,} rows")
        compatibility['recommendations'].append("Consider sampling for interactive analysis")

    # Memory checks (deep=True also counts the contents of object columns)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    if memory_mb > max_memory_mb:
        compatibility['memory_ok'] = False
        compatibility['warnings'].append(f"High memory usage: {memory_mb:.1f}MB")
        compatibility['recommendations'].append("Apply memory optimization techniques")

    # Column count checks
    if len(df.columns) > max_columns:
        compatibility['columns_ok'] = False
        compatibility['warnings'].append(f"Many columns: {len(df.columns)}")
        compatibility['recommendations'].append("Focus analysis on key business columns")

    return compatibility
745
 
746
def get_smart_sample(df: pd.DataFrame, target_size: int = 10000) -> pd.DataFrame:
    """Return a reproducible sample of at most *target_size* rows.

    When *df* already fits within *target_size* it is returned unchanged (the
    same object, not a copy). Otherwise the first ``object``/``category``
    column is used for proportional stratified sampling, with every group
    contributing at least one row. Plain random sampling is used when there is
    no such column, when that column has more distinct groups than
    *target_size*, or when grouping fails.

    Args:
        df: Dataset to sample from.
        target_size: Maximum number of rows to return; expected to be positive.

    Returns:
        *df* itself, or a sampled frame with a fresh ``RangeIndex``.
    """
    total_rows = len(df)
    if total_rows <= target_size:
        return df

    # Stratified sampling if categorical columns exist
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    if len(categorical_cols) > 0:
        # Use the first categorical column for stratification
        strat_col = categorical_cols[0]
        try:
            # dropna=False keeps rows whose stratification value is missing;
            # observed=True avoids empty groups for unused categories.
            grouped = df.groupby(strat_col, dropna=False, observed=True, sort=False)

            # An identifier-like column has (almost) one group per row; taking
            # one row per group would then return nearly the whole dataset.
            if grouped.ngroups <= target_size:
                parts = []
                # Iterate the groups instead of using groupby.apply so the
                # stratification column is always kept in the result.
                for _, group in grouped:
                    quota = min(len(group), max(1, int(target_size * len(group) / total_rows)))
                    parts.append(group.sample(n=quota, random_state=42))

                sample_df = pd.concat(parts)
                # The one-row-per-group minimum can overshoot the target.
                if len(sample_df) > target_size:
                    sample_df = sample_df.sample(n=target_size, random_state=42)
                return sample_df.reset_index(drop=True)
        except Exception:
            # Best effort: e.g. unhashable cell values cannot be grouped.
            # Fall back to random sampling below.
            pass

    # Random sampling
    return df.sample(n=target_size, random_state=42).reset_index(drop=True)