entropy25 committed on
Commit
59db6f8
·
verified ·
1 Parent(s): e879f17

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +259 -322
data_handler.py CHANGED
@@ -2,359 +2,296 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
- from typing import Dict, List, Any
 
6
  warnings.filterwarnings('ignore')
7
 
 
8
  @st.cache_data
9
  def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
10
- """Load CSV with automatic encoding detection and enhanced error handling"""
11
  import chardet
12
- from io import BytesIO
 
 
13
 
14
  try:
15
- # Detect encoding
16
- detected = chardet.detect(file_content)
17
- encoding = detected['encoding'] if detected['confidence'] > 0.7 else 'utf-8'
18
-
19
- # Try detected encoding first
20
  return pd.read_csv(BytesIO(file_content), encoding=encoding)
21
-
22
- except Exception:
23
- # Fallback encodings
24
- encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
25
-
26
  for enc in encodings:
27
  try:
28
- file_content_copy = BytesIO(file_content)
29
- return pd.read_csv(file_content_copy, encoding=enc)
30
- except Exception:
31
  continue
32
-
33
- # Last resort - try with error handling
34
- try:
35
- return pd.read_csv(BytesIO(file_content), encoding='utf-8', errors='ignore')
36
- except Exception as e:
37
- raise Exception(f"Cannot read CSV file: {str(e)}")
38
 
39
  @st.cache_data
40
- def load_excel_file(file_content: bytes, filename: str) -> pd.DataFrame:
41
- """Load Excel file with enhanced error handling"""
42
  from io import BytesIO
43
-
44
- try:
45
- # Try loading Excel file
46
- return pd.read_excel(BytesIO(file_content))
47
- except Exception as e:
48
- # Try different engines
49
- engines = ['openpyxl', 'xlrd']
50
-
51
- for engine in engines:
52
- try:
53
- return pd.read_excel(BytesIO(file_content), engine=engine)
54
- except Exception:
55
- continue
56
-
57
- raise Exception(f"Cannot read Excel file: {str(e)}")
58
 
59
  @st.cache_data
60
  def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
61
- """Calculate comprehensive basic statistics"""
62
- try:
63
- # Convert dtypes to string for JSON serialization
64
- dtype_counts = df.dtypes.value_counts()
65
- dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
66
-
67
- # Calculate memory usage more accurately
68
- memory_usage = df.memory_usage(deep=True).sum() / (1024**2) # MB
69
-
70
- # Count missing values
71
- missing_values = int(df.isnull().sum().sum())
72
-
73
- # Count duplicates
74
- duplicates = int(df.duplicated().sum())
75
-
76
- # Additional statistics
77
- numeric_cols = df.select_dtypes(include=[np.number]).columns
78
- categorical_cols = df.select_dtypes(include=['object']).columns
79
-
80
- return {
81
- 'shape': df.shape,
82
- 'memory_usage': float(memory_usage),
83
- 'missing_values': missing_values,
84
- 'dtypes': dtype_dict,
85
- 'duplicates': duplicates,
86
- 'numeric_columns': len(numeric_cols),
87
- 'categorical_columns': len(categorical_cols),
88
- 'total_cells': df.shape[0] * df.shape[1],
89
- 'completeness': ((df.shape[0] * df.shape[1] - missing_values) / (df.shape[0] * df.shape[1])) * 100
90
- }
91
-
92
- except Exception as e:
93
- st.error(f"Error calculating basic statistics: {str(e)}")
94
- return {
95
- 'shape': (0, 0),
96
- 'memory_usage': 0.0,
97
- 'missing_values': 0,
98
- 'dtypes': {},
99
- 'duplicates': 0,
100
- 'numeric_columns': 0,
101
- 'categorical_columns': 0,
102
- 'total_cells': 0,
103
- 'completeness': 0.0
104
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  @st.cache_data
107
  def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
108
- """Calculate detailed missing data analysis"""
109
- try:
110
- missing_data = df.isnull().sum()
111
-
112
- if missing_data.sum() > 0:
113
- missing_df = pd.DataFrame({
114
- 'Column': missing_data.index,
115
- 'Missing Count': missing_data.values,
116
- 'Missing %': (missing_data.values / len(df)) * 100,
117
- 'Data Type': [str(df[col].dtype) for col in missing_data.index],
118
- 'Non-Missing Count': len(df) - missing_data.values
119
- })
120
-
121
- # Sort by missing percentage (descending)
122
- missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
123
-
124
- # Add severity classification
125
- missing_df['Severity'] = missing_df['Missing %'].apply(
126
- lambda x: 'Critical' if x > 50 else 'High' if x > 20 else 'Medium' if x > 5 else 'Low'
127
- )
128
-
129
- return missing_df
130
-
131
- return pd.DataFrame()
132
-
133
- except Exception as e:
134
- st.error(f"Error calculating missing data: {str(e)}")
135
- return pd.DataFrame()
136
 
137
  @st.cache_data
138
  def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
139
- """Calculate correlation matrix with enhanced handling"""
140
- try:
141
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
142
-
143
- if len(numeric_cols) > 1:
144
- # Remove columns with no variance (constant values)
145
- variance_cols = []
146
- for col in numeric_cols:
147
- if df[col].var() > 0: # Only include columns with variance
148
- variance_cols.append(col)
149
-
150
- if len(variance_cols) > 1:
151
- corr_matrix = df[variance_cols].corr()
152
- return corr_matrix
153
-
154
- return pd.DataFrame()
155
-
156
- except Exception as e:
157
- st.error(f"Error calculating correlation matrix: {str(e)}")
158
- return pd.DataFrame()
159
 
160
  @st.cache_data
161
- def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> Dict[str, Any]:
162
- """Detect outliers using IQR or Z-score method"""
163
- try:
164
- if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
165
- return {'outliers': [], 'bounds': {}, 'count': 0}
166
-
167
- data = df[column].dropna()
168
-
169
- if method == 'iqr':
170
- Q1 = data.quantile(0.25)
171
- Q3 = data.quantile(0.75)
172
- IQR = Q3 - Q1
173
- lower_bound = Q1 - 1.5 * IQR
174
- upper_bound = Q3 + 1.5 * IQR
175
-
176
- outliers = data[(data < lower_bound) | (data > upper_bound)]
177
- bounds = {'lower': lower_bound, 'upper': upper_bound, 'Q1': Q1, 'Q3': Q3}
178
-
179
- else: # z-score method
180
- z_scores = np.abs((data - data.mean()) / data.std())
181
- outliers = data[z_scores > 3]
182
- bounds = {'threshold': 3, 'mean': data.mean(), 'std': data.std()}
183
-
184
- return {
185
- 'outliers': outliers.tolist(),
186
- 'bounds': bounds,
187
- 'count': len(outliers),
188
- 'percentage': (len(outliers) / len(data)) * 100
189
- }
190
-
191
- except Exception as e:
192
- st.error(f"Error detecting outliers: {str(e)}")
193
- return {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}
194
 
195
  @st.cache_data
196
- def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
197
- """Calculate comprehensive data quality score"""
198
- try:
199
- # Initialize scores
200
- scores = {}
201
-
202
- # 1. Completeness (missing data)
203
- total_cells = df.shape[0] * df.shape[1]
204
- missing_cells = df.isnull().sum().sum()
205
- completeness = ((total_cells - missing_cells) / total_cells) * 100
206
- scores['completeness'] = completeness
207
-
208
- # 2. Uniqueness (duplicates)
209
- duplicate_rows = df.duplicated().sum()
210
- uniqueness = ((df.shape[0] - duplicate_rows) / df.shape[0]) * 100
211
- scores['uniqueness'] = uniqueness
212
-
213
- # 3. Consistency (data types)
214
- numeric_cols = df.select_dtypes(include=[np.number]).columns
215
- consistency_score = 100 # Start with perfect score
216
-
217
- for col in numeric_cols:
218
- # Check for mixed types (e.g., numbers stored as strings)
219
- non_null_data = df[col].dropna()
220
- if len(non_null_data) > 0:
221
- try:
222
- pd.to_numeric(non_null_data, errors='raise')
223
- except:
224
- consistency_score -= 10 # Penalty for inconsistent types
225
-
226
- scores['consistency'] = max(consistency_score, 0)
227
-
228
- # 4. Validity (basic checks)
229
- validity_score = 100
230
-
231
- # Check for extreme outliers in numeric columns
232
- for col in numeric_cols:
233
- outlier_info = detect_outliers(df, col)
234
- if outlier_info['percentage'] > 5: # More than 5% outliers
235
- validity_score -= 5
236
-
237
- scores['validity'] = max(validity_score, 0)
238
-
239
- # Overall quality score (weighted average)
240
- overall_score = (
241
- scores['completeness'] * 0.4 +
242
- scores['uniqueness'] * 0.3 +
243
- scores['consistency'] * 0.2 +
244
- scores['validity'] * 0.1
245
- )
246
-
247
- scores['overall'] = overall_score
248
-
249
- # Quality grade
250
- if overall_score >= 90:
251
- grade = 'Excellent'
252
- elif overall_score >= 80:
253
- grade = 'Good'
254
- elif overall_score >= 70:
255
- grade = 'Fair'
256
- elif overall_score >= 60:
257
- grade = 'Poor'
258
- else:
259
- grade = 'Critical'
260
-
261
- scores['grade'] = grade
262
-
263
- return scores
264
-
265
- except Exception as e:
266
- st.error(f"Error calculating data quality score: {str(e)}")
267
- return {
268
- 'completeness': 0,
269
- 'uniqueness': 0,
270
- 'consistency': 0,
271
- 'validity': 0,
272
- 'overall': 0,
273
- 'grade': 'Unknown'
274
- }
275
 
276
  @st.cache_data
277
- def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
278
- """Get columns categorized by their data types"""
279
- try:
280
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
281
- categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
282
- datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
283
- boolean_cols = df.select_dtypes(include=['bool']).columns.tolist()
284
-
285
- return {
286
- 'numeric': numeric_cols,
287
- 'categorical': categorical_cols,
288
- 'datetime': datetime_cols,
289
- 'boolean': boolean_cols,
290
- 'all': df.columns.tolist()
291
- }
292
- except Exception as e:
293
- st.error(f"Error getting column types: {str(e)}")
294
- return {
295
- 'numeric': [],
296
- 'categorical': [],
297
- 'datetime': [],
298
- 'boolean': [],
299
- 'all': []
300
- }
301
 
302
  @st.cache_data
303
- def clean_data(df: pd.DataFrame, options: Dict[str, Any] = None) -> pd.DataFrame:
304
- """Clean data based on specified options"""
305
- try:
306
- cleaned_df = df.copy()
307
-
308
- if options is None:
309
- options = {}
310
-
311
- # Remove duplicates if specified
312
- if options.get('remove_duplicates', False):
313
- initial_rows = len(cleaned_df)
314
- cleaned_df = cleaned_df.drop_duplicates()
315
- removed_rows = initial_rows - len(cleaned_df)
316
- if removed_rows > 0:
317
- st.info(f"Removed {removed_rows} duplicate rows")
318
-
319
- # Handle missing values
320
- if options.get('handle_missing', False):
321
- missing_strategy = options.get('missing_strategy', 'drop')
322
-
323
- if missing_strategy == 'drop':
324
- cleaned_df = cleaned_df.dropna()
325
- elif missing_strategy == 'fill_mean':
326
- numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
327
- cleaned_df[numeric_cols] = cleaned_df[numeric_cols].fillna(cleaned_df[numeric_cols].mean())
328
- elif missing_strategy == 'fill_median':
329
- numeric_cols = cleaned_df.select_dtypes(include=[np.number]).columns
330
- cleaned_df[numeric_cols] = cleaned_df[numeric_cols].fillna(cleaned_df[numeric_cols].median())
331
- elif missing_strategy == 'fill_mode':
332
- for col in cleaned_df.columns:
333
- if cleaned_df[col].isnull().any():
334
- mode_value = cleaned_df[col].mode()
335
- if not mode_value.empty:
336
- cleaned_df[col].fillna(mode_value[0], inplace=True)
337
-
338
- return cleaned_df
339
-
340
- except Exception as e:
341
- st.error(f"Error cleaning data: {str(e)}")
342
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  def load_data(uploaded_file):
345
- """Main function to load data - for compatibility"""
346
- try:
347
- file_extension = uploaded_file.name.split('.')[-1].lower()
348
- file_content = uploaded_file.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- if file_extension == 'csv':
351
- return load_csv_with_encoding(file_content, uploaded_file.name)
352
- elif file_extension in ['xlsx', 'xls']:
353
- return load_excel_file(file_content, uploaded_file.name)
354
- else:
355
- st.error("Unsupported file format")
356
- return None
357
-
358
- except Exception as e:
359
- st.error(f"Error loading file: {str(e)}")
360
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
+ from typing import Dict, List, Any, Tuple
6
+ from scipy import stats
7
  warnings.filterwarnings('ignore')
8
 
9
+ # All cached data processing functions
10
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load a CSV from raw bytes, auto-detecting the text encoding.

    Tries the chardet-detected encoding first (falling back to utf-8 when
    detection is inconclusive or returns no encoding), then a short list of
    common encodings.

    Args:
        file_content: Raw bytes of the uploaded CSV file.
        filename: Original file name (part of the cache key; kept for parity
            with the caller's signature).

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If no candidate encoding can parse the file.
    """
    import chardet
    from io import BytesIO  # imported up front so the fallback loop can use it too

    detected = chardet.detect(file_content)
    # chardet may return encoding=None or a low-confidence guess; use utf-8 then.
    encoding = detected.get('encoding') or 'utf-8'
    if detected.get('confidence') is None or detected['confidence'] < 0.7:
        encoding = 'utf-8'

    try:
        return pd.read_csv(BytesIO(file_content), encoding=encoding)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt etc.
        for enc in ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']:
            try:
                return pd.read_csv(BytesIO(file_content), encoding=enc)
            except Exception:
                continue
        raise Exception("Cannot read file with any encoding")
 
 
 
 
 
29
 
30
@st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an Excel workbook from its raw bytes (memoised via st.cache_data)."""
    from io import BytesIO
    buffer = BytesIO(file_content)
    return pd.read_excel(buffer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
@st.cache_data
def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarise shape, memory footprint, missing cells, dtypes and duplicates."""
    # Histogram of dtypes, keyed by the dtype's string form for serialisability.
    type_histogram = {}
    for dtype, count in df.dtypes.value_counts().items():
        type_histogram[str(dtype)] = int(count)

    memory_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    summary = {
        'shape': df.shape,
        'memory_usage': float(memory_mb),
        'missing_values': int(df.isnull().sum().sum()),
        'dtypes': type_histogram,
        'duplicates': int(df.duplicated().sum()),
    }
    return summary
49
+
50
@st.cache_data
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Classify every column by its cardinality.

    Returns one row per column: unique count, unique ratio, a coarse
    cardinality label, and the pandas dtype.
    """
    if len(df) == 0:
        # Guard: the unique-ratio below divides by the row count.
        return pd.DataFrame(
            columns=['Column', 'Unique Count', 'Unique Ratio', 'Type', 'Data Type'])

    cardinality_data = []

    for col in df.columns:
        unique_count = df[col].nunique()
        unique_ratio = unique_count / len(df)

        # Coarse classification based on how many distinct values the column has.
        if unique_count == 1:
            col_type = "Constant"
        elif unique_count == len(df):
            col_type = "Unique Identifier"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
        else:
            col_type = "High Cardinality"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Type': col_type,
            'Data Type': str(df[col].dtype)
        })

    return pd.DataFrame(cardinality_data)
80
+
81
@st.cache_data
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest object→category dtype conversions that would save memory.

    Only object columns with <50% unique values and more than 0.1 MB of
    projected savings are reported.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2
    potential_savings = 0

    row_count = len(df)
    if row_count > 0:  # guard: the unique-ratio below divides by the row count
        for col in df.columns:
            if df[col].dtype == 'object':
                unique_ratio = df[col].nunique() / row_count
                if unique_ratio < 0.5:  # Less than 50% unique values
                    # Compare actual memory of both representations.
                    category_memory = df[col].astype('category').memory_usage(deep=True)
                    object_memory = df[col].memory_usage(deep=True)
                    savings = (object_memory - category_memory) / 1024**2

                    if savings > 0.1:  # More than 0.1MB savings
                        suggestions.append({
                            'column': col,
                            'current_type': 'object',
                            'suggested_type': 'category',
                            'savings_mb': savings
                        })
                        potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
    }
112
 
113
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Per-column missing-value report, sorted by missing percentage.

    Returns an empty DataFrame when no cell is missing.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        return pd.DataFrame()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100
    })
    report = report[report['Missing Count'] > 0]
    return report.sort_values('Missing %', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
@st.cache_data
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Pairwise correlations over the numeric columns (empty if fewer than 2)."""
    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
    if len(numeric_cols) <= 1:
        return pd.DataFrame()
    return df[numeric_cols].corr()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
@st.cache_data
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Bucket column names into numeric / categorical / datetime lists."""
    selectors = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
    }
    return {
        bucket: df.select_dtypes(include=include).columns.tolist()
        for bucket, include in selectors.items()
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
@st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Descriptive statistics for one numeric column (NaNs dropped first)."""
    values = df[column].dropna()
    summary = {}
    summary['mean'] = values.mean()
    summary['median'] = values.median()
    summary['std'] = values.std()
    summary['skewness'] = values.skew()
    summary['kurtosis'] = values.kurtosis()
    summary['min'] = values.min()
    summary['max'] = values.max()
    summary['q25'] = values.quantile(0.25)
    summary['q75'] = values.quantile(0.75)
    return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Rows whose value in `column` falls outside the 1.5*IQR Tukey fences."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = q3 - q1
    low = q1 - 1.5 * spread
    high = q3 + 1.5 * spread

    outside_fences = (df[column] < low) | (df[column] > high)
    return df[outside_fences]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
@st.cache_data
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns that are only partially numeric.

    A column is flagged when coercing it to numeric introduces new NaNs,
    i.e. some values parse as numbers and some do not.
    """
    findings = []

    for col in df.select_dtypes(include=['object']).columns:
        coerced = pd.to_numeric(df[col], errors='coerce')
        # Values that became NaN only because they failed numeric parsing.
        failed = coerced.isnull().sum() - df[col].isnull().sum()

        if failed > 0:
            total = len(df[col])
            findings.append({
                'column': col,
                'problematic_values': failed,
                'total_values': total,
                'percentage': (failed / total) * 100
            })

    return findings
187
+
188
@st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Most frequent values of a column, limited to the top `top_n`."""
    frequencies = df[column].value_counts()
    return frequencies.head(top_n)
192
+
193
@st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Contingency table of two categorical columns."""
    rows = df[col1]
    cols = df[col2]
    return pd.crosstab(rows, cols)
197
+
198
@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Mean, median, std and count of `metric_col` within each `group_col` level."""
    grouped = df.groupby(group_col)[metric_col]
    return grouped.agg(['mean', 'median', 'std', 'count'])
202
+
203
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score overall data quality from 0-100 with a letter grade.

    Penalties: missing values (max 30 points), duplicate rows (max 20),
    constant columns (max 10), mixed-type object columns (max 10).

    Returns:
        dict with 'score' (clamped to >= 0), 'issues' (human-readable
        strings), and 'grade' (A-F).
    """
    score = 100
    issues = []

    total_cells = len(df) * len(df.columns)
    if total_cells > 0:  # guard: an empty frame would divide by zero below
        # Missing values penalty
        missing_pct = (df.isnull().sum().sum() / total_cells) * 100
        if missing_pct > 0:
            score -= min(30, missing_pct * 2)  # Max 30 points penalty
            issues.append(f"Missing values: {missing_pct:.1f}%")

        # Duplicates penalty
        duplicate_pct = (df.duplicated().sum() / len(df)) * 100
        if duplicate_pct > 0:
            score -= min(20, duplicate_pct * 4)  # Max 20 points penalty
            issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

    # Constant columns penalty
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        score -= min(10, len(constant_cols) * 2)
        issues.append(f"Constant columns: {len(constant_cols)}")

    # Mixed types penalty
    mixed_types = detect_mixed_types(df)
    if mixed_types:
        score -= min(10, len(mixed_types) * 3)
        issues.append(f"Mixed type columns: {len(mixed_types)}")

    return {
        'score': max(0, score),
        'issues': issues,
        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
    }
242
 
243
def load_data(uploaded_file):
    """Route an uploaded file to the appropriate loader by extension.

    Reads the full payload, rewinds the buffer so callers can re-read it,
    then dispatches: .csv → encoding-aware CSV loader, .xlsx/.xls → Excel
    loader. The extension match is case-insensitive.

    Raises:
        ValueError: For unsupported file extensions (previously any
            non-CSV file was silently passed to the Excel loader).
    """
    file_content = uploaded_file.read()
    uploaded_file.seek(0)  # leave the stream reusable for the caller

    name = uploaded_file.name.lower()
    if name.endswith('.csv'):
        return load_csv_with_encoding(file_content, uploaded_file.name)
    elif name.endswith(('.xlsx', '.xls')):
        return load_excel_file(file_content)
    else:
        raise ValueError(f"Unsupported file format: {uploaded_file.name}")
252
+
253
def _tukey_bounds(series: pd.Series) -> Tuple[float, float]:
    """Lower/upper 1.5*IQR Tukey fences for a numeric series."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations, returning a new DataFrame.

    Each operation is a dict with a 'type' key:
      - 'fill_missing':      needs 'column' and 'method' in
                             {'mean', 'median', 'mode', 'drop'}
      - 'remove_duplicates': drops duplicate rows
      - 'remove_outliers':   needs 'column'; drops rows outside the
                             1.5*IQR fences
      - 'cap_outliers':      needs 'column'; clips values to the fences
      - 'convert_type':      needs 'column' and 'target_type' ('category')

    Unknown operation types are ignored. The input DataFrame is not mutated.
    """
    cleaned_df = df.copy()

    for operation in operations:
        op_type = operation['type']

        if op_type == 'fill_missing':
            column = operation['column']
            method = operation['method']
            if method == 'mean':
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].mean())
            elif method == 'median':
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].median())
            elif method == 'mode':
                modes = cleaned_df[column].mode()
                # Fall back to 0 when the column has no mode (e.g. all-NaN).
                fill_value = modes.iloc[0] if not modes.empty else 0
                cleaned_df[column] = cleaned_df[column].fillna(fill_value)
            elif method == 'drop':
                cleaned_df = cleaned_df.dropna(subset=[column])

        elif op_type == 'remove_duplicates':
            cleaned_df = cleaned_df.drop_duplicates()

        elif op_type == 'remove_outliers':
            column = operation['column']
            lower_bound, upper_bound = _tukey_bounds(cleaned_df[column])
            cleaned_df = cleaned_df[
                (cleaned_df[column] >= lower_bound) &
                (cleaned_df[column] <= upper_bound)
            ]

        elif op_type == 'cap_outliers':
            column = operation['column']
            lower_bound, upper_bound = _tukey_bounds(cleaned_df[column])
            cleaned_df[column] = cleaned_df[column].clip(lower_bound, upper_bound)

        elif op_type == 'convert_type':
            if operation['target_type'] == 'category':
                cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')

    return cleaned_df