entropy25 committed on
Commit
e595151
·
verified ·
1 Parent(s): 546a4bd

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +124 -1
data_handler.py CHANGED
@@ -149,4 +149,127 @@ def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
149
  variance_cols.append(col)
150
 
151
  if len(variance_cols) > 1:
152
- corr_matrix = df[variance_cols].corr()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  variance_cols.append(col)
150
 
151
  if len(variance_cols) > 1:
152
+ corr_matrix = df[variance_cols].corr()
153
+ return corr_matrix
154
+
155
+ return pd.DataFrame()
156
+
157
+ except Exception as e:
158
+ st.error(f"Error calculating correlation matrix: {str(e)}")
159
+ return pd.DataFrame()
160
+
161
@st.cache_data
def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> Dict[str, Any]:
    """Detect outliers in one numeric column using the IQR or Z-score method.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to inspect.
        method: 'iqr' applies the 1.5 * IQR fence rule; any other value
            applies the Z-score rule with a threshold of 3.

    Returns:
        A dict with the keys:
            'outliers': list of the flagged values,
            'bounds': dict of the parameters used by the chosen method,
            'count': number of flagged values,
            'percentage': flagged share of the non-null values (0-100).
        When the column is missing, is not numeric, has no non-null values,
        or an error occurs, 'outliers' and 'bounds' are empty and 'count'
        and 'percentage' are 0.
    """
    # Every return path exposes the same four keys so callers can index
    # 'percentage' unconditionally.
    empty_result = {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}

    try:
        if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
            return empty_result

        data = df[column].dropna()

        # Nothing to analyse; also avoids 0 / 0 in the percentage below.
        if data.empty:
            return empty_result

        if method == 'iqr':
            q1 = data.quantile(0.25)
            q3 = data.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            outliers = data[(data < lower_bound) | (data > upper_bound)]
            bounds = {'lower': lower_bound, 'upper': upper_bound, 'Q1': q1, 'Q3': q3}

        else:  # z-score method
            mean = data.mean()
            std = data.std()

            if std > 0:
                z_scores = np.abs((data - mean) / std)
                outliers = data[z_scores > 3]
            else:
                # Zero or NaN spread (constant or single-value column):
                # no z-score is defined, so nothing is flagged.
                outliers = data.iloc[0:0]

            bounds = {'threshold': 3, 'mean': mean, 'std': std}

        return {
            'outliers': outliers.tolist(),
            'bounds': bounds,
            'count': len(outliers),
            'percentage': (len(outliers) / len(data)) * 100
        }

    except Exception as e:
        st.error(f"Error detecting outliers: {str(e)}")
        return empty_result
195
+
196
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate a composite data quality score for a DataFrame.

    Four component scores on a 0-100 scale are computed and combined:
        completeness (weight 0.4): share of cells that are not null,
        uniqueness   (weight 0.3): share of rows that are not duplicates,
        consistency  (weight 0.2): 100 minus 10 per numeric column that
            fails numeric conversion,
        validity     (weight 0.1): 100 minus 5 per numeric column in which
            more than 5% of the values are flagged as outliers.

    Args:
        df: DataFrame to evaluate.

    Returns:
        A dict with the keys 'completeness', 'uniqueness', 'consistency',
        'validity', 'overall' and 'grade'. The grade is 'Excellent' (>= 90),
        'Good' (>= 80), 'Fair' (>= 70), 'Poor' (>= 60) or 'Critical'.
        For a DataFrame with no rows or no columns, or when an error occurs,
        all scores are 0 and the grade is 'Unknown'.
    """
    unknown_result = {
        'completeness': 0,
        'uniqueness': 0,
        'consistency': 0,
        'validity': 0,
        'overall': 0,
        'grade': 'Unknown'
    }

    try:
        n_rows, n_cols = df.shape

        # With no cells the ratios below are 0 / 0, which NumPy turns into
        # NaN instead of raising; report "Unknown" rather than a NaN score.
        if n_rows == 0 or n_cols == 0:
            return unknown_result

        scores = {}

        # 1. Completeness (missing data)
        total_cells = n_rows * n_cols
        missing_cells = df.isnull().sum().sum()
        scores['completeness'] = ((total_cells - missing_cells) / total_cells) * 100

        # 2. Uniqueness (duplicates)
        duplicate_rows = df.duplicated().sum()
        scores['uniqueness'] = ((n_rows - duplicate_rows) / n_rows) * 100

        # 3. Consistency (data types)
        # NOTE(review): only columns already selected by numeric dtype are
        # checked, so this conversion is expected to succeed and the penalty
        # is unlikely to trigger. Confirm whether object columns holding
        # numbers as strings were the intended target.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        consistency_score = 100  # Start with perfect score

        for col in numeric_cols:
            non_null_data = df[col].dropna()
            if len(non_null_data) > 0:
                try:
                    pd.to_numeric(non_null_data, errors='raise')
                except (ValueError, TypeError):
                    consistency_score -= 10  # Penalty for inconsistent types

        scores['consistency'] = max(consistency_score, 0)

        # 4. Validity (share of outliers per numeric column)
        validity_score = 100

        for col in numeric_cols:
            # An all-null column has no values to test; skip it.
            if not df[col].notna().any():
                continue

            outlier_info = detect_outliers(df, col)
            # .get() tolerates results that omit the 'percentage' key.
            if outlier_info.get('percentage', 0) > 5:  # More than 5% outliers
                validity_score -= 5

        scores['validity'] = max(validity_score, 0)

        # Overall quality score (weighted average)
        overall_score = (
            scores['completeness'] * 0.4 +
            scores['uniqueness'] * 0.3 +
            scores['consistency'] * 0.2 +
            scores['validity'] * 0.1
        )
        scores['overall'] = overall_score

        # Quality grade: first threshold (highest first) that the score meets.
        grade = 'Critical'
        for threshold, label in ((90, 'Excellent'), (80, 'Good'), (70, 'Fair'), (60, 'Poor')):
            if overall_score >= threshold:
                grade = label
                break

        scores['grade'] = grade

        return scores

    except Exception as e:
        st.error(f"Error calculating data quality score: {str(e)}")
        return unknown_result