entropy25 committed on
Commit
d3c3044
·
verified ·
1 Parent(s): fd475db

Update data_handler.py

Browse files
Files changed (1) hide show
  1. data_handler.py +121 -274
data_handler.py CHANGED
@@ -1,305 +1,152 @@
1
- """
2
- Data Analysis Platform
3
- Copyright (c) 2025 JEAN YOUNG
4
- All rights reserved.
5
-
6
- This software is proprietary and confidential.
7
- Unauthorized copying, distribution, or use is prohibited.
8
- """
9
  import streamlit as st
10
  import pandas as pd
11
  import numpy as np
12
  import warnings
13
- from typing import Dict, List, Any, Tuple
14
- from scipy import stats
15
  warnings.filterwarnings('ignore')
16
 
17
- # All cached data processing functions
18
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load a CSV file from raw bytes with automatic encoding detection (cached).

    Args:
        file_content: Raw bytes of the uploaded CSV file.
        filename: Original file name (part of the Streamlit cache key).

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: If no candidate encoding can parse the file.
    """
    import chardet
    # Hoisted out of the try block: BytesIO is needed on both the happy
    # path and every fallback attempt.
    from io import BytesIO

    detected = chardet.detect(file_content)
    # chardet can return None for undecodable input; default to utf-8.
    encoding = detected.get('encoding') or 'utf-8'

    try:
        return pd.read_csv(BytesIO(file_content), encoding=encoding)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit propagate.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for enc in encodings:
            try:
                return pd.read_csv(BytesIO(file_content), encoding=enc)
            except Exception:
                continue
        raise Exception("Cannot read file with any encoding")
 
 
 
 
 
37
 
38
@st.cache_data
def load_excel_file(file_content: bytes) -> pd.DataFrame:
    """Parse an Excel workbook from raw bytes into a DataFrame (cached)."""
    from io import BytesIO

    buffer = BytesIO(file_content)
    return pd.read_excel(buffer)
43
-
44
@st.cache_data
def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """Summarize a DataFrame: shape, memory, missing cells, dtypes, duplicates (cached)."""
    # Key the dtype histogram by string so the result is JSON-serializable.
    dtypes_summary = {str(dtype): int(count)
                      for dtype, count in df.dtypes.value_counts().items()}

    return {
        'shape': df.shape,
        'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),  # MB
        'missing_values': int(df.isnull().sum().sum()),
        'dtypes': dtypes_summary,
        'duplicates': int(df.duplicated().sum()),
    }
57
-
58
@st.cache_data
def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
    """Classify every column by cardinality (cached).

    Returns a DataFrame with one row per column: unique count, unique ratio,
    a cardinality label, and the pandas dtype.
    """
    row_count = len(df)
    cardinality_data = []

    for col in df.columns:
        unique_count = df[col].nunique()
        # BUG FIX: an empty frame previously raised ZeroDivisionError here.
        unique_ratio = unique_count / row_count if row_count else 0.0

        # Determine column type based on cardinality.
        if unique_count == 1:
            col_type = "Constant"
        elif row_count and unique_count == row_count:
            col_type = "Unique Identifier"
        elif unique_ratio < 0.05:
            col_type = "Low Cardinality"
        elif unique_ratio < 0.5:
            col_type = "Medium Cardinality"
        else:
            col_type = "High Cardinality"

        cardinality_data.append({
            'Column': col,
            'Unique Count': unique_count,
            'Unique Ratio': unique_ratio,
            'Type': col_type,
            'Data Type': str(df[col].dtype),
        })

    return pd.DataFrame(cardinality_data)
88
 
89
@st.cache_data
def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
    """Suggest object->category conversions that would save memory (cached).

    Returns the suggestion list plus current memory usage and the potential
    savings, both in MB and as a percentage of current usage.
    """
    suggestions = []
    current_memory = df.memory_usage(deep=True).sum() / 1024**2  # MB
    potential_savings = 0
    row_count = len(df)

    for col in df.columns:
        if df[col].dtype == 'object':
            # BUG FIX: guard against ZeroDivisionError on an empty frame.
            unique_ratio = df[col].nunique() / row_count if row_count else 1.0
            if unique_ratio < 0.5:  # Less than 50% unique values
                # Estimate category memory usage.
                category_memory = df[col].astype('category').memory_usage(deep=True)
                object_memory = df[col].memory_usage(deep=True)
                savings = (object_memory - category_memory) / 1024**2

                if savings > 0.1:  # More than 0.1 MB savings
                    suggestions.append({
                        'column': col,
                        'current_type': 'object',
                        'suggested_type': 'category',
                        'savings_mb': savings,
                    })
                    potential_savings += savings

    return {
        'suggestions': suggestions,
        'current_memory_mb': current_memory,
        'potential_savings_mb': potential_savings,
        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Per-column missing-value counts and percentages (cached).

    Returns an empty DataFrame when the frame has no missing cells.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        return pd.DataFrame()

    report = pd.DataFrame({
        'Column': null_counts.index,
        'Missing Count': null_counts.values,
        'Missing %': (null_counts.values / len(df)) * 100,
    })
    affected = report[report['Missing Count'] > 0]
    return affected.sort_values('Missing %', ascending=False)
133
-
134
@st.cache_data
def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Correlation matrix over numeric columns (cached); empty if fewer than two."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) > 1:
        return df[numeric_cols].corr()
    return pd.DataFrame()
139
-
140
@st.cache_data
def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Bucket column names into numeric / categorical / datetime lists (cached)."""
    selectors = {
        'numeric': [np.number],
        'categorical': ['object'],
        'datetime': ['datetime64'],
    }
    return {bucket: df.select_dtypes(include=kinds).columns.tolist()
            for bucket, kinds in selectors.items()}
148
-
149
@st.cache_data
def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
    """Summary statistics for one numeric column, ignoring NaNs (cached)."""
    values = df[column].dropna()

    summary = {
        'mean': values.mean(),
        'median': values.median(),
        'std': values.std(),
        'skewness': values.skew(),
        'kurtosis': values.kurtosis(),
        'min': values.min(),
        'max': values.max(),
        'q25': values.quantile(0.25),
        'q75': values.quantile(0.75),
    }
    return summary
164
-
165
@st.cache_data
def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Rows whose value in `column` falls outside the 1.5*IQR fences (cached)."""
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    fence = 1.5 * (q3 - q1)

    below = df[column] < q1 - fence
    above = df[column] > q3 + fence
    return df[below | above]
175
-
176
@st.cache_data
def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Find object columns that mix numeric and non-numeric values (cached).

    A column is flagged when coercing it to numeric introduces new NaNs,
    i.e. some entries parse as numbers and others do not.
    """
    issues = []

    for col in df.select_dtypes(include=['object']).columns:
        # Coerce to numeric; values that fail to parse become NaN.
        coerced = pd.to_numeric(df[col], errors='coerce')
        introduced_nulls = coerced.isnull().sum() - df[col].isnull().sum()

        if introduced_nulls > 0:
            total = len(df[col])
            issues.append({
                'column': col,
                'problematic_values': introduced_nulls,
                'total_values': total,
                'percentage': (introduced_nulls / total) * 100,
            })

    return issues
195
-
196
@st.cache_data
def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
    """Top `top_n` most frequent values of a categorical column (cached)."""
    frequencies = df[column].value_counts()
    return frequencies.head(top_n)
200
-
201
@st.cache_data
def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Contingency table of two categorical columns (cached)."""
    rows, cols = df[col1], df[col2]
    return pd.crosstab(rows, cols)
205
-
206
@st.cache_data
def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
    """Per-group mean/median/std/count of `metric_col`, grouped by `group_col` (cached)."""
    grouped = df.groupby(group_col)[metric_col]
    return grouped.agg(['mean', 'median', 'std', 'count'])
210
 
211
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Score overall data quality 0-100 and assign a letter grade (cached).

    Penalties: missing values (max 30), duplicate rows (max 20),
    constant columns (max 10), mixed-type columns (max 10).
    """
    score = 100
    issues = []
    total_cells = len(df) * len(df.columns)

    # Missing values penalty.
    # BUG FIX: an empty frame previously raised ZeroDivisionError here.
    missing_pct = (df.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
    if missing_pct > 0:
        penalty = min(30, missing_pct * 2)  # Max 30 points penalty
        score -= penalty
        issues.append(f"Missing values: {missing_pct:.1f}%")

    # Duplicates penalty (same empty-frame guard).
    duplicate_pct = (df.duplicated().sum() / len(df)) * 100 if len(df) else 0.0
    if duplicate_pct > 0:
        penalty = min(20, duplicate_pct * 4)  # Max 20 points penalty
        score -= penalty
        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")

    # Constant columns penalty.
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        penalty = min(10, len(constant_cols) * 2)
        score -= penalty
        issues.append(f"Constant columns: {len(constant_cols)}")

    # Mixed types penalty.
    mixed_types = detect_mixed_types(df)
    if mixed_types:
        penalty = min(10, len(mixed_types) * 3)
        score -= penalty
        issues.append(f"Mixed type columns: {len(mixed_types)}")

    return {
        'score': max(0, score),
        'issues': issues,
        # NOTE: as before, the grade is derived from the raw (possibly
        # negative) score, so heavily penalized data grades as 'F'.
        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
    }
250
-
251
def load_data(uploaded_file):
    """Dispatch an uploaded file to the CSV or Excel loader.

    Reads the upload's bytes once and rewinds the buffer so callers can
    re-read the file object afterwards.
    """
    raw_bytes = uploaded_file.read()
    uploaded_file.seek(0)

    is_csv = uploaded_file.name.endswith('.csv')
    if is_csv:
        return load_csv_with_encoding(raw_bytes, uploaded_file.name)
    return load_excel_file(raw_bytes)
260
-
261
def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
    """Apply a sequence of cleaning operations and return a new DataFrame.

    Supported operation dicts:
      {'type': 'fill_missing', 'column': c, 'method': 'mean'|'median'|'mode'|'drop'}
      {'type': 'remove_duplicates'}
      {'type': 'remove_outliers', 'column': c}   # drop rows outside 1.5*IQR fences
      {'type': 'cap_outliers', 'column': c}      # clip values to 1.5*IQR fences
      {'type': 'convert_type', 'column': c, 'target_type': 'category'}

    The input frame is never mutated; operations apply in order.
    """
    result = df.copy()

    def iqr_bounds(series):
        # 1.5*IQR fences shared by both outlier operations.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = 1.5 * (q3 - q1)
        return q1 - spread, q3 + spread

    for op in operations:
        kind = op['type']

        if kind == 'fill_missing':
            col, method = op['column'], op['method']
            if method == 'mean':
                result[col] = result[col].fillna(result[col].mean())
            elif method == 'median':
                result[col] = result[col].fillna(result[col].median())
            elif method == 'mode':
                modes = result[col].mode()
                fill_value = modes.iloc[0] if not modes.empty else 0
                result[col] = result[col].fillna(fill_value)
            elif method == 'drop':
                result = result.dropna(subset=[col])

        elif kind == 'remove_duplicates':
            result = result.drop_duplicates()

        elif kind == 'remove_outliers':
            col = op['column']
            low, high = iqr_bounds(result[col])
            result = result[(result[col] >= low) & (result[col] <= high)]

        elif kind == 'cap_outliers':
            col = op['column']
            low, high = iqr_bounds(result[col])
            result[col] = result[col].clip(low, high)

        elif kind == 'convert_type':
            if op['target_type'] == 'category':
                result[op['column']] = result[op['column']].astype('category')

    return result
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import warnings
5
+ from typing import Dict, List, Any
 
6
  warnings.filterwarnings('ignore')
7
 
8
+ # Enhanced data processing functions with better error handling
9
@st.cache_data
def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load CSV from raw bytes with automatic encoding detection (cached).

    Strategy: chardet-detected encoding (when confidence > 0.7), then a
    list of common fallbacks, then a lossy utf-8 decode as a last resort.

    Raises:
        Exception: If every strategy fails.
    """
    import chardet
    from io import BytesIO

    try:
        # Detect encoding; low-confidence detections fall back to utf-8.
        detected = chardet.detect(file_content)
        encoding = detected['encoding'] if detected['confidence'] > 0.7 else 'utf-8'

        # Try detected encoding first
        return pd.read_csv(BytesIO(file_content), encoding=encoding)

    except Exception:
        # Fallback encodings
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']

        for enc in encodings:
            try:
                file_content_copy = BytesIO(file_content)
                return pd.read_csv(file_content_copy, encoding=enc)
            except Exception:
                continue

        # Last resort - decode utf-8 and drop undecodable bytes.
        # BUG FIX: pd.read_csv has no `errors` keyword; the correct
        # parameter (pandas >= 1.3) is `encoding_errors`. The old call
        # always raised TypeError instead of attempting a lossy read.
        try:
            return pd.read_csv(BytesIO(file_content), encoding='utf-8', encoding_errors='ignore')
        except Exception as e:
            raise Exception(f"Cannot read CSV file: {str(e)}")
39
 
40
@st.cache_data
def load_excel_file(file_content: bytes, filename: str) -> pd.DataFrame:
    """Load an Excel workbook from raw bytes, retrying alternate engines (cached).

    Raises:
        Exception: If the default engine and every fallback engine fail;
        the message carries the original default-engine error.
    """
    from io import BytesIO

    try:
        # Default engine first.
        return pd.read_excel(BytesIO(file_content))
    except Exception as e:
        # Retry with explicit engines before giving up.
        for engine in ('openpyxl', 'xlrd'):
            try:
                return pd.read_excel(BytesIO(file_content), engine=engine)
            except Exception:
                continue

        raise Exception(f"Cannot read Excel file: {str(e)}")
 
 
 
 
 
 
 
 
59
 
60
@st.cache_data
def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate comprehensive basic statistics for a DataFrame (cached).

    Returns shape, memory (MB), missing/duplicate counts, a dtype histogram,
    numeric/categorical column counts, total cell count, and completeness %.
    On unexpected errors, reports via st.error and returns zeroed defaults.
    """
    try:
        # Convert dtypes to string for JSON serialization
        dtype_counts = df.dtypes.value_counts()
        dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}

        memory_usage = df.memory_usage(deep=True).sum() / (1024**2)  # MB
        missing_values = int(df.isnull().sum().sum())
        duplicates = int(df.duplicated().sum())

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object']).columns

        total_cells = df.shape[0] * df.shape[1]
        # BUG FIX: guard the completeness ratio — an empty frame previously
        # raised ZeroDivisionError and fell through to the error branch.
        completeness = ((total_cells - missing_values) / total_cells) * 100 if total_cells else 0.0

        return {
            'shape': df.shape,
            'memory_usage': float(memory_usage),
            'missing_values': missing_values,
            'dtypes': dtype_dict,
            'duplicates': duplicates,
            'numeric_columns': len(numeric_cols),
            'categorical_columns': len(categorical_cols),
            'total_cells': total_cells,
            'completeness': completeness
        }

    except Exception as e:
        st.error(f"Error calculating basic statistics: {str(e)}")
        return {
            'shape': (0, 0),
            'memory_usage': 0.0,
            'missing_values': 0,
            'dtypes': {},
            'duplicates': 0,
            'numeric_columns': 0,
            'categorical_columns': 0,
            'total_cells': 0,
            'completeness': 0.0
        }
106
 
107
@st.cache_data
def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Detailed per-column missing-data report (cached).

    Columns: name, missing count/%, dtype, non-missing count, and a
    severity label (Critical > 50%, High > 20%, Medium > 5%, else Low).
    Returns an empty DataFrame when nothing is missing or on error.
    """
    try:
        null_counts = df.isnull().sum()

        if null_counts.sum() > 0:
            report = pd.DataFrame({
                'Column': null_counts.index,
                'Missing Count': null_counts.values,
                'Missing %': (null_counts.values / len(df)) * 100,
                'Data Type': [str(df[col].dtype) for col in null_counts.index],
                'Non-Missing Count': len(df) - null_counts.values
            })

            # Keep only affected columns, worst first.
            report = report[report['Missing Count'] > 0].sort_values('Missing %', ascending=False)

            def severity(pct):
                if pct > 50:
                    return 'Critical'
                if pct > 20:
                    return 'High'
                if pct > 5:
                    return 'Medium'
                return 'Low'

            report['Severity'] = report['Missing %'].apply(severity)
            return report

        return pd.DataFrame()

    except Exception as e:
        st.error(f"Error calculating missing data: {str(e)}")
        return pd.DataFrame()
 
137
 
138
  @st.cache_data
139
+ def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
140
+ """Calculate correlation matrix with enhanced handling"""
141
+ try:
142
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ if len(numeric_cols) > 1:
145
+ # Remove columns with no variance (constant values)
146
+ variance_cols = []
147
+ for col in numeric_cols:
148
+ if df[col].var() > 0: # Only include columns with variance
149
+ variance_cols.append(col)
150
+
151
+ if len(variance_cols) > 1:
152
+ corr_matrix = df[variance_cols].corr()