translators-will committed on
Commit
2d0c05d
·
verified ·
1 Parent(s): fb7ef82

Update data_clean_simple.py

Browse files
Files changed (1) hide show
  1. data_clean_simple.py +20 -17
data_clean_simple.py CHANGED
@@ -101,23 +101,26 @@ def clean_data(file_path, progress_callback=None):
101
 
102
  # Generate suggestions for missing or weird values
103
  if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
104
- # Get examples for analysis (both good and bad examples)
105
- # Get non-null, non-empty examples
106
- good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)
107
-
108
- # Get bad examples
109
- bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
110
- na=False, case=False, regex=True)].sample(
111
- min(5, df[col].isna().sum()), random_state=1)
112
- # Combine good and bad examples
113
- examples = good_examples + bad_examples
114
-
115
- if examples is not None:
116
- suggestion = suggest_fill_strategies(col, examples)
117
- suggestions_log.append({
118
- 'col': col,
119
- 'suggestion': suggestion
120
- })
 
 
 
121
 
122
  # Final progress update
123
  if progress_callback:
 
101
 
102
  # Generate suggestions for missing or weird values
103
  if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
104
+ # Get non-null, non-empty examples for analysis - handle empty dataframes
105
+ try:
106
+ good_df = df[col][
107
+ df[col].notnull() &
108
+ (df[col].astype(str) != "") &
109
+ ~df[col].astype(str).contains(r'none|null|n/a|na|\?+|missing|unknown',
110
+ na=False, case=False, regex=True)
111
+ ]
112
+
113
+ if len(good_df) > 0:
114
+ sample_size = min(5, len(good_df))
115
+ good_examples = good_df.drop_duplicates().sample(n=sample_size, random_state=1).tolist()
116
+ else:
117
+ good_examples = []
118
+ except:
119
+ good_examples = []
120
+
121
+ # Generate suggestions
122
+ suggestion = suggest_fill_strategies(col, good_examples)
123
+ suggestions_log.append((col, suggestion))
124
 
125
  # Final progress update
126
  if progress_callback: