Spaces:
Sleeping
Sleeping
Update data_clean_simple.py
Browse files - data_clean_simple.py (+13 -1)
data_clean_simple.py
CHANGED
|
@@ -53,6 +53,7 @@ def suggest_fill_strategies(column_name, examples):
|
|
| 53 |
def clean_data(file_path):
|
| 54 |
# Support CSV and TSV files
|
| 55 |
# Load data and drop duplicates
|
|
|
|
| 56 |
if file_path.endswith('.tsv'):
|
| 57 |
df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
|
| 58 |
else:
|
|
@@ -61,7 +62,12 @@ def clean_data(file_path):
|
|
| 61 |
suggestions_log = []
|
| 62 |
|
| 63 |
# Convert column types
|
| 64 |
-
for col in df.columns:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
if df[col].dtype == 'object':
|
| 66 |
df[col] = df[col].str.strip().str.lower() # Normalize text
|
| 67 |
|
|
@@ -111,6 +117,12 @@ def clean_data(file_path):
|
|
| 111 |
'suggestion': suggestion
|
| 112 |
})
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
df = df.reset_index(drop=True)
|
| 115 |
|
| 116 |
return df, suggestions_log
|
|
|
|
| 53 |
def clean_data(file_path):
|
| 54 |
# Support CSV and TSV files
|
| 55 |
# Load data and drop duplicates
|
| 56 |
+
# Clean data with progress updates
|
| 57 |
if file_path.endswith('.tsv'):
|
| 58 |
df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
|
| 59 |
else:
|
|
|
|
| 62 |
suggestions_log = []
|
| 63 |
|
| 64 |
# Convert column types
|
| 65 |
+
for i, col in enumerate(df.columns):
|
| 66 |
+
# Update progress if callback provided
|
| 67 |
+
if progress_callback:
|
| 68 |
+
progress = i / total_columns
|
| 69 |
+
progress_callback(progress)
|
| 70 |
+
|
| 71 |
if df[col].dtype == 'object':
|
| 72 |
df[col] = df[col].str.strip().str.lower() # Normalize text
|
| 73 |
|
|
|
|
| 117 |
'suggestion': suggestion
|
| 118 |
})
|
| 119 |
|
| 120 |
+
# Final progress update
|
| 121 |
+
if progress_callback:
|
| 122 |
+
progress_callback(1.0)
|
| 123 |
+
|
| 124 |
+
# Reset index for consistency
|
| 125 |
+
|
| 126 |
df = df.reset_index(drop=True)
|
| 127 |
|
| 128 |
return df, suggestions_log
|