Spaces:
Sleeping
Sleeping
Upload clean.py
Browse files
clean.py
CHANGED
|
@@ -155,11 +155,11 @@ def clean_column(df, column_name):
|
|
| 155 |
|
| 156 |
# Convert column to determined data type
|
| 157 |
if data_type == "float":
|
| 158 |
-
df
|
| 159 |
elif data_type == "integer":
|
| 160 |
-
df
|
| 161 |
elif data_type == "date":
|
| 162 |
-
df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
|
| 163 |
elif data_type == "string" or data_type == "object":
|
| 164 |
# Transform string values
|
| 165 |
transform_result = transform_string_column(column_data, column_name)
|
|
@@ -178,8 +178,10 @@ def clean_column(df, column_name):
|
|
| 178 |
print(f" Potential typos found: {typo_result['typos']}")
|
| 179 |
|
| 180 |
# Set empty and invalid cells to NaN
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
| 183 |
|
| 184 |
return df, nonconforming_cells
|
| 185 |
|
|
|
|
| 155 |
|
| 156 |
# Convert column to determined data type
|
| 157 |
if data_type == "float":
|
| 158 |
+
df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
|
| 159 |
elif data_type == "integer":
|
| 160 |
+
df[column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
|
| 161 |
elif data_type == "date":
|
| 162 |
+
df[column_name] = pd.to_datetime(df[column_name], errors='coerce', dayfirst=True)
|
| 163 |
elif data_type == "string" or data_type == "object":
|
| 164 |
# Transform string values
|
| 165 |
transform_result = transform_string_column(column_data, column_name)
|
|
|
|
| 178 |
print(f" Potential typos found: {typo_result['typos']}")
|
| 179 |
|
| 180 |
# Set empty and invalid cells to NaN
|
| 181 |
+
indices_to_set_nan = set(empty_indices + invalid_indices)
|
| 182 |
+
existing_indices = df.index.intersection(indices_to_set_nan)
|
| 183 |
+
df.loc[existing_indices, column_name] = np.nan
|
| 184 |
+
nonconforming_cells = len(existing_indices)
|
| 185 |
|
| 186 |
return df, nonconforming_cells
|
| 187 |
|