Spaces:
Sleeping
Sleeping
# Deduplicate the augmented dataset CSV in place, keyed on the 'id' column.
#
# Run as a script: reads augmented_dataset.csv from the current working
# directory, drops rows whose 'id' repeats (the last occurrence wins), and
# writes the result back to the same path.
import os

import pandas as pd

DATASET_PATH = "augmented_dataset.csv"
ID_COLUMN = "id"


def deduplicate(df, id_column=ID_COLUMN):
    """Return a copy of *df* holding one row per distinct *id_column* value.

    When several rows share the same identifier, the one that appears last
    in *df* is kept (``keep='last'``); the relative order of the surviving
    rows is unchanged.

    Args:
        df: The DataFrame to clean.
        id_column: Name of the column that identifies a record.

    Returns:
        A new DataFrame without duplicate identifiers.

    Raises:
        KeyError: If *id_column* is not a column of *df*.
    """
    if id_column not in df.columns:
        # pandas would raise KeyError here as well; raising it ourselves
        # lets the message say which columns are actually available.
        raise KeyError(
            f"Column {id_column!r} not found; available columns: "
            f"{list(df.columns)}"
        )
    return df.drop_duplicates(subset=[id_column], keep="last")


def main(path=DATASET_PATH):
    """Deduplicate the CSV at *path* in place and print a short summary.

    Args:
        path: Location of the CSV file to read and overwrite.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If the CSV has no 'id' column.
    """
    df = pd.read_csv(path)
    size_before = len(df)

    df_clean = deduplicate(df)
    size_after = len(df_clean)

    # Write to a sibling temp file and swap it in with os.replace so that a
    # failure part-way through the write cannot truncate the only copy of
    # the dataset. The temp file sits next to the target so the rename stays
    # on one filesystem.
    tmp_path = f"{path}.tmp"
    try:
        df_clean.to_csv(tmp_path, index=False, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only removes a partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # Output the results of the cleaning process
    print("Data cleaning process completed successfully.")
    print(f"Identified and removed {size_before - size_after} duplicate records.")
    print(f"Current deduplicated dataset size: {size_after} samples.")


if __name__ == "__main__":
    main()