"""Remove duplicate ``id`` rows from the augmented dataset CSV, in place."""

import pandas as pd

# File that is both read and overwritten when this script is run directly.
DATASET_PATH = "augmented_dataset.csv"


def deduplicate_dataset(path=DATASET_PATH, id_column="id"):
    """Drop rows with a repeated ``id_column`` value and rewrite the CSV.

    For every value of ``id_column`` only the last row in file order is
    kept (``keep='last'``) -- presumably the most recently generated
    variant; confirm against whatever produces the file.

    Args:
        path: CSV file to clean. It is overwritten with the result.
        id_column: Name of the column that identifies a record.

    Returns:
        A ``(size_before, size_after)`` tuple of row counts.

    Raises:
        KeyError: If ``id_column`` is not a column of the CSV (raised by
            ``DataFrame.drop_duplicates``); the file is left untouched.
    """
    df = pd.read_csv(path)
    size_before = len(df)

    df_clean = df.drop_duplicates(subset=[id_column], keep="last")
    size_after = len(df_clean)

    # index=False keeps the pandas row index out of the file, so repeated
    # runs do not accumulate extra unnamed columns.
    df_clean.to_csv(path, index=False, encoding="utf-8")
    return size_before, size_after


def main():
    """Clean the default dataset file and report what was removed."""
    size_before, size_after = deduplicate_dataset()

    print("Data cleaning process completed successfully.")
    print(f"Identified and removed {size_before - size_after} duplicate records.")
    print(f"Current deduplicated dataset size: {size_after} samples.")


# Guarded so that importing this module never rewrites the dataset file.
if __name__ == "__main__":
    main()