import os

import pandas as pd

# Default dataset location; the file is deduplicated in place.
DATASET_PATH = "augmented_dataset.csv"


def deduplicate_csv(path: str = DATASET_PATH, id_column: str = "id",
                    keep: str = "last") -> tuple:
    """Deduplicate the CSV at *path* in place, keyed on *id_column*.

    Rows sharing the same value in *id_column* are collapsed to a single
    row chosen by *keep* (``"last"`` retains the row that appears last in
    the file). Rows whose id is missing are never treated as duplicates of
    one another and are all preserved.

    The cleaned data is first written to ``<path>.tmp`` and then moved over
    the original with ``os.replace``, so a failure while writing leaves the
    original file untouched.

    Args:
        path: CSV file to read and overwrite.
        id_column: Name of the column holding the record identifier.
        keep: Which duplicate to retain; forwarded to
            ``DataFrame.duplicated`` (``"first"``, ``"last"`` or ``False``).

    Returns:
        A ``(size_before, size_after)`` tuple of row counts.

    Raises:
        KeyError: If *id_column* is not a column of the CSV.
    """
    df = pd.read_csv(path, encoding="utf-8")
    if id_column not in df.columns:
        raise KeyError(f"column {id_column!r} not found in {path!r}")
    size_before = len(df)

    # pandas considers missing values equal to each other, so a plain
    # drop_duplicates() would silently merge every row that lacks an id.
    # Keep those rows explicitly and only drop genuine repeated ids.
    is_repeat = df.duplicated(subset=[id_column], keep=keep)
    df_clean = df[df[id_column].isna() | ~is_repeat]
    size_after = len(df_clean)

    # Write to a sibling temp file, then swap it in, so the only copy of
    # the dataset is never left half-written.
    tmp_path = os.fspath(path) + ".tmp"
    try:
        df_clean.to_csv(tmp_path, index=False, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # Only still present if the write or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return size_before, size_after


def main() -> None:
    """Deduplicate the default dataset and report the outcome."""
    size_before, size_after = deduplicate_csv()

    # Output the results of the cleaning process
    print("Data cleaning process completed successfully.")
    print(f"Identified and removed {size_before - size_after} duplicate records.")
    print(f"Current deduplicated dataset size: {size_after} samples.")


if __name__ == "__main__":
    main()