Data_augmentation / backend /clean_data.py
Jacek Dusza
Initial commit: NLP Pipeline backend and React frontend
69a2c97
Raw
History Blame Contribute Delete
729 Bytes
import os

import pandas as pd
# Dataset that is cleaned in place when this file is run as a script.
DATASET_PATH = "augmented_dataset.csv"


def deduplicate_by_id(
    df: pd.DataFrame, id_column: str = "id", keep: str = "last"
) -> pd.DataFrame:
    """Return a copy of *df* without rows that repeat a value in *id_column*.

    Args:
        df: Frame to deduplicate; it is not modified.
        id_column: Column whose values identify a record.
        keep: Which occurrence survives, forwarded to
            ``DataFrame.drop_duplicates``. The default ``"last"`` retains the
            row that appears latest in the frame.

    Returns:
        A new frame with one row per distinct value of *id_column*.

    Raises:
        KeyError: If *id_column* is not a column of *df*.
    """
    if id_column not in df.columns:
        raise KeyError(
            f"Column {id_column!r} not found; available columns: {list(df.columns)}"
        )
    return df.drop_duplicates(subset=[id_column], keep=keep)


def clean_dataset(path: str = DATASET_PATH, id_column: str = "id") -> tuple:
    """Deduplicate the CSV file at *path* and overwrite it with the result.

    Args:
        path: CSV file to read and then replace.
        id_column: Column used to detect duplicate records.

    Returns:
        A ``(size_before, size_after)`` tuple of row counts.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If the file has no *id_column* column.
    """
    df = pd.read_csv(path)
    size_before = len(df)

    df_clean = deduplicate_by_id(df, id_column)
    size_after = len(df_clean)

    # Write to a sibling file first and swap it in afterwards, so that an
    # interrupted write cannot leave a truncated dataset behind.
    tmp_path = f"{path}.tmp"
    try:
        df_clean.to_csv(tmp_path, index=False, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # The temp file only still exists if the write or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return size_before, size_after


def main() -> None:
    """Clean the default dataset and report how many duplicates were removed."""
    size_before, size_after = clean_dataset()

    # Output the results of the cleaning process
    print("Data cleaning process completed successfully.")
    print(f"Identified and removed {size_before - size_after} duplicate records.")
    print(f"Current deduplicated dataset size: {size_after} samples.")


if __name__ == "__main__":
    main()