""" Remove duplicates from clean_dataset.csv """ import pandas as pd # Load dataset df = pd.read_csv('data/processed/clean_dataset.csv') print(f"Original: {len(df):,} URLs") # Check duplicates print(f"Duplicates: {df.duplicated(subset='url').sum():,}") # Keep first occurrence of each URL df_clean = df.drop_duplicates(subset='url', keep='first') print(f"After removing duplicates: {len(df_clean):,} URLs") # Check label distribution print(f"\nLabel distribution:") print(df_clean['label'].value_counts()) # Save df_clean.to_csv('data/processed/clean_dataset_no_duplicates.csv', index=False) print(f"\n✓ Saved to: data/processed/clean_dataset_no_duplicates.csv")