Phishing-Detection-System / scripts /utils /remove_duplicates.py
rb1337's picture
Upload 50 files
2cc7f91 verified
raw
history blame contribute delete
691 Bytes
"""
Remove duplicates from clean_dataset.csv
"""
import pandas as pd
# Input and output locations, relative to the directory the script is run from.
INPUT_PATH = 'data/processed/clean_dataset.csv'
OUTPUT_PATH = 'data/processed/clean_dataset_no_duplicates.csv'

# Read the full dataset and report its size before deduplication.
df = pd.read_csv(INPUT_PATH)
original_count = len(df)
print(f"Original: {original_count:,} URLs")

# Flag every row whose 'url' value already appeared in an earlier row.
# The first occurrence of each URL is left unflagged, so it is the one kept.
duplicate_mask = df.duplicated(subset='url', keep='first')
print(f"Duplicates: {duplicate_mask.sum():,}")

# Drop the flagged rows; the surviving rows keep their original index labels.
df_clean = df[~duplicate_mask]
print(f"After removing duplicates: {len(df_clean):,} URLs")

# Show how many rows remain for each value of the 'label' column.
print("\nLabel distribution:")
label_counts = df_clean['label'].value_counts()
print(label_counts)

# Write the deduplicated rows to disk without the index column.
df_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\n✓ Saved to: {OUTPUT_PATH}")