Spaces:
Runtime error
Runtime error
File size: 691 Bytes
"""
Remove duplicates from clean_dataset.csv
"""
import pandas as pd
# Both paths are relative, so they resolve against the current working
# directory. The output path is defined once so the file that is written and
# the path that is reported can never drift apart.
INPUT_PATH = 'data/processed/clean_dataset.csv'
OUTPUT_PATH = 'data/processed/clean_dataset_no_duplicates.csv'


def drop_duplicate_urls(df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:
    """Return a new DataFrame keeping only the first row for each URL.

    Args:
        df: Dataset to de-duplicate. It is not modified.
        url_column: Name of the column whose values identify duplicates.

    Returns:
        A DataFrame with later occurrences of each ``url_column`` value
        removed. Surviving rows keep their original order and index labels.

    Raises:
        KeyError: If ``url_column`` is not a column of ``df``.
    """
    return df.drop_duplicates(subset=url_column, keep='first')


def main() -> None:
    """Load the clean dataset, drop duplicate URLs, report, and save."""
    # Load dataset
    df = pd.read_csv(INPUT_PATH)
    print(f"Original: {len(df):,} URLs")

    # Check duplicates
    print(f"Duplicates: {df.duplicated(subset='url').sum():,}")

    # Keep first occurrence of each URL
    df_clean = drop_duplicate_urls(df)
    print(f"After removing duplicates: {len(df_clean):,} URLs")

    # Check label distribution
    print("\nLabel distribution:")
    print(df_clean['label'].value_counts())

    # Save (index=False: the row index is not written as a column)
    df_clean.to_csv(OUTPUT_PATH, index=False)
    print(f"\n✓ Saved to: {OUTPUT_PATH}")


# Only touch the filesystem when executed as a script, so the helper above can
# be imported (and tested) without reading or writing any CSV files.
if __name__ == "__main__":
    main()