File size: 691 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
"""

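Remove duplicate URLs from clean_dataset.csv.

Rows with a repeated 'url' value are dropped, keeping the first occurrence of
each URL, and the result is saved as clean_dataset_no_duplicates.csv.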

"""
import pandas as pd

# Input and output locations, named once so the save call and the
# confirmation message below cannot drift apart.
INPUT_PATH = 'data/processed/clean_dataset.csv'
OUTPUT_PATH = 'data/processed/clean_dataset_no_duplicates.csv'

# Load dataset
df = pd.read_csv(INPUT_PATH)
print(f"Original: {len(df):,} URLs")

# Flag every repeat of a URL after its first occurrence (pandas' default
# keep='first'). The mask is computed once and reused for both the count
# and the filter instead of scanning the 'url' column twice.
is_duplicate = df.duplicated(subset='url')
print(f"Duplicates: {is_duplicate.sum():,}")

# Keep first occurrence of each URL
df_clean = df[~is_duplicate]
print(f"After removing duplicates: {len(df_clean):,} URLs")

# Check label distribution of the rows that remain
print("\nLabel distribution:")
print(df_clean['label'].value_counts())

# Save without the index column
df_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\n✓ Saved to: {OUTPUT_PATH}")