Spaces:
Runtime error
Runtime error
"""
Remove duplicate URLs from clean_dataset.csv.

Loads data/processed/clean_dataset.csv, keeps the first row for each
distinct value in the 'url' column, prints the row counts and the 'label'
distribution, and writes the result to
data/processed/clean_dataset_no_duplicates.csv.
"""
import pandas as pd

INPUT_PATH = 'data/processed/clean_dataset.csv'
OUTPUT_PATH = 'data/processed/clean_dataset_no_duplicates.csv'

# Read the whole dataset into memory.
df = pd.read_csv(INPUT_PATH)
print(f"Original: {len(df):,} URLs")

# Flag every row whose 'url' already appeared in an earlier row; the first
# occurrence of each URL is left unflagged.
is_repeat = df.duplicated(subset='url', keep='first')
print(f"Duplicates: {is_repeat.sum():,}")

# Filtering out the flagged rows keeps exactly the first occurrence per URL.
df_clean = df[~is_repeat]
print(f"After removing duplicates: {len(df_clean):,} URLs")

# Report how many rows remain for each label value.
label_counts = df_clean['label'].value_counts()
print("\nLabel distribution:")
print(label_counts)

# Write the de-duplicated rows without the DataFrame index column.
df_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\n✓ Saved to: {OUTPUT_PATH}")