"""Build a labelled phishing-URL dataset from a raw phishing CSV.

When run as a script this reads ``data/url.csv``, keeps the rows whose
``verified`` column equals ``"yes"``, reduces them to the ``url`` column,
tags every remaining row with ``label = 1`` and writes the result to
``clean_phish_urls.csv`` (without the index column).
"""
import pandas as pd

INPUT_CSV = "data/url.csv"
OUTPUT_CSV = "clean_phish_urls.csv"
PHISHING_LABEL = 1  # 1 = phishing/spam


def keep_verified(df_phish: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows of *df_phish* whose ``verified`` value is 'yes'.

    Args:
        df_phish: Raw phishing data; must contain a ``verified`` column.

    Returns:
        The filtered rows, with all original columns preserved.

    Raises:
        KeyError: If the ``verified`` column is missing.
    """
    return df_phish[df_phish['verified'] == 'yes']


def label_phishing_urls(df_phish: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column frame (``url``, ``label``) built from *df_phish*.

    Args:
        df_phish: Phishing rows; must contain a ``url`` column.

    Returns:
        A new frame holding the ``url`` column plus a constant ``label``
        column set to ``PHISHING_LABEL``. The input frame is not modified.

    Raises:
        KeyError: If the ``url`` column is missing.
    """
    # .copy() so that adding the label column never writes into (or warns
    # about writing into) a slice of the caller's frame.
    df_clean = df_phish[['url']].copy()
    df_clean['label'] = PHISHING_LABEL
    return df_clean


if __name__ == "__main__":
    # Assignments stay at module level (not inside a main() function) so
    # that `df_phish` and `df_clean` remain available as globals to any
    # code that follows in this script.

    # Load the phishing data and keep only verified entries
    df_phish = keep_verified(pd.read_csv(INPUT_CSV))

    # Extract only the URL and add the phishing label
    df_clean = label_phishing_urls(df_phish)

    print("✅ Cleaned phishing dataset shape:", df_clean.shape)
    print(df_clean.head())

    # Save to CSV
    df_clean.to_csv(OUTPUT_CSV, index=False)