"""Build a labelled phishing-URL dataset from a raw phishing CSV export.

Reads ``data/url.csv``, keeps only rows whose ``verified`` column equals
``'yes'``, retains the ``url`` column, attaches a constant ``label`` of 1
(phishing/spam) and writes the result to ``clean_phish_urls.csv``.
"""

import pandas as pd

# Locations are relative to the current working directory.
INPUT_PATH = "data/url.csv"
OUTPUT_PATH = "clean_phish_urls.csv"

# Label value attached to every retained row: 1 = phishing/spam.
PHISHING_LABEL = 1


def clean_phishing_urls(df_phish: pd.DataFrame) -> pd.DataFrame:
    """Return the verified phishing URLs from *df_phish* with a label column.

    Args:
        df_phish: Frame containing at least the columns ``verified`` and
            ``url``.

    Returns:
        A new frame with exactly two columns: ``url`` and ``label`` (always
        ``PHISHING_LABEL``). Only rows where ``verified == 'yes'`` are kept;
        the input frame is not modified.

    Raises:
        KeyError: If ``verified`` or ``url`` is missing from *df_phish*.
    """
    # Keep only verified and confirmed phishing entries.
    verified = df_phish[df_phish['verified'] == 'yes']

    # Extract only the URL; copy so that adding a column below neither
    # touches the caller's frame nor triggers SettingWithCopyWarning.
    df_clean = verified[['url']].copy()

    # Add label: 1 = phishing/spam.
    df_clean['label'] = PHISHING_LABEL
    return df_clean


def main() -> None:
    """Load the raw phishing data, clean it, report it and save it to CSV."""
    # Load the phishing data.
    df_phish = pd.read_csv(INPUT_PATH)

    df_clean = clean_phishing_urls(df_phish)

    print("✅ Cleaned phishing dataset shape:", df_clean.shape)
    print(df_clean.head())

    # Save to CSV without the pandas index column.
    df_clean.to_csv(OUTPUT_PATH, index=False)


if __name__ == "__main__":
    main()