spam-detection-app / generate_eval_data.py
premmm's picture
Upload folder using huggingface_hub
9930208 verified
Raw
History Blame Contribute Delete
5.73 kB
import pandas as pd
import random
def generate_data(output_path="eval_dataset.csv", *, seed=None, n_spam=80, n_ham=20):
    """Build a synthetic spam/ham evaluation set and write it to a CSV file.

    For each template category (english, french, dutch, german, phishing)
    ``n_spam`` spam rows and ``n_ham`` ham rows are drawn *with replacement*
    from that category's templates. A sixth category, ``"mixed"``, is then
    built the same way, except that every row first picks a random source
    category to draw its template from.

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``"eval_dataset.csv"`` in the current working directory.
        seed: When given, a private ``random.Random(seed)`` is used so the
            output is reproducible. When ``None`` the module-level ``random``
            state is used, exactly as before.
        n_spam: Number of spam rows (label ``1``) per category.
        n_ham: Number of ham rows (label ``0``) per category.

    Returns:
        The generated ``pandas.DataFrame`` with columns ``text``, ``label``
        and ``category`` (the same content that is written to disk).

    Raises:
        ValueError: If ``n_spam`` or ``n_ham`` is negative.
    """
    if n_spam < 0 or n_ham < 0:
        raise ValueError("n_spam and n_ham must be non-negative")

    # A dedicated generator keeps seeded runs from disturbing (or being
    # disturbed by) the global random state; the module itself exposes the
    # same ``choice`` API and is used when no seed is requested.
    rng = random if seed is None else random.Random(seed)

    categories = {
        "english": {
            "spam": [
                "Congratulations! You've won a $1000 Walmart gift card. Click here to claim.",
                "URGENT: Your account has been compromised. Verify your details now.",
                "Get rich quick! Work from home and earn $5000 a week. Join now.",
                "Free entry into our $100,000 prize draw. Text WIN to 88888.",
                "Investment opportunity: Bitcoin prices are surging. Buy now for 10x returns.",
            ],
            "ham": [
                "Hi John, are we still on for the meeting at 2 PM?",
                "Can you please send over the report by end of day?",
                "The weather looks great for the weekend hike.",
                "Dinner is ready, let me know when you're home.",
                "Thanks for the help with the project earlier.",
            ],
        },
        "french": {
            "spam": [
                "Félicitations ! Vous avez gagné une carte-cadeau de 1000 €. Cliquez ici.",
                "URGENT : Votre compte a été compromis. Vérifiez vos coordonnées maintenant.",
                "Devenez riche rapidement ! Travaillez à domicile et gagnez 5000 € par semaine.",
                "Entrée gratuite à notre tirage au sort de 100 000 €. Envoyez GAGNER au 88888.",
                "Opportunité d'investissement : les prix du Bitcoin s'envolent.",
            ],
            "ham": [
                "Salut Jean, on se voit toujours pour la réunion à 14h ?",
                "Peux-tu m'envoyer le rapport avant la fin de la journée ?",
                "Il fera beau pour la randonnée ce week-end.",
                "Le dîner est prêt, dis-moi quand tu rentres.",
                "Merci pour l'aide sur le projet tout à l'heure.",
            ],
        },
        "dutch": {
            "spam": [
                "Gefeliciteerd! U heeft een cadeaubon van €1000 gewonnen. Klik hier.",
                "DRINGEND: Uw account is gecompromitteerd. Verifieer nu uw gegevens.",
                "Word snel rijk! Werk vanuit huis en verdien €5000 per week. Doe nu mee.",
                "Gratis deelname aan onze prijstrekking van €100.000. Sms WIN naar 88888.",
                "Investeringsmogelijkheid: Bitcoin-prijzen stijgen enorm.",
            ],
            "ham": [
                "Hoi Jan, gaat de vergadering om 14:00 uur nog door?",
                "Kun je het rapport voor het einde van de dag opsturen?",
                "Het weer ziet er goed uit voor de wandeling dit weekend.",
                "Het eten is klaar, laat maar weten wanneer je thuis bent.",
                "Bedankt voor de hulp bij het project vanmiddag.",
            ],
        },
        "german": {
            "spam": [
                "Herzlichen Glückwunsch! Sie haben einen 1000€ Gutschein gewonnen. Hier klicken.",
                "DRINGEND: Ihr Konto wurde kompromittiert. Verifizieren Sie jetzt Ihre Daten.",
                "Schnell reich werden! Arbeiten Sie von zu Hause und verdienen Sie 5000€ pro Woche.",
                "Kostenlose Teilnahme an unserer 100.000€ Verlosung. Sende GEWINN an 88888.",
                "Investmentmöglichkeit: Bitcoin-Preise steigen rasant.",
            ],
            "ham": [
                "Hallo Jan, steht unser Termin um 14 Uhr noch?",
                "Kannst du mir den Bericht bitte bis Ende des Tages schicken?",
                "Das Wetter sieht gut aus für die Wanderung am Wochenende.",
                "Das Abendessen ist fertig, sag Bescheid, wenn du zu Hause bist.",
                "Danke für die Hilfe beim Projekt vorhin.",
            ],
        },
        "phishing": {
            "spam": [
                "Action Required: Your PayPal account has been restricted. Sign in to fix.",
                "Microsoft Security Alert: Unusual sign-in activity detected on your account.",
                "Netflix: Your payment was declined. Update your billing information immediately.",
                "Your Amazon order #123-456789-0 is delayed. Click to track.",
                "Apple ID: Someone tried to sign in to your account from Russia.",
            ],
            "ham": [
                "Your weekly screen time report is ready.",
                "Password change confirmation for your account.",
                "A new device signed into your Google account.",
                "Order confirmation: Thank you for your purchase.",
                "Welcome to our newsletter! Please confirm your email.",
            ],
        },
    }

    # (label, template key, rows per category); spam is always drawn before
    # ham so the sequence of random draws matches the historical behaviour.
    plan = ((1, "spam", n_spam), (0, "ham", n_ham))

    all_data = []

    # Per-category rows: templates come only from the category itself.
    for cat, content in categories.items():
        for label, kind, count in plan:
            all_data.extend(
                {"text": rng.choice(content[kind]), "label": label, "category": cat}
                for _ in range(count)
            )

    # Mixed category: each row picks a random source category first, then a
    # template of the requested kind from it. The name list is built once
    # instead of on every iteration.
    category_names = list(categories)
    for label, kind, count in plan:
        for _ in range(count):
            source = rng.choice(category_names)
            all_data.append(
                {
                    "text": rng.choice(categories[source][kind]),
                    "label": label,
                    "category": "mixed",
                }
            )

    # Explicit columns keep the CSV header present even when no rows were drawn.
    df = pd.DataFrame(all_data, columns=["text", "label", "category"])
    df.to_csv(output_path, index=False)
    print(f"Dataset generated: {output_path}")
    return df
# Script entry point: generate the dataset only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    generate_data()