# Spaces: Sleeping  (appears to be a page-status banner captured along with the listing; not part of the script)
import random

import pandas as pd
def generate_data(seed=None, output_path: str = "eval_dataset.csv",
                  n_spam: int = 80, n_ham: int = 20) -> None:
    """Build a synthetic spam/ham evaluation set and write it to a CSV file.

    For each built-in category ("english", "french", "dutch", "german",
    "phishing") the function draws ``n_spam`` spam texts and ``n_ham`` ham
    texts, with replacement, from that category's five templates. It then adds
    a "mixed" category whose texts are drawn from a randomly chosen source
    category. Each row has the columns ``text``, ``label`` (1 = spam, 0 = ham)
    and ``category``.

    Args:
        seed: When ``None`` (default) the module-level ``random`` generator is
            used, so a caller that seeded it globally keeps getting the same
            draws as before. Otherwise a private ``random.Random(seed)`` is
            used, which makes the output reproducible without touching global
            state.
        output_path: Destination CSV path.
        n_spam: Number of spam rows per category (including "mixed").
        n_ham: Number of ham rows per category (including "mixed").

    Raises:
        ValueError: If ``n_spam`` or ``n_ham`` is negative.
    """
    if n_spam < 0 or n_ham < 0:
        raise ValueError("n_spam and n_ham must be non-negative")

    categories = {
        "english": {
            "spam": [
                "Congratulations! You've won a $1000 Walmart gift card. Click here to claim.",
                "URGENT: Your account has been compromised. Verify your details now.",
                "Get rich quick! Work from home and earn $5000 a week. Join now.",
                "Free entry into our $100,000 prize draw. Text WIN to 88888.",
                "Investment opportunity: Bitcoin prices are surging. Buy now for 10x returns.",
            ],
            "ham": [
                "Hi John, are we still on for the meeting at 2 PM?",
                "Can you please send over the report by end of day?",
                "The weather looks great for the weekend hike.",
                "Dinner is ready, let me know when you're home.",
                "Thanks for the help with the project earlier.",
            ],
        },
        "french": {
            "spam": [
                "Félicitations ! Vous avez gagné une carte-cadeau de 1000 €. Cliquez ici.",
                "URGENT : Votre compte a été compromis. Vérifiez vos coordonnées maintenant.",
                "Devenez riche rapidement ! Travaillez à domicile et gagnez 5000 € par semaine.",
                "Entrée gratuite à notre tirage au sort de 100 000 €. Envoyez GAGNER au 88888.",
                "Opportunité d'investissement : les prix du Bitcoin s'envolent.",
            ],
            "ham": [
                "Salut Jean, on se voit toujours pour la réunion à 14h ?",
                "Peux-tu m'envoyer le rapport avant la fin de la journée ?",
                "Il fera beau pour la randonnée ce week-end.",
                "Le dîner est prêt, dis-moi quand tu rentres.",
                "Merci pour l'aide sur le projet tout à l'heure.",
            ],
        },
        "dutch": {
            "spam": [
                "Gefeliciteerd! U heeft een cadeaubon van €1000 gewonnen. Klik hier.",
                "DRINGEND: Uw account is gecompromitteerd. Verifieer nu uw gegevens.",
                "Word snel rijk! Werk vanuit huis en verdien €5000 per week. Doe nu mee.",
                "Gratis deelname aan onze prijstrekking van €100.000. Sms WIN naar 88888.",
                "Investeringsmogelijkheid: Bitcoin-prijzen stijgen enorm.",
            ],
            "ham": [
                "Hoi Jan, gaat de vergadering om 14:00 uur nog door?",
                "Kun je het rapport voor het einde van de dag opsturen?",
                "Het weer ziet er goed uit voor de wandeling dit weekend.",
                "Het eten is klaar, laat maar weten wanneer je thuis bent.",
                "Bedankt voor de hulp bij het project vanmiddag.",
            ],
        },
        "german": {
            "spam": [
                "Herzlichen Glückwunsch! Sie haben einen 1000€ Gutschein gewonnen. Hier klicken.",
                "DRINGEND: Ihr Konto wurde kompromittiert. Verifizieren Sie jetzt Ihre Daten.",
                "Schnell reich werden! Arbeiten Sie von zu Hause und verdienen Sie 5000€ pro Woche.",
                "Kostenlose Teilnahme an unserer 100.000€ Verlosung. Sende GEWINN an 88888.",
                "Investmentmöglichkeit: Bitcoin-Preise steigen rasant.",
            ],
            "ham": [
                "Hallo Jan, steht unser Termin um 14 Uhr noch?",
                "Kannst du mir den Bericht bitte bis Ende des Tages schicken?",
                "Das Wetter sieht gut aus für die Wanderung am Wochenende.",
                "Das Abendessen ist fertig, sag Bescheid, wenn du zu Hause bist.",
                "Danke für die Hilfe beim Projekt vorhin.",
            ],
        },
        "phishing": {
            "spam": [
                "Action Required: Your PayPal account has been restricted. Sign in to fix.",
                "Microsoft Security Alert: Unusual sign-in activity detected on your account.",
                "Netflix: Your payment was declined. Update your billing information immediately.",
                "Your Amazon order #123-456789-0 is delayed. Click to track.",
                "Apple ID: Someone tried to sign in to your account from Russia.",
            ],
            "ham": [
                "Your weekly screen time report is ready.",
                "Password change confirmation for your account.",
                "A new device signed into your Google account.",
                "Order confirmation: Thank you for your purchase.",
                "Welcome to our newsletter! Please confirm your email.",
            ],
        },
    }

    # Fall back to the shared module-level generator so that callers relying on
    # a global random.seed(...) see exactly the same sequence of draws.
    rng = random if seed is None else random.Random(seed)

    # (template pool key, numeric label, rows to draw); spam is drawn before
    # ham everywhere, which fixes the order in which random numbers are used.
    draw_plan = (("spam", 1, n_spam), ("ham", 0, n_ham))

    rows = []
    for category, pools in categories.items():
        for kind, label, count in draw_plan:
            rows.extend(
                {"text": rng.choice(pools[kind]), "label": label, "category": category}
                for _ in range(count)
            )

    # Mixed category: each row first picks a source category at random, then a
    # template of the requested kind from that category.
    category_names = list(categories)
    for kind, label, count in draw_plan:
        for _ in range(count):
            source = rng.choice(category_names)
            rows.append({
                "text": rng.choice(categories[source][kind]),
                "label": label,
                "category": "mixed",
            })

    # Explicit columns keep the CSV header intact even when no rows are drawn.
    df = pd.DataFrame(rows, columns=["text", "label", "category"])
    df.to_csv(output_path, index=False)
    print(f"Dataset generated: {output_path}")
# Script entry point: generate the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_data()