Spaces:
Sleeping
Sleeping
| """Collect multilingual data for the chatbot. | |
| Sources (all open, no login required): | |
| HuggingFace Datasets: | |
| - Helsinki-NLP/opus-100 (configs ar-en, en-fr — streamed) | |
| -> Arabic / English / French monolingual sentences | |
| - AmazonScience/massive (ar-SA, en-US, fr-FR) | |
| -> intent classification (60 classes -> mapped to our 6) | |
| - unimelb-nlp/wikiann (ar, en, fr) | |
| -> NER (PER / LOC / ORG) | |
| Web scraping: | |
| - Wikipedia REST summary API (ar/en/fr) — public, CC-BY-SA, 0.5s delay, | |
| polite User-Agent. Used as a small extra source of language samples. | |
| Skipping commercial customer-support sites (TOS / scraping risk). | |
| Synthetic generation: | |
| - Code-switched sentences (AR+EN, AR+FR, EN+FR, Arabizi+EN) | |
| - Greeting / farewell / complaint examples in AR/EN/FR | |
| (complaint and farewell are absent in MASSIVE -> pure synthetic) | |
| - DATE-tagged NER sentences in 3 languages | |
| (wikiann/conll lack DATE) | |
| - FAQ knowledge base (~80 Q&A pairs across 3 languages) | |
| Outputs (all under data/raw/): | |
| lang_detection_data.csv : columns [text, language] -- AR/EN/FR/CS | |
| intent_data.csv : columns [text, intent, language] | |
| ner_data.csv : columns [tokens, ner_tags, language] | |
| (tokens & ner_tags stored as JSON-encoded lists) | |
| knowledge_base.csv : columns [question, answer, language, topic] | |
| Resilient: if any HF source fails, we log it and continue. Synthetic data | |
| guarantees every CSV has content even with zero internet. | |
| Use --quick to halve dataset sizes for faster smoke-tests. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import random | |
| import sys | |
| import time | |
| import warnings | |
| from collections import Counter | |
| from pathlib import Path | |
| from typing import Any | |
| import pandas as pd | |
| from tqdm import tqdm | |
# Filesystem layout: the project root is the parent of this script's directory,
# and every generated CSV lands under <root>/data/raw.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
RAW = PROJECT_ROOT.joinpath("data", "raw")
# Created at import time so later writers never hit a missing directory.
RAW.mkdir(exist_ok=True, parents=True)

# Fixed seed keeps every random draw in this module reproducible.
random.seed(42)

# Suppress these two warning categories for the whole run.
warnings.filterwarnings(action="ignore", category=UserWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)
| # ============================================================================ | |
| # SECTION 1: SIZE TARGETS | |
| # ============================================================================ | |
QUICK = False  # toggled by --quick


def t(n: int) -> int:
    """Scale a size target for the current run mode.

    Args:
        n: full-size target.

    Returns:
        ``n`` unchanged in normal mode. In --quick mode, half of ``n`` with a
        floor of 50, but never more than ``n`` itself.
    """
    if not QUICK:
        return n
    # The floor of 50 keeps smoke-test datasets from becoming uselessly small;
    # the min() guard stops that floor from *inflating* targets below 50
    # (previously t(30) returned 50 in quick mode).
    return min(n, max(50, n // 2))
| # Target sentence counts per language for the LANG-DETECTION dataset. | |
| # These are upper bounds; if HF sources fail, we backfill with synthetic. | |
def lang_targets() -> dict[str, int]:
    """Return the sentence budget for each language-detection label.

    The four labels are AR, EN, FR and CS; every full-size figure is passed
    through t(), so --quick shrinks all of them.
    """
    full_sizes = (("AR", 3000), ("EN", 3000), ("FR", 3000), ("CS", 1500))
    return {label: t(count) for label, count in full_sizes}
def ner_target_per_lang() -> int:
    """Return the per-language NER target: 2000, passed through t()."""
    full_size = 2000
    return t(full_size)
def synthetic_cs_count() -> int:
    """Return how many extra synthetic code-switched sentences to generate.

    The full-size figure is 400 (also the seed for the CS class); it is
    passed through t(), so --quick shrinks it.
    """
    full_size = 400
    return t(full_size)
| # ============================================================================ | |
| # SECTION 2: HF LOADERS | |
| # ============================================================================ | |
def hf_load_opus100_monolingual(pair: str, side: str, target: int) -> list[str]:
    """Stream Helsinki-NLP/opus-100 and pull `target` clean sentences.

    Args:
        pair: e.g. "ar-en", "en-fr"
        side: which side of the pair to keep, must be one of pair.split("-")
        target: number of unique sentences to keep (length-filtered, deduped)

    Returns:
        List of clean monolingual sentences. May be shorter than target if
        the source is exhausted or the stream breaks part-way; empty if
        `side` is not part of `pair`, `target` is not positive, or streaming
        cannot start.
    """
    # A side that is not in the pair can never match a record. Without this
    # check the loop would scan the entire stream and still return nothing.
    if side not in pair.split("-"):
        print(f" [WARN] opus-100: side '{side}' is not part of pair '{pair}'")
        return []
    if target <= 0:
        return []

    try:
        from datasets import load_dataset
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] datasets import failed: {exc}")
        return []

    print(f" Streaming opus-100 [{pair}] -> '{side}' (target={target}) ...")
    try:
        ds = load_dataset(
            "Helsinki-NLP/opus-100", pair, split="train",
            streaming=True, trust_remote_code=True,
        )
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] opus-100 {pair} failed to stream: {exc}")
        return []

    out: list[str] = []
    seen: set[str] = set()
    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=target, desc=f"opus-100/{pair}/{side}", leave=False) as pbar:
        try:
            for ex in ds:
                # `or ""` tolerates records whose value for `side` is None,
                # which would otherwise abort the whole stream on .strip().
                sent = ((ex.get("translation") or {}).get(side) or "").strip()
                if not 4 <= len(sent.split()) <= 40:
                    continue
                if sent in seen:
                    continue
                seen.add(sent)
                out.append(sent)
                pbar.update(1)
                if len(out) >= target:
                    # Stop before pulling another record off the network.
                    break
        except Exception as exc:  # noqa: BLE001
            # Best-effort: keep whatever was collected before the failure.
            print(f" [WARN] opus-100 streaming interrupted: {exc}")
    print(f" -> kept {len(out)} sentences")
    return out
# MASSIVE intent names grouped under the label we assign to them. Names not
# listed here fall through to "other"; "complaint" and "farewell" have no
# MASSIVE counterpart and come purely from synthetic generation.
INTENT_MAPPING: dict[str, set[str]] = {
    "booking": {
        "takeaway_order",
        "transport_taxi",
        "transport_ticket",
        "calendar_set",
        "email_sendemail",
        "alarm_set",
        "lists_createoradd",
        "iot_coffee",
    },
    "inquiry": {
        "alarm_query",
        "calendar_query",
        "cooking_query",
        "cooking_recipe",
        "datetime_query",
        "datetime_convert",
        "email_query",
        "email_querycontact",
        "lists_query",
        "music_query",
        "news_query",
        "qa_currency",
        "qa_definition",
        "qa_factoid",
        "qa_maths",
        "qa_stock",
        "recommendation_events",
        "recommendation_locations",
        "recommendation_movies",
        "social_query",
        "takeaway_query",
        "transport_query",
        "transport_traffic",
        "weather_query",
        "audio_volume_other",
    },
    "greeting": {"general_greet"},
}


def map_massive_intent(name: str) -> str:
    """Translate a MASSIVE intent name into one of our intent labels.

    Returns the first INTENT_MAPPING key whose set contains `name`, or
    "other" when no set does.
    """
    matches = (label for label, members in INTENT_MAPPING.items() if name in members)
    return next(matches, "other")
def hf_load_massive(lang_code: str, hf_lang: str) -> pd.DataFrame:
    """Load AmazonScience/massive for one language.

    Args:
        lang_code: value written to the `language` column (e.g. "EN").
        hf_lang: MASSIVE config name passed to load_dataset (e.g. "en-US").

    Returns:
        DataFrame with columns [text, intent, language] using OUR intent
        labels (via map_massive_intent). The same three columns are present
        on every path: an empty frame on import/load failure, and whatever
        rows were collected if iteration breaks part-way.
    """
    columns = ["text", "intent", "language"]

    try:
        from datasets import load_dataset
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] datasets import failed: {exc}")
        return pd.DataFrame(columns=columns)

    print(f" Loading MASSIVE [{hf_lang}] -> {lang_code} ...")
    try:
        ds = load_dataset(
            "AmazonScience/massive", hf_lang, split="train",
            trust_remote_code=True,
        )
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] MASSIVE {hf_lang} failed: {exc}")
        return pd.DataFrame(columns=columns)

    try:
        intent_names = ds.features["intent"].names  # type: ignore[index]
    except Exception:  # noqa: BLE001
        # No label names available: fall back to str(raw value) below.
        intent_names = None

    rows: list[dict[str, Any]] = []
    try:
        for ex in ds:
            text = (ex.get("utt") or "").strip()
            if not text:
                continue
            ii = ex.get("intent")
            name = intent_names[ii] if intent_names and isinstance(ii, int) else str(ii)
            rows.append({
                "text": text,
                "intent": map_massive_intent(name),
                "language": lang_code,
            })
    except Exception as exc:  # noqa: BLE001
        # Stay resilient like the load step: keep the rows gathered so far.
        print(f" [WARN] MASSIVE {hf_lang} iteration interrupted: {exc}")

    # Explicit columns keep the schema stable even when no row survived.
    df = pd.DataFrame(rows, columns=columns)
    print(f" -> {len(df)} rows (intent dist: {df['intent'].value_counts().to_dict()})")
    return df
def hf_load_wikiann(lang_code: str, hf_lang: str, target: int) -> pd.DataFrame:
    """Load wikiann for one language (NER with PER/LOC/ORG).

    Args:
        lang_code: value written to the `language` column (e.g. "AR").
        hf_lang: wikiann config name passed to load_dataset (e.g. "ar").
        target: maximum number of sentences; larger splits are shuffled
            (seed 42) and truncated to this size.

    Returns:
        DataFrame [tokens, ner_tags, language] (lists kept as Python lists,
        tags as label strings). The same three columns are present on every
        path: an empty frame on failure, and whatever rows were collected if
        iteration breaks part-way.
    """
    columns = ["tokens", "ner_tags", "language"]

    try:
        from datasets import load_dataset
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] datasets import failed: {exc}")
        return pd.DataFrame(columns=columns)

    print(f" Loading wikiann [{hf_lang}] -> {lang_code} (target={target}) ...")
    try:
        ds = load_dataset(
            "unimelb-nlp/wikiann", hf_lang, split="train",
            trust_remote_code=True,
        )
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] wikiann {hf_lang} failed: {exc}")
        return pd.DataFrame(columns=columns)

    try:
        if len(ds) > target:
            ds = ds.shuffle(seed=42).select(range(target))
        label_names = ds.features["ner_tags"].feature.names  # type: ignore[union-attr]
    except Exception as exc:  # noqa: BLE001
        print(f" [WARN] wikiann shape unexpected: {exc}")
        return pd.DataFrame(columns=columns)

    rows: list[dict[str, Any]] = []
    try:
        for ex in tqdm(ds, desc=f"wikiann/{hf_lang}", leave=False):
            tokens = list(ex["tokens"])
            tag_ids = list(ex["ner_tags"])
            # Skip empty or misaligned sentences rather than emit bad labels.
            if not tokens or len(tokens) != len(tag_ids):
                continue
            rows.append({
                "tokens": tokens,
                "ner_tags": [label_names[i] for i in tag_ids],
                "language": lang_code,
            })
    except Exception as exc:  # noqa: BLE001
        # Stay resilient like the load step: keep the rows gathered so far.
        print(f" [WARN] wikiann {hf_lang} iteration interrupted: {exc}")

    # Explicit columns keep the schema stable even when no row survived.
    df = pd.DataFrame(rows, columns=columns)
    print(f" -> {len(df)} sentences")
    return df
| # ============================================================================ | |
| # SECTION 3: WIKIPEDIA SCRAPER | |
| # ============================================================================ | |
def scrape_wikipedia_summaries(n_per_lang: int = 50) -> dict[str, list[str]]:
    """Pull random article summaries from Wikipedia REST API in ar/en/fr.

    Polite: 0.5s delay between requests, custom User-Agent, short timeout,
    one reused HTTP session for all requests.

    Args:
        n_per_lang: number of unique extracts (5-50 words each) wanted per
            language; at most 3 * n_per_lang requests are made per language.

    Returns:
        {"AR": [...], "EN": [...], "FR": [...]}. Lists are empty when
        n_per_lang is not positive or `requests` cannot be imported, and may
        be short when the network is blocked. Failed requests never raise;
        they are counted and summarised in one warning line per language.
    """
    out: dict[str, list[str]] = {"AR": [], "EN": [], "FR": []}
    if n_per_lang <= 0:
        return out
    try:
        import requests
    except Exception:  # noqa: BLE001
        return out

    endpoints = {
        "AR": "https://ar.wikipedia.org/api/rest_v1/page/random/summary",
        "EN": "https://en.wikipedia.org/api/rest_v1/page/random/summary",
        "FR": "https://fr.wikipedia.org/api/rest_v1/page/random/summary",
    }
    headers = {
        "User-Agent": "MultilingualChatbot/0.1 (research; not for redistribution)",
        "Accept": "application/json",
    }
    max_attempts = n_per_lang * 3

    # A single session reuses connections instead of reconnecting per request.
    with requests.Session() as session:
        session.headers.update(headers)
        for lang, url in endpoints.items():
            seen: set[str] = set()
            attempts = 0
            failures = 0
            last_error: object = None
            with tqdm(total=n_per_lang, desc=f"wiki/{lang}", leave=False) as pbar:
                while len(out[lang]) < n_per_lang and attempts < max_attempts:
                    attempts += 1
                    try:
                        r = session.get(url, timeout=8)
                        if r.status_code == 200:
                            data = r.json()
                            extract = (data.get("extract") or "").strip()
                            n = len(extract.split())
                            if 5 <= n <= 50 and extract not in seen:
                                seen.add(extract)
                                out[lang].append(extract)
                                pbar.update(1)
                        else:
                            failures += 1
                            last_error = f"HTTP {r.status_code}"
                    except (requests.RequestException, ValueError, AttributeError) as exc:
                        # Network error, invalid JSON, or an unexpected
                        # payload shape: record it and keep going.
                        failures += 1
                        last_error = exc
                    time.sleep(0.5)
            print(f" wiki/{lang}: {len(out[lang])} extracts ({attempts} attempts)")
            if failures:
                print(f" [WARN] wiki/{lang}: {failures} failed requests (last: {last_error})")
    return out
| # ============================================================================ | |
| # SECTION 4: SYNTHETIC DATA GENERATION | |
| # ============================================================================ | |
| # --- Word banks --------------------------------------------------------------- | |
# Greeting phrases, one bank per language (10 entries each).
AR_GREETINGS: list[str] = [
    "مرحبا",
    "السلام عليكم",
    "أهلا",
    "صباح الخير",
    "مساء الخير",
    "أهلا وسهلا",
    "كيف حالك",
    "مرحباً بك",
    "صباح النور",
    "مساء النور",
]
EN_GREETINGS: list[str] = [
    "hello",
    "hi there",
    "good morning",
    "good evening",
    "hi",
    "hey",
    "good afternoon",
    "greetings",
    "howdy",
    "what's up",
]
FR_GREETINGS: list[str] = [
    "bonjour",
    "salut",
    "bonsoir",
    "coucou",
    "salutations",
    "ravi de vous rencontrer",
    "comment allez-vous",
    "ça va",
    "enchanté",
    "bonne journée",
]
# Farewell phrases, one bank per language.
AR_FAREWELLS: list[str] = [
    "مع السلامة",
    "وداعا",
    "إلى اللقاء",
    "أراك لاحقا",
    "تصبح على خير",
    "في أمان الله",
    "نهارك سعيد",
    "سلام",
]
EN_FAREWELLS: list[str] = [
    "goodbye",
    "bye",
    "see you later",
    "see you",
    "take care",
    "have a good one",
    "talk to you later",
    "farewell",
    "catch you later",
]
FR_FAREWELLS: list[str] = [
    "au revoir",
    "à bientôt",
    "à plus tard",
    "adieu",
    "salut",
    "bonne journée",
    "à demain",
    "à la prochaine",
    "à tout à l'heure",
]
# Booking / ordering requests, one bank per language (12 entries each).
AR_BOOKING: list[str] = [
    "أريد حجز رحلة إلى دبي",
    "احجز لي طاولة في المطعم",
    "أحتاج إلى حجز فندق ليومين",
    "اطلب لي تاكسي إلى المطار",
    "أريد طلب وجبة عشاء",
    "حدد لي موعدا مع الطبيب",
    "احجز لي تذكرة قطار غدا",
    "أحتاج إلى حجز قاعة اجتماعات",
    "اطلب لي بيتزا من المطعم القريب",
    "أريد حجز سيارة لأسبوع",
    "احجز لي شقة في القاهرة",
    "أريد أن أحجز رحلة طيران للأسبوع المقبل",
]
EN_BOOKING: list[str] = [
    "I want to book a flight to Paris",
    "Please reserve a table for two tonight",
    "Can you book me a hotel for the weekend",
    "I need a taxi to the airport",
    "Order a pizza from the nearest restaurant",
    "Schedule a meeting with the team tomorrow",
    "Book me a train ticket to Madrid",
    "I'd like to reserve a meeting room for an hour",
    "Please book a rental car for three days",
    "Order a coffee for me from the cafe",
    "Reserve seats for the 7pm movie show",
    "I want to book a doctor's appointment",
]
FR_BOOKING: list[str] = [
    "Je veux réserver un vol pour Paris",
    "Pouvez-vous réserver une table pour deux ce soir",
    "Réservez-moi un hôtel pour le week-end",
    "J'ai besoin d'un taxi pour l'aéroport",
    "Commandez une pizza du restaurant le plus proche",
    "Planifiez une réunion avec l'équipe demain",
    "Réservez-moi un billet de train pour Madrid",
    "Je voudrais réserver une salle de réunion pour une heure",
    "Réservez une voiture de location pour trois jours",
    "Commandez un café pour moi au café",
    "Réservez des places pour le film de 19h",
    "Je veux prendre rendez-vous chez le médecin",
]
# Complaint statements, one bank per language (12 entries each).
AR_COMPLAINT: list[str] = [
    "لدي مشكلة في حسابي",
    "الخدمة سيئة جدا",
    "لم يصل طلبي حتى الآن",
    "التطبيق لا يعمل بشكل صحيح",
    "أريد تقديم شكوى",
    "تم خصم المبلغ مرتين من بطاقتي",
    "المنتج الذي وصلني تالف",
    "انتظرت ساعة ولم يرد علي أحد",
    "الموقع بطيء جدا اليوم",
    "هذا غير مقبول على الإطلاق",
    "الدعم الفني لم يساعدني",
    "أريد استرداد أموالي",
]
EN_COMPLAINT: list[str] = [
    "I have a problem with my account",
    "The service is really bad",
    "My order has not arrived yet",
    "The app is not working properly",
    "I want to file a complaint",
    "I was charged twice on my card",
    "The product I received is damaged",
    "I waited an hour and nobody answered",
    "The website is very slow today",
    "This is completely unacceptable",
    "Customer support did not help me at all",
    "I want a refund for this order",
]
FR_COMPLAINT: list[str] = [
    "J'ai un problème avec mon compte",
    "Le service est vraiment mauvais",
    "Ma commande n'est toujours pas arrivée",
    "L'application ne fonctionne pas correctement",
    "Je veux déposer une plainte",
    "J'ai été facturé deux fois sur ma carte",
    "Le produit que j'ai reçu est endommagé",
    "J'ai attendu une heure et personne n'a répondu",
    "Le site web est très lent aujourd'hui",
    "C'est tout à fait inacceptable",
    "Le service client ne m'a pas du tout aidé",
    "Je veux un remboursement pour cette commande",
]
# Information-seeking questions, one bank per language (8 entries each).
AR_INQUIRY: list[str] = [
    "ما هي ساعات العمل",
    "كم تكلفة الاشتراك الشهري",
    "هل يمكنني الدفع بالبطاقة",
    "أين يقع المكتب الرئيسي",
    "كم يستغرق التوصيل",
    "هل تقدمون خدمة الإرجاع",
    "ما هي طرق الدفع المتاحة",
    "كم سعر الباقة الذهبية",
]
EN_INQUIRY: list[str] = [
    "What are your business hours",
    "How much does the monthly subscription cost",
    "Can I pay by credit card",
    "Where is the main office located",
    "How long does delivery take",
    "Do you offer a return policy",
    "What payment methods do you accept",
    "How much is the premium plan",
]
FR_INQUIRY: list[str] = [
    "Quels sont vos horaires d'ouverture",
    "Combien coûte l'abonnement mensuel",
    "Puis-je payer par carte bancaire",
    "Où se trouve le siège social",
    "Combien de temps prend la livraison",
    "Proposez-vous une politique de retour",
    "Quels modes de paiement acceptez-vous",
    "Combien coûte la formule premium",
]
# Off-topic statements used for the "other" intent (6 entries per language).
AR_OTHER: list[str] = [
    "أحب الموسيقى الكلاسيكية",
    "الجو جميل اليوم",
    "أمس ذهبت إلى السينما",
    "كرة القدم رياضة شعبية",
    "القراءة هواية رائعة",
    "أحب الطعام الإيطالي",
]
EN_OTHER: list[str] = [
    "I love classical music",
    "The weather is nice today",
    "Yesterday I went to the cinema",
    "Football is a popular sport",
    "Reading is a great hobby",
    "I love Italian food",
]
FR_OTHER: list[str] = [
    "J'aime la musique classique",
    "Il fait beau aujourd'hui",
    "Hier je suis allé au cinéma",
    "Le football est un sport populaire",
    "La lecture est un excellent passe-temps",
    "J'adore la cuisine italienne",
]
| def _sample_with_min(pool: list[str], n: int) -> list[str]: | |
| """Sample n items from pool, allowing repeats only if pool is smaller.""" | |
| if n <= len(pool): | |
| return random.sample(pool, n) | |
| out = list(pool) | |
| while len(out) < n: | |
| out.append(random.choice(pool)) | |
| return out | |
def _light_variants(sentence: str) -> list[str]:
    """Return `sentence` plus casing/punctuation variants, original first, no repeats."""
    casings = (sentence, sentence.lower(), sentence[:1].upper() + sentence[1:])
    endings = ("", ".", "!")
    # dict.fromkeys de-duplicates while keeping order; casing changes are a
    # no-op for Arabic, so those sentences only gain the punctuation variants.
    return list(dict.fromkeys(c + e for c in casings for e in endings))


def synthetic_intent_data() -> pd.DataFrame:
    """Generate synthetic examples for all 6 intents in AR/EN/FR.

    Especially important for `complaint` and `farewell` since MASSIVE has none.

    For each (intent, lang) bucket we emit up to t(80) distinct examples:
    every hand-written sentence, topped up with light casing / trailing
    punctuation variants. Earlier the bucket was padded with exact repeats
    that drop_duplicates() then removed, so each bucket collapsed to its
    6-12 seed sentences. A bucket can still fall short of the target when
    it has too few distinct variants.

    Returns:
        DataFrame with columns [text, intent, language], duplicates removed.
    """
    print(" Generating synthetic intent examples ...")
    buckets: list[tuple[str, str, list[str]]] = [
        ("greeting", "AR", AR_GREETINGS),
        ("greeting", "EN", EN_GREETINGS),
        ("greeting", "FR", FR_GREETINGS),
        ("farewell", "AR", AR_FAREWELLS),
        ("farewell", "EN", EN_FAREWELLS),
        ("farewell", "FR", FR_FAREWELLS),
        ("booking", "AR", AR_BOOKING),
        ("booking", "EN", EN_BOOKING),
        ("booking", "FR", FR_BOOKING),
        ("complaint", "AR", AR_COMPLAINT),
        ("complaint", "EN", EN_COMPLAINT),
        ("complaint", "FR", FR_COMPLAINT),
        ("inquiry", "AR", AR_INQUIRY),
        ("inquiry", "EN", EN_INQUIRY),
        ("inquiry", "FR", FR_INQUIRY),
        ("other", "AR", AR_OTHER),
        ("other", "EN", EN_OTHER),
        ("other", "FR", FR_OTHER),
    ]
    # 80 examples per bucket; t() returns 50 in --quick mode (half, floored at 50).
    target = t(80)
    rows: list[dict[str, str]] = []
    for intent, lang, pool in buckets:
        base = list(dict.fromkeys(pool))
        if len(base) > target:
            chosen = random.sample(base, target)
        else:
            # Always keep the hand-written sentences; fill the remaining
            # slots with a random selection of their variants.
            known = set(base)
            variants = dict.fromkeys(v for sent in base for v in _light_variants(sent))
            extras = [v for v in variants if v not in known]
            chosen = base + random.sample(extras, min(len(extras), target - len(base)))
        rows.extend({"text": sent, "intent": intent, "language": lang} for sent in chosen)
    # NOTE(review): "salut" and "bonne journée" appear in both FR_GREETINGS and
    # FR_FAREWELLS, so the same text is emitted under two intents; confirm
    # this ambiguity is intended.
    df = (
        pd.DataFrame(rows, columns=["text", "intent", "language"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(f" -> {len(df)} synthetic intent rows")
    return df
| # ---- Code-switched generation ------------------------------------------------ | |
# Short words/phrases appended to sentences to build code-switched samples.
CS_AR_WORDS: list[str] = [
    "شكرا",
    "بكرا",
    "اليوم",
    "الرجاء",
    "كيف حالك",
    "أحتاج",
    "مشكلة",
    "حساب",
    "رحلة",
    "تذكرة",
    "موعد",
    "غدا",
]
CS_EN_WORDS: list[str] = [
    "please",
    "thank you",
    "today",
    "tomorrow",
    "booking",
    "account",
    "problem",
    "ticket",
    "flight",
    "hotel",
    "reservation",
    "help",
]
CS_FR_WORDS: list[str] = [
    "s'il vous plaît",
    "merci",
    "aujourd'hui",
    "demain",
    "réservation",
    "compte",
    "problème",
    "billet",
    "vol",
    "hôtel",
    "rendez-vous",
]
# Arabic written in Latin script with digits standing in for letters.
ARABIZI_PHRASES: list[str] = [
    "ana bde",
    "3andi mochkil",
    "kifak",
    "shou ekhbarak",
    "wallahi",
    "ma3leesh",
    "yalla",
    "shou hayda",
    "btehki english",
    "fi 3andi reservation",
    "bdi book",
    "ma 3refet",
]
def synthetic_code_switched(n: int) -> list[str]:
    """Build up to `n` distinct code-switched sentences.

    Each sentence is assembled from a randomly chosen recipe: one random
    fragment per word bank, joined with single spaces. At most 4 * n draws
    are made, so the result can be shorter than `n` when too many draws
    collide.
    """
    print(f" Generating {n} synthetic code-switched sentences ...")

    # A recipe lists (word bank, lower-case the fragment?) pairs, left to right.
    recipes = [
        ((AR_BOOKING, False), (CS_EN_WORDS, False)),                           # AR + EN word
        ((EN_BOOKING, False), (CS_AR_WORDS, False)),                           # EN + AR word
        ((FR_BOOKING, False), (CS_EN_WORDS, False)),                           # FR + EN word
        ((AR_GREETINGS, False), (EN_GREETINGS, False)),                        # AR greeting + EN
        ((EN_GREETINGS, False), (AR_GREETINGS, False)),                        # EN + AR greeting
        ((FR_GREETINGS, False), (AR_GREETINGS, False)),                        # FR + AR
        ((AR_GREETINGS, False), (EN_GREETINGS, False), (FR_GREETINGS, False)), # 3-language mix
        ((ARABIZI_PHRASES, False), (CS_EN_WORDS, False)),                      # Arabizi + EN
        ((EN_BOOKING, True), (ARABIZI_PHRASES, False)),                        # EN + Arabizi
        ((AR_COMPLAINT, False), (CS_EN_WORDS, False)),                         # AR complaint + EN word
        ((EN_INQUIRY, True), (CS_AR_WORDS, False)),                            # EN inquiry + AR
        ((FR_INQUIRY, False), (CS_EN_WORDS, False), (CS_AR_WORDS, False)),     # FR + EN word + AR word
    ]

    def assemble(recipe) -> str:
        fragments = []
        for bank, lowercase in recipe:
            fragment = random.choice(bank)
            fragments.append(fragment.lower() if lowercase else fragment)
        return " ".join(fragments)

    # An insertion-ordered dict doubles as the seen-set and the result list.
    unique: dict[str, None] = {}
    for _ in range(n * 4):
        if len(unique) >= n:
            break
        candidate = assemble(random.choice(recipes)).strip()
        if candidate:
            unique.setdefault(candidate, None)

    result = list(unique)
    print(f" -> {len(result)} code-switched sentences")
    return result
| # ---- NER DATE-tagged synthetic ---------------------------------------------- | |
| # A handful of templates per language. Generated tokens follow simple | |
| # whitespace splitting; downstream tokenisation will re-tokenise. | |
| EN_MONTHS = ["January", "February", "March", "April", "May", "June", | |
| "July", "August", "September", "October", "November", "December"] | |
| FR_MONTHS = ["janvier", "février", "mars", "avril", "mai", "juin", | |
| "juillet", "août", "septembre", "octobre", "novembre", "décembre"] | |
| AR_MONTHS = ["يناير", "فبراير", "مارس", "أبريل", "مايو", "يونيو", | |
| "يوليو", "أغسطس", "سبتمبر", "أكتوبر", "نوفمبر", "ديسمبر"] | |
| def _date_tokens_en(day: int, month_idx: int, year: int) -> tuple[list[str], list[str]]: | |
| tokens = [str(day), EN_MONTHS[month_idx], str(year)] | |
| tags = ["B-DATE", "I-DATE", "I-DATE"] | |
| return tokens, tags | |
| def _date_tokens_fr(day: int, month_idx: int, year: int) -> tuple[list[str], list[str]]: | |
| tokens = [str(day), FR_MONTHS[month_idx], str(year)] | |
| tags = ["B-DATE", "I-DATE", "I-DATE"] | |
| return tokens, tags | |
| def _date_tokens_ar(day: int, month_idx: int, year: int) -> tuple[list[str], list[str]]: | |
| tokens = [str(day), AR_MONTHS[month_idx], str(year)] | |
| tags = ["B-DATE", "I-DATE", "I-DATE"] | |
| return tokens, tags | |
def synthetic_ner_dates(n_per_lang: int = 200) -> pd.DataFrame:
    """Build synthetic NER sentences containing DATE entities (and sometimes
    PER/LOC too) for AR/EN/FR.

    Args:
        n_per_lang: full-size sentence count per language; it is passed
            through t(), so --quick shrinks it.

    Returns:
        DataFrame [tokens, ner_tags, language] with tokens and tags as
        aligned Python lists. The three columns are present even when no
        sentence is generated.
    """
    # Scale first so the log line reports the count actually generated
    # (it used to print the unscaled value in --quick mode).
    n_per_lang = t(n_per_lang)
    print(f" Generating synthetic NER (DATE) — {n_per_lang} per language ...")
    rows: list[dict[str, Any]] = []

    # Each template is (tokens before the date, tokens after the date), given
    # as (token, tag) pairs so tokens and tags cannot drift out of alignment.
    en_templates = [
        # "On {D} I will travel to Cairo ."
        ([("On", "O")],
         [("I", "O"), ("will", "O"), ("travel", "O"), ("to", "O"), ("Cairo", "B-LOC"), (".", "O")]),
        # "Meeting with John on {D} at the office ."
        ([("Meeting", "O"), ("with", "O"), ("John", "B-PER"), ("on", "O")],
         [("at", "O"), ("the", "O"), ("office", "O"), (".", "O")]),
        # "The conference takes place on {D} in London ."
        ([("The", "O"), ("conference", "O"), ("takes", "O"), ("place", "O"), ("on", "O")],
         [("in", "O"), ("London", "B-LOC"), (".", "O")]),
    ]
    fr_templates = [
        # "Le {D} je voyagerai au Caire ."
        ([("Le", "O")],
         [("je", "O"), ("voyagerai", "O"), ("au", "O"), ("Caire", "B-LOC"), (".", "O")]),
        # "Réunion avec Jean le {D} au bureau ."
        ([("Réunion", "O"), ("avec", "O"), ("Jean", "B-PER"), ("le", "O")],
         [("au", "O"), ("bureau", "O"), (".", "O")]),
        # "La conférence aura lieu le {D} à Paris ."
        ([("La", "O"), ("conférence", "O"), ("aura", "O"), ("lieu", "O"), ("le", "O")],
         [("à", "O"), ("Paris", "B-LOC"), (".", "O")]),
    ]
    ar_templates = [
        # "في {D} سأسافر إلى القاهرة ."
        ([("في", "O")],
         [("سأسافر", "O"), ("إلى", "O"), ("القاهرة", "B-LOC"), (".", "O")]),
        # "اجتماع مع أحمد بتاريخ {D} في المكتب ."
        ([("اجتماع", "O"), ("مع", "O"), ("أحمد", "B-PER"), ("بتاريخ", "O")],
         [("في", "O"), ("المكتب", "O"), (".", "O")]),
        # "سيعقد المؤتمر يوم {D} في باريس ."
        ([("سيعقد", "O"), ("المؤتمر", "O"), ("يوم", "O")],
         [("في", "O"), ("باريس", "B-LOC"), (".", "O")]),
    ]

    def emit(lang: str, templates: list, dt_fn) -> None:
        for _ in range(n_per_lang):
            # Draw order (day, month, year, template) is kept stable so seeded
            # runs produce the same sentences as before.
            day = random.randint(1, 28)  # 28 keeps every day valid in any month
            month_idx = random.randint(0, 11)
            year = random.randint(2018, 2030)
            d_tokens, d_tags = dt_fn(day, month_idx, year)
            before, after = random.choice(templates)
            tokens = [tok for tok, _tag in before] + d_tokens + [tok for tok, _tag in after]
            tags = [tag for _tok, tag in before] + d_tags + [tag for _tok, tag in after]
            rows.append({"tokens": tokens, "ner_tags": tags, "language": lang})

    emit("EN", en_templates, _date_tokens_en)
    emit("FR", fr_templates, _date_tokens_fr)
    emit("AR", ar_templates, _date_tokens_ar)

    # Explicit columns keep the schema stable even when no row is generated.
    df = pd.DataFrame(rows, columns=["tokens", "ner_tags", "language"])
    print(f" -> {len(df)} synthetic NER sentences with DATE entities")
    return df
| # ---- Knowledge base (FAQ) ---------------------------------------------------- | |
| KNOWLEDGE_BASE_RAW: list[dict[str, str]] = [ | |
| # English — booking / travel | |
| {"language": "EN", "topic": "booking", "question": "How can I book a flight?", | |
| "answer": "You can book a flight through our website or mobile app by selecting your destination, dates, and number of passengers."}, | |
| {"language": "EN", "topic": "booking", "question": "Can I cancel my booking?", | |
| "answer": "Yes, you can cancel your booking up to 24 hours before departure for a full refund. Cancellation fees may apply afterwards."}, | |
| {"language": "EN", "topic": "booking", "question": "How do I reserve a hotel room?", | |
| "answer": "To reserve a hotel room, choose your destination, check-in and check-out dates, and the number of guests on the booking page."}, | |
| {"language": "EN", "topic": "billing", "question": "What payment methods do you accept?", | |
| "answer": "We accept credit cards (Visa, Mastercard, Amex), debit cards, PayPal, and bank transfers."}, | |
| {"language": "EN", "topic": "billing", "question": "How do I get a refund?", | |
| "answer": "Refund requests can be submitted through your account dashboard. Refunds typically take 5–10 business days to process."}, | |
| {"language": "EN", "topic": "support", "question": "How can I contact customer support?", | |
| "answer": "You can reach customer support via the in-app chat, by email at support@example.com, or by phone at +1 800 123 4567."}, | |
| {"language": "EN", "topic": "support", "question": "What are your business hours?", | |
| "answer": "Our customer support team is available 24/7. Office hours are 9:00 AM to 6:00 PM local time, Monday to Friday."}, | |
| {"language": "EN", "topic": "account", "question": "How do I reset my password?", | |
| "answer": "Click 'Forgot password' on the login screen, enter your email, and follow the link sent to your inbox to reset your password."}, | |
| {"language": "EN", "topic": "account", "question": "How do I delete my account?", | |
| "answer": "You can delete your account from Settings > Privacy. Deletion is permanent and cannot be undone."}, | |
| {"language": "EN", "topic": "shipping", "question": "How long does delivery take?", | |
| "answer": "Standard delivery takes 3–5 business days. Express delivery takes 1–2 business days for an additional fee."}, | |
| {"language": "EN", "topic": "shipping", "question": "Do you ship internationally?", | |
| "answer": "Yes, we ship to over 100 countries. International delivery typically takes 7–14 business days."}, | |
| {"language": "EN", "topic": "general", "question": "Is there a mobile app?", | |
| "answer": "Yes, our mobile app is available for free on the Apple App Store and Google Play."}, | |
| # French | |
| {"language": "FR", "topic": "booking", "question": "Comment puis-je réserver un vol ?", | |
| "answer": "Vous pouvez réserver un vol via notre site web ou notre application mobile en sélectionnant votre destination, vos dates et le nombre de passagers."}, | |
| {"language": "FR", "topic": "booking", "question": "Puis-je annuler ma réservation ?", | |
| "answer": "Oui, vous pouvez annuler votre réservation jusqu'à 24 heures avant le départ pour un remboursement complet. Des frais peuvent s'appliquer après."}, | |
| {"language": "FR", "topic": "booking", "question": "Comment réserver une chambre d'hôtel ?", | |
| "answer": "Pour réserver une chambre, choisissez la destination, les dates d'arrivée et de départ, et le nombre de personnes sur la page de réservation."}, | |
| {"language": "FR", "topic": "billing", "question": "Quels modes de paiement acceptez-vous ?", | |
| "answer": "Nous acceptons les cartes de crédit (Visa, Mastercard, Amex), les cartes de débit, PayPal et les virements bancaires."}, | |
| {"language": "FR", "topic": "billing", "question": "Comment obtenir un remboursement ?", | |
| "answer": "Les demandes de remboursement peuvent être soumises depuis votre tableau de bord. Le traitement prend généralement 5 à 10 jours ouvrables."}, | |
| {"language": "FR", "topic": "support", "question": "Comment contacter le service client ?", | |
| "answer": "Vous pouvez contacter notre service client via le chat de l'application, par email à support@example.com, ou par téléphone au +33 1 23 45 67 89."}, | |
| {"language": "FR", "topic": "support", "question": "Quels sont vos horaires d'ouverture ?", | |
| "answer": "Notre service client est disponible 24h/24 et 7j/7. Les bureaux sont ouverts du lundi au vendredi de 9h à 18h heure locale."}, | |
| {"language": "FR", "topic": "account", "question": "Comment réinitialiser mon mot de passe ?", | |
| "answer": "Cliquez sur 'Mot de passe oublié' sur l'écran de connexion, entrez votre email et suivez le lien envoyé pour réinitialiser votre mot de passe."}, | |
| {"language": "FR", "topic": "account", "question": "Comment supprimer mon compte ?", | |
| "answer": "Vous pouvez supprimer votre compte depuis Paramètres > Confidentialité. La suppression est définitive."}, | |
| {"language": "FR", "topic": "shipping", "question": "Combien de temps prend la livraison ?", | |
| "answer": "La livraison standard prend 3 à 5 jours ouvrables. La livraison express prend 1 à 2 jours moyennant un supplément."}, | |
| {"language": "FR", "topic": "shipping", "question": "Livrez-vous à l'international ?", | |
| "answer": "Oui, nous livrons dans plus de 100 pays. La livraison internationale prend généralement 7 à 14 jours ouvrables."}, | |
| {"language": "FR", "topic": "general", "question": "Existe-t-il une application mobile ?", | |
| "answer": "Oui, notre application mobile est disponible gratuitement sur l'App Store d'Apple et Google Play."}, | |
| # Arabic | |
| {"language": "AR", "topic": "booking", "question": "كيف يمكنني حجز رحلة طيران؟", | |
| "answer": "يمكنك حجز رحلة طيران عبر موقعنا الإلكتروني أو تطبيقنا المحمول من خلال اختيار وجهتك وتواريخ السفر وعدد الركاب."}, | |
| {"language": "AR", "topic": "booking", "question": "هل يمكنني إلغاء حجزي؟", | |
| "answer": "نعم، يمكنك إلغاء حجزك حتى 24 ساعة قبل الموعد للحصول على استرداد كامل. قد تنطبق رسوم إلغاء بعد ذلك."}, | |
| {"language": "AR", "topic": "booking", "question": "كيف أحجز غرفة فندق؟", | |
| "answer": "لحجز غرفة فندق، اختر الوجهة وتواريخ الوصول والمغادرة وعدد النزلاء من صفحة الحجز."}, | |
| {"language": "AR", "topic": "billing", "question": "ما هي طرق الدفع المتاحة؟", | |
| "answer": "نقبل بطاقات الائتمان (فيزا، ماستركارد، أمريكان إكسبريس)، وبطاقات الخصم، وباي بال، والتحويلات البنكية."}, | |
| {"language": "AR", "topic": "billing", "question": "كيف أحصل على استرداد المبلغ؟", | |
| "answer": "يمكن تقديم طلبات الاسترداد من خلال لوحة تحكم حسابك. تستغرق عملية الاسترداد عادة من 5 إلى 10 أيام عمل."}, | |
| {"language": "AR", "topic": "support", "question": "كيف يمكنني التواصل مع خدمة العملاء؟", | |
| "answer": "يمكنك التواصل مع خدمة العملاء عبر المحادثة داخل التطبيق، أو عبر البريد الإلكتروني support@example.com، أو هاتفيا على +966 11 234 5678."}, | |
| {"language": "AR", "topic": "support", "question": "ما هي ساعات العمل؟", | |
| "answer": "فريق خدمة العملاء متاح على مدار الساعة طوال أيام الأسبوع. ساعات العمل الإدارية من 9 صباحا إلى 6 مساء بالتوقيت المحلي من الإثنين إلى الجمعة."}, | |
| {"language": "AR", "topic": "account", "question": "كيف أعيد تعيين كلمة المرور؟", | |
| "answer": "اضغط على 'نسيت كلمة المرور' في شاشة تسجيل الدخول، ثم أدخل بريدك الإلكتروني واتبع الرابط المرسل لإعادة تعيين كلمة المرور."}, | |
| {"language": "AR", "topic": "account", "question": "كيف أحذف حسابي؟", | |
| "answer": "يمكنك حذف حسابك من الإعدادات > الخصوصية. الحذف نهائي ولا يمكن التراجع عنه."}, | |
| {"language": "AR", "topic": "shipping", "question": "كم يستغرق التوصيل؟", | |
| "answer": "يستغرق التوصيل القياسي من 3 إلى 5 أيام عمل. يستغرق التوصيل السريع من 1 إلى 2 يوم عمل مقابل رسوم إضافية."}, | |
| {"language": "AR", "topic": "shipping", "question": "هل تشحنون دوليا؟", | |
| "answer": "نعم، نشحن إلى أكثر من 100 دولة. يستغرق التوصيل الدولي عادة من 7 إلى 14 يوم عمل."}, | |
| {"language": "AR", "topic": "general", "question": "هل لديكم تطبيق محمول؟", | |
| "answer": "نعم، تطبيقنا المحمول متوفر مجانا على متجر آبل وجوجل بلاي."}, | |
| ] | |
def build_knowledge_base() -> pd.DataFrame:
    """Materialise the hand-curated FAQ entries as a DataFrame.

    Each dict in ``KNOWLEDGE_BASE_RAW`` becomes one row. A one-line size
    report (row count and number of distinct languages) is printed.

    Returns:
        The knowledge-base DataFrame built from ``KNOWLEDGE_BASE_RAW``.
    """
    kb = pd.DataFrame(KNOWLEDGE_BASE_RAW)
    n_languages = kb["language"].nunique()
    print(f" Knowledge base: {len(kb)} Q&A pairs across {n_languages} languages")
    return kb
| # ============================================================================ | |
| # SECTION 5: PIPELINE | |
| # ============================================================================ | |
def build_lang_detection_dataset(wiki_extracts: dict[str, list[str]]) -> pd.DataFrame:
    """Assemble the language-detection table (columns ``text``, ``language``).

    AR / EN / FR sentences are loaded from opus-100 and topped up with the
    supplied Wikipedia extracts. A language whose pool is still empty after
    that is backfilled from the synthetic intent phrases (repeated 50 times).
    Code-switched ("CS") rows are always synthetic.

    Args:
        wiki_extracts: Extra sentences keyed by "AR" / "EN" / "FR"; a missing
            key is treated as an empty list.

    Returns:
        A DataFrame de-duplicated on ``text``, shuffled with a fixed
        ``random_state`` and re-indexed from 0.
    """
    print("\n--- Lang detection dataset ---")
    quota = lang_targets()

    # Dict literals evaluate top to bottom, so the loads run AR -> EN -> FR.
    # Arabic and English both come from the ar-en pair, French from en-fr.
    pools = {
        "AR": hf_load_opus100_monolingual("ar-en", "ar", target=quota["AR"]),
        "EN": hf_load_opus100_monolingual("ar-en", "en", target=quota["EN"]),
        "FR": hf_load_opus100_monolingual("en-fr", "fr", target=quota["FR"]),
    }

    # Wikipedia extracts are a small extra signal on top of opus-100.
    for code in pools:
        pools[code] += wiki_extracts.get(code, [])

    # Backfill from the synthetic phrases only when a pool is completely empty.
    if not pools["AR"]:
        pools["AR"] = (AR_GREETINGS + AR_BOOKING + AR_COMPLAINT + AR_INQUIRY + AR_OTHER) * 50
    if not pools["EN"]:
        pools["EN"] = (EN_GREETINGS + EN_BOOKING + EN_COMPLAINT + EN_INQUIRY + EN_OTHER) * 50
    if not pools["FR"]:
        pools["FR"] = (FR_GREETINGS + FR_BOOKING + FR_COMPLAINT + FR_INQUIRY + FR_OTHER) * 50

    # Shuffle first, then cap, so the kept subset is a random sample.
    frames: list[pd.DataFrame] = []
    for code, sentences in pools.items():
        random.shuffle(sentences)
        capped = sentences[: quota[code]]
        frames.append(pd.DataFrame({"text": capped, "language": code}))

    # Over-generate code-switched sentences, drop repeats while keeping order,
    # then cap at the CS quota.
    cs_pool = synthetic_code_switched(quota["CS"] + synthetic_cs_count())
    unique_cs = list(dict.fromkeys(cs_pool))[: quota["CS"]]
    frames.append(pd.DataFrame({"text": unique_cs, "language": "CS"}))

    combined = pd.concat(frames, ignore_index=True).drop_duplicates(subset=["text"])
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)
def build_intent_dataset() -> pd.DataFrame:
    """Assemble the intent table from MASSIVE plus the synthetic examples.

    MASSIVE is loaded for ar-SA, en-US and fr-FR; a locale whose frame comes
    back empty is skipped. The synthetic examples are always appended.

    Returns:
        A DataFrame with exact (text, intent, language) duplicates removed and
        only texts of 2 to 300 characters kept, re-indexed from 0.
    """
    print("\n--- Intent dataset ---")
    massive_locales = (("AR", "ar-SA"), ("EN", "en-US"), ("FR", "fr-FR"))

    # The generator is consumed in order, so loads still run AR -> EN -> FR.
    loaded = (hf_load_massive(code, locale) for code, locale in massive_locales)
    frames = [frame for frame in loaded if not frame.empty]
    frames.append(synthetic_intent_data())

    merged = pd.concat(frames, ignore_index=True)
    merged = merged.drop_duplicates(subset=["text", "intent", "language"])
    length_ok = merged["text"].str.len().between(2, 300)
    return merged[length_ok].reset_index(drop=True)
def build_ner_dataset() -> pd.DataFrame:
    """Assemble the NER table from wikiann plus synthetic DATE sentences.

    wikiann is loaded for ar / en / fr; a language whose frame comes back
    empty is skipped. 200 synthetic DATE sentences per language are always
    appended.

    Returns:
        A DataFrame, re-indexed from 0, containing only rows whose ``tokens``
        and ``ner_tags`` have the same length.
    """
    print("\n--- NER dataset ---")
    frames: list[pd.DataFrame] = []
    for code, wikiann_config in (("AR", "ar"), ("EN", "en"), ("FR", "fr")):
        loaded = hf_load_wikiann(code, wikiann_config, target=ner_target_per_lang())
        if loaded.empty:
            continue
        frames.append(loaded)
    frames.append(synthetic_ner_dates(n_per_lang=200))

    combined = pd.concat(frames, ignore_index=True).reset_index(drop=True)

    # Defensive: a row whose tag list does not line up with its tokens is dropped.
    n_tokens = combined["tokens"].apply(len)
    n_tags = combined["ner_tags"].apply(len)
    return combined[n_tokens == n_tags].reset_index(drop=True)
| # ============================================================================ | |
| # SECTION 6: SAVE & SUMMARY | |
| # ============================================================================ | |
def save_lang(df: pd.DataFrame) -> Path:
    """Write the language-detection frame to CSV (no index column).

    Returns:
        The path of the written file, ``RAW / "lang_detection_data.csv"``.
    """
    destination = RAW / "lang_detection_data.csv"
    df.to_csv(destination, index=False)
    return destination
def save_intent(df: pd.DataFrame) -> Path:
    """Write the intent frame to CSV (no index column).

    Returns:
        The path of the written file, ``RAW / "intent_data.csv"``.
    """
    destination = RAW / "intent_data.csv"
    df.to_csv(destination, index=False)
    return destination
def save_ner(df: pd.DataFrame) -> Path:
    """Write the NER frame to ``RAW / "ner_data.csv"`` and return that path.

    The ``tokens`` and ``ner_tags`` columns hold lists, which a CSV cell
    cannot represent directly, so each cell is JSON-encoded. Readers get the
    lists back with ``json.loads``. The caller's frame is left untouched; the
    encoding is applied to a copy.

    Args:
        df: NER frame with list-valued ``tokens`` and ``ner_tags`` columns.

    Returns:
        The path of the written CSV file.
    """
    p = RAW / "ner_data.csv"

    def encode(cell: Any) -> str:
        # ensure_ascii=False keeps Arabic / accented tokens as literal text
        # instead of \uXXXX escapes: the file is smaller and readable, and
        # json.loads decodes both forms to the same list.
        return json.dumps(cell, ensure_ascii=False)

    out = df.copy()
    for column in ("tokens", "ner_tags"):
        out[column] = out[column].apply(encode)
    out.to_csv(p, index=False)
    return p
def save_kb(df: pd.DataFrame) -> Path:
    """Write the knowledge-base frame to CSV (no index column).

    Returns:
        The path of the written file, ``RAW / "knowledge_base.csv"``.
    """
    destination = RAW / "knowledge_base.csv"
    df.to_csv(destination, index=False)
    return destination
def print_summary(lang_df: pd.DataFrame, intent_df: pd.DataFrame,
                  ner_df: pd.DataFrame, kb_df: pd.DataFrame) -> None:
    """Print a human-readable report covering the four collected datasets.

    For each dataset the row count and per-language counts are shown. The
    intent dataset also gets its intent counts and a language-by-intent table,
    the NER dataset its 12 most frequent tags, and the knowledge base its
    topic counts. Nothing is returned.
    """

    def distribution(frame: pd.DataFrame, column: str) -> dict:
        # value -> number of rows, most frequent first.
        return frame[column].value_counts().to_dict()

    banner = "=" * 72
    print("\n" + banner)
    print("COLLECTION SUMMARY")
    print(banner)

    print(f"\nlang_detection_data.csv rows: {len(lang_df)}")
    print(f" language distribution: {distribution(lang_df, 'language')}")

    print(f"\nintent_data.csv rows: {len(intent_df)}")
    print(f" language distribution: {distribution(intent_df, 'language')}")
    print(f" intent distribution : {distribution(intent_df, 'intent')}")
    pair_counts = intent_df.groupby(["language", "intent"]).size()
    table = pair_counts.unstack(fill_value=0)
    print(" intent x language:")
    for row_text in table.to_string().splitlines():
        print(f" {row_text}")

    print(f"\nner_data.csv rows: {len(ner_df)}")
    print(f" language distribution: {distribution(ner_df, 'language')}")
    # Flatten every sentence's tag list into one frequency table.
    tag_counts = Counter(tag for sentence_tags in ner_df["ner_tags"] for tag in sentence_tags)
    top = dict(tag_counts.most_common(12))
    print(f" top tag frequencies : {top}")

    print(f"\nknowledge_base.csv rows: {len(kb_df)}")
    print(f" language distribution: {distribution(kb_df, 'language')}")
    print(f" topic distribution : {distribution(kb_df, 'topic')}")
    print()
| # ============================================================================ | |
| # MAIN | |
| # ============================================================================ | |
def main(argv: list[str] | None = None) -> int:
    """Run the full collection pipeline and write the four CSVs under ``RAW``.

    Steps, in order: optional Wikipedia scrape, language-detection dataset,
    intent dataset, NER dataset, knowledge base, then a printed summary and a
    list of the written files with their sizes. Each dataset build is wrapped
    in its own ``try`` so one failing source does not abort the others.

    Side effect: sets the module-global ``QUICK`` from ``--quick``.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, so a plain
            ``main()`` call behaves as it always did.

    Returns:
        The process exit code; 0 whenever the function runs to completion.
    """
    global QUICK

    parser = argparse.ArgumentParser(description="Collect chatbot training data.")
    parser.add_argument("--quick", action="store_true",
                        help="Use halved sizes for fast smoke testing.")
    parser.add_argument("--no-wiki", action="store_true",
                        help="Skip the Wikipedia REST API scraper.")
    args = parser.parse_args(argv)
    QUICK = args.quick

    print(f"Multilingual Chatbot — data collection (quick={QUICK})")
    print(f"Output dir: {RAW}\n")

    # The save_* calls below are not inside a try block, so a missing output
    # directory would abort the run after the expensive downloads. Creating it
    # here is a no-op when it already exists.
    RAW.mkdir(parents=True, exist_ok=True)

    # 1. Wikipedia (small, polite)
    if args.no_wiki:
        wiki_extracts: dict[str, list[str]] = {"AR": [], "EN": [], "FR": []}
        print("--- Wikipedia scrape: SKIPPED (--no-wiki) ---")
    else:
        print("--- Wikipedia REST summaries (50/lang, 0.5s delay) ---")
        try:
            wiki_extracts = scrape_wikipedia_summaries(n_per_lang=50)
        except Exception as exc:  # noqa: BLE001
            print(f" [WARN] Wikipedia scrape failed entirely: {exc}")
            wiki_extracts = {"AR": [], "EN": [], "FR": []}

    # 2. Lang detection
    try:
        lang_df = build_lang_detection_dataset(wiki_extracts)
    except Exception as exc:  # noqa: BLE001
        print(f"[ERROR] lang detection build failed: {exc}")
        # Last resort: a header-only CSV so downstream readers still find the file.
        lang_df = pd.DataFrame(columns=["text", "language"])
    lang_path = save_lang(lang_df)

    # 3. Intent
    try:
        intent_df = build_intent_dataset()
    except Exception as exc:  # noqa: BLE001
        print(f"[ERROR] intent build failed: {exc}")
        intent_df = synthetic_intent_data()
    intent_path = save_intent(intent_df)

    # 4. NER
    try:
        ner_df = build_ner_dataset()
    except Exception as exc:  # noqa: BLE001
        print(f"[ERROR] NER build failed: {exc}")
        ner_df = synthetic_ner_dates(n_per_lang=200)
    ner_path = save_ner(ner_df)

    # 5. Knowledge base
    kb_df = build_knowledge_base()
    kb_path = save_kb(kb_df)

    print_summary(lang_df, intent_df, ner_df, kb_df)

    print("Files written:")
    # Report the paths the save helpers actually returned rather than
    # re-spelling the file names here.
    for p in [lang_path, intent_path, ner_path, kb_path]:
        size = p.stat().st_size / 1024 if p.exists() else 0
        print(f" {p} ({size:.1f} KB)")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code, or with 130 when
    # the user interrupts the run with Ctrl-C.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print("\nAborted by user.")
        exit_code = 130
    sys.exit(exit_code)