#!/usr/bin/env python3 """Build TeacherVoice-shaped FR/AR chat JSONL from Hugging Face sources + seeds. Exports: research/data/language-lesson-fr.jsonl research/data/language-lesson-ar.jsonl research/data/language-lesson-eval-fr.jsonl (5% holdout) research/data/language-lesson-eval-ar.jsonl Usage: uv run python research/data/build_language_lesson_chat.py uv run python research/data/build_language_lesson_chat.py --max-per-source 500 --skip-hub """ from __future__ import annotations import argparse import json import random import re import sys from collections.abc import Iterator from pathlib import Path from typing import Any, Literal import yaml _REPO_ROOT = Path(__file__).resolve().parents[2] _DATA_DIR = Path(__file__).resolve().parent if str(_REPO_ROOT) not in sys.path: sys.path.insert(0, str(_REPO_ROOT)) from echocoach.prompts import ( # noqa: E402 system_prompt_for_mode, topic_context_block, ) from echocoach.teacher_voice import _VOICE_USER_SUFFIX # noqa: E402 VoiceMode = Literal["explain", "lesson"] MIN_ASSISTANT_CHARS = 40 MAX_ASSISTANT_CHARS = 600 EVAL_HOLDOUT_RATIO = 0.05 DEFAULT_FR_SOURCES = ( "FrancophonIA/english_french", "angeluriot/french_instruct", "CohereLabs/aya_dataset", "pinzhenchen/alpaca-cleaned-fr", "jpacifico/French-Alpaca-dataset-Instruct-110K", ) DEFAULT_AR_SOURCES = ( "arbml/CIDAR", "ClusterlabAi/InstAr-500k", "CohereLabs/aya_dataset", ) SOURCE_CAPS: dict[str, dict[str, int]] = { "FrancophonIA/english_french": {"fr": 4000}, "angeluriot/french_instruct": {"fr": 8000}, "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000}, "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000}, "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": 4000}, "arbml/CIDAR": {"ar": 8000}, "ClusterlabAi/InstAr-500k": {"ar": 5000}, } _INSTAR_GOOD_TASKS = frozenset( { "Open QA", "Extraction and Explanation", "Summarization", "Classification", } ) _CODE_MARKERS = re.compile(r"```|^\s*def |^\s*class |^\s*import ", re.MULTILINE) _JSON_START = re.compile(r"^\s*[\{\[]") def _assistant_ok(text: str) -> bool: text = (text or "").strip() if len(text) < MIN_ASSISTANT_CHARS or len(text) > MAX_ASSISTANT_CHARS: return False if _JSON_START.match(text): return False if _CODE_MARKERS.search(text): return False if text.count("\n") > 8: return False return True def _pick_mode(rng: random.Random, *, topic: str | None) -> VoiceMode: if topic and rng.random() < 0.4: return "lesson" return "explain" if rng.random() < 0.6 else "lesson" def _wrap_row( *, language: str, mode: VoiceMode, user_text: str, assistant_text: str, topic: str | None = None, ) -> dict[str, Any]: system = system_prompt_for_mode(mode, language=language) topic_line = topic_context_block(topic, mode) if topic_line: system = f"{system}\n\n{topic_line}" user_body = f"{user_text.strip()}\n\n{_VOICE_USER_SUFFIX}" return { "messages": [ {"role": "system", "content": system}, {"role": "user", "content": user_body}, {"role": "assistant", "content": assistant_text.strip()}, ] } def _load_seeds(path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: if not path.is_file(): return [], [] raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {} fr_rows: list[dict[str, Any]] = [] ar_rows: list[dict[str, Any]] = [] for lang, key in (("fr", "fr"), ("ar", "ar")): for item in raw.get(key, []): mode = item.get("mode", "explain") topic = item.get("topic") if topic in (None, "null", ""): topic = None row = _wrap_row( language=lang, mode=mode, # type: ignore[arg-type] user_text=str(item["user"]), assistant_text=str(item["assistant"]), topic=str(topic) if topic else None, ) (fr_rows if key == "fr" else ar_rows).append(row) return fr_rows, ar_rows def _iter_english_french(max_rows: int) -> Iterator[tuple[str, str, str | None]]: """EN→FR parallel sentences — user asks in English, coach replies in French.""" from datasets import load_dataset ds = load_dataset("FrancophonIA/english_french", split="train", streaming=True) count = 0 for row in ds: english = (row.get("english") or "").strip() french = (row.get("french") or "").strip() if english and _assistant_ok(french): user = f"Translate the following to French:\n{english}" yield user, french, None count += 1 if count >= max_rows: break def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset("angeluriot/french_instruct", split="train", streaming=True) count = 0 for row in ds: messages = row.get("messages") or row.get("conversation") if not messages: continue user_text = "" assistant_text = "" for msg in messages: role = (msg.get("role") or msg.get("from") or "").lower() content = (msg.get("content") or msg.get("value") or "").strip() if role in ("user", "human"): user_text = content elif role in ("assistant", "gpt", "bot") and content: assistant_text = content if user_text and _assistant_ok(assistant_text): yield user_text, assistant_text, None count += 1 if count >= max_rows: break def _iter_aya(language_code: str, max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset("CohereLabs/aya_dataset", split="train") count = 0 for row in ds: if row.get("language") != language_code: continue user_text = (row.get("inputs") or "").strip() assistant_text = (row.get("targets") or "").strip() if user_text and _assistant_ok(assistant_text): yield user_text, assistant_text, None count += 1 if count >= max_rows: break def _iter_alpaca_fr(max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset("pinzhenchen/alpaca-cleaned-fr", split="train") count = 0 for row in ds: instruction = (row.get("instruction") or "").strip() inp = (row.get("input") or "").strip() output = (row.get("output") or "").strip() user_text = f"{instruction}\n{inp}".strip() if inp else instruction if user_text and _assistant_ok(output): yield user_text, output, None count += 1 if count >= max_rows: break def _iter_french_alpaca_110k(max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset( "jpacifico/French-Alpaca-dataset-Instruct-110K", split="train", streaming=True ) count = 0 for row in ds: instruction = (row.get("instruction") or "").strip() inp = (row.get("input") or "").strip() output = (row.get("output") or "").strip() user_text = f"{instruction}\n{inp}".strip() if inp else instruction if user_text and _assistant_ok(output): yield user_text, output, None count += 1 if count >= max_rows: break def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset("arbml/CIDAR", split="train") count = 0 for row in ds: instruction = (row.get("instruction") or "").strip() inp = (row.get("input") or "").strip() output = (row.get("output") or "").strip() user_text = f"{instruction}\n{inp}".strip() if inp else instruction topic = instruction[:80] if instruction else None if user_text and _assistant_ok(output): yield user_text, output, topic count += 1 if count >= max_rows: break def _iter_instar(max_rows: int) -> Iterator[tuple[str, str, str | None]]: from datasets import load_dataset ds = load_dataset("ClusterlabAi/InstAr-500k", split="train", streaming=True) count = 0 for row in ds: task = row.get("task") or "" if task not in _INSTAR_GOOD_TASKS: continue instruction = (row.get("instruction") or "").strip() output = (row.get("output") or "").strip() topic = (row.get("topic") or "").strip() or None if instruction and _assistant_ok(output): yield instruction, output, topic count += 1 if count >= max_rows: break _SOURCE_LOADERS: dict[str, dict[str, Any]] = { "FrancophonIA/english_french": {"fr": _iter_english_french}, "angeluriot/french_instruct": {"fr": _iter_french_instruct}, "CohereLabs/aya_dataset": { "fr": lambda n: _iter_aya("fra", n), "ar": lambda n: _iter_aya("arb", n), }, "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr}, "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": _iter_french_alpaca_110k}, "arbml/CIDAR": {"ar": _iter_cidar}, "ClusterlabAi/InstAr-500k": {"ar": _iter_instar}, } def _collect_from_source( source: str, language: str, max_rows: int, rng: random.Random, ) -> list[dict[str, Any]]: loaders = _SOURCE_LOADERS.get(source, {}) loader = loaders.get(language) if loader is None: print(f" skip {source} (no loader for {language})") return [] rows: list[dict[str, Any]] = [] try: for user_text, assistant_text, topic in loader(max_rows): mode = _pick_mode(rng, topic=topic) rows.append( _wrap_row( language=language, mode=mode, user_text=user_text, assistant_text=assistant_text, topic=topic, ) ) except Exception as exc: print(f" warning: {source} failed for {language}: {exc}") return rows def _split_eval( rows: list[dict[str, Any]], rng: random.Random ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: if len(rows) < 20: return rows, [] shuffled = rows.copy() rng.shuffle(shuffled) n_eval = max(1, int(len(shuffled) * EVAL_HOLDOUT_RATIO)) return shuffled[n_eval:], shuffled[:n_eval] def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8") as fh: for row in rows: fh.write(json.dumps(row, ensure_ascii=False) + "\n") def build_language_datasets( *, french_sources: tuple[str, ...], arabic_sources: tuple[str, ...], max_per_source: int, seeds_path: Path, skip_hub: bool, seed: int, ) -> None: rng = random.Random(seed) fr_rows, ar_rows = _load_seeds(seeds_path) print(f"Loaded {len(fr_rows)} FR + {len(ar_rows)} AR seed rows from {seeds_path.name}") if not skip_hub: for source in french_sources: cap = min(max_per_source, SOURCE_CAPS.get(source, {}).get("fr", max_per_source)) print(f"Fetching FR from {source} (cap={cap})...") fr_rows.extend(_collect_from_source(source, "fr", cap, rng)) for source in arabic_sources: cap = min(max_per_source, SOURCE_CAPS.get(source, {}).get("ar", max_per_source)) print(f"Fetching AR from {source} (cap={cap})...") ar_rows.extend(_collect_from_source(source, "ar", cap, rng)) fr_train, fr_eval = _split_eval(fr_rows, rng) ar_train, ar_eval = _split_eval(ar_rows, rng) out_fr = _DATA_DIR / "language-lesson-fr.jsonl" out_ar = _DATA_DIR / "language-lesson-ar.jsonl" eval_fr = _DATA_DIR / "language-lesson-eval-fr.jsonl" eval_ar = _DATA_DIR / "language-lesson-eval-ar.jsonl" _write_jsonl(out_fr, fr_train) _write_jsonl(out_ar, ar_train) _write_jsonl(eval_fr, fr_eval) _write_jsonl(eval_ar, ar_eval) print( f"Wrote FR train={len(fr_train)} eval={len(fr_eval)} -> {out_fr.name}, {eval_fr.name}\n" f"Wrote AR train={len(ar_train)} eval={len(ar_eval)} -> {out_ar.name}, {eval_ar.name}" ) def main() -> None: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--french-sources", default=",".join(DEFAULT_FR_SOURCES), help="Comma-separated Hugging Face dataset ids for French", ) parser.add_argument( "--arabic-sources", default=",".join(DEFAULT_AR_SOURCES), help="Comma-separated Hugging Face dataset ids for Arabic", ) parser.add_argument("--max-per-source", type=int, default=5000) parser.add_argument( "--custom-seeds", type=Path, default=_DATA_DIR / "language-lesson-seeds.yaml", ) parser.add_argument( "--skip-hub", action="store_true", help="Only write seed rows (offline / smoke)", ) parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() french_sources = tuple(s.strip() for s in args.french_sources.split(",") if s.strip()) arabic_sources = tuple(s.strip() for s in args.arabic_sources.split(",") if s.strip()) build_language_datasets( french_sources=french_sources, arabic_sources=arabic_sources, max_per_source=args.max_per_source, seeds_path=args.custom_seeds, skip_hub=args.skip_hub, seed=args.seed, ) if __name__ == "__main__": main()