#!/usr/bin/env python3
"""Build TeacherVoice-shaped FR/AR chat JSONL from Hugging Face sources + seeds.

Exports:
  research/data/language-lesson-fr.jsonl
  research/data/language-lesson-ar.jsonl
  research/data/language-lesson-eval-fr.jsonl  (5% holdout)
  research/data/language-lesson-eval-ar.jsonl

Usage:
  uv run python research/data/build_language_lesson_chat.py
  uv run python research/data/build_language_lesson_chat.py --max-per-source 500 --skip-hub
"""

from __future__ import annotations

import argparse
import json
import random
import re
import sys
from collections.abc import Iterator
from pathlib import Path
from typing import Any, Literal

import yaml

# `_REPO_ROOT` is `parents[2]` of this file's resolved path; `_DATA_DIR` is the
# directory containing this file (used below for the default seeds path and
# for every output file).
_REPO_ROOT = Path(__file__).resolve().parents[2]
_DATA_DIR = Path(__file__).resolve().parent
# Prepend the repo root to sys.path (once) before the `echocoach` imports that
# follow — presumably so they resolve when this file is run as a script.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

from echocoach.prompts import (  # noqa: E402
    system_prompt_for_mode,
    topic_context_block,
)
from echocoach.teacher_voice import _VOICE_USER_SUFFIX  # noqa: E402

# The two mode labels passed to `system_prompt_for_mode` / `topic_context_block`.
VoiceMode = Literal["explain", "lesson"]

# Inclusive length window (in characters, after stripping) that an assistant
# reply must fall into to be accepted by `_assistant_ok`.
MIN_ASSISTANT_CHARS = 40
MAX_ASSISTANT_CHARS = 600
# Fraction of rows moved to the eval files by `_split_eval`.
EVAL_HOLDOUT_RATIO = 0.05

# Default Hugging Face dataset ids used for `--french-sources`.
DEFAULT_FR_SOURCES = (
    "FrancophonIA/english_french",
    "angeluriot/french_instruct",
    "CohereLabs/aya_dataset",
    "pinzhenchen/alpaca-cleaned-fr",
    "jpacifico/French-Alpaca-dataset-Instruct-110K",
)
# Default Hugging Face dataset ids used for `--arabic-sources`.
DEFAULT_AR_SOURCES = (
    "arbml/CIDAR",
    "ClusterlabAi/InstAr-500k",
    "CohereLabs/aya_dataset",
)

# Per-source, per-language row caps. `build_language_datasets` takes the
# minimum of this value and `--max-per-source`; sources missing here fall back
# to `--max-per-source` alone.
SOURCE_CAPS: dict[str, dict[str, int]] = {
    "FrancophonIA/english_french": {"fr": 4000},
    "angeluriot/french_instruct": {"fr": 8000},
    "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
    "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
    "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": 4000},
    "arbml/CIDAR": {"ar": 8000},
    "ClusterlabAi/InstAr-500k": {"ar": 5000},
}

# Values of the InstAr `task` field that `_iter_instar` keeps; rows with any
# other task value are skipped.
_INSTAR_GOOD_TASKS = frozenset(
    {
        "Open QA",
        "Extraction and Explanation",
        "Summarization",
        "Classification",
    }
)

# Matches a code fence anywhere, or a line that starts (after optional
# whitespace) with `def `, `class ` or `import `. Used to reject code-like replies.
_CODE_MARKERS = re.compile(r"```|^\s*def |^\s*class |^\s*import ", re.MULTILINE)
# Matches text whose first non-whitespace character is `{` or `[`. Used to
# reject replies that look like raw JSON.
_JSON_START = re.compile(r"^\s*[\{\[]")


def _assistant_ok(text: str) -> bool:
    """Return True when *text* is acceptable as an assistant reply.

    After stripping (``None`` is treated as empty), the reply must:

    * be between ``MIN_ASSISTANT_CHARS`` and ``MAX_ASSISTANT_CHARS`` characters
      long, inclusive;
    * not start with ``{`` or ``[`` (``_JSON_START``);
    * not contain a code marker (``_CODE_MARKERS``);
    * contain at most 8 newline characters.
    """
    candidate = (text or "").strip()
    if not MIN_ASSISTANT_CHARS <= len(candidate) <= MAX_ASSISTANT_CHARS:
        return False
    if _JSON_START.match(candidate) or _CODE_MARKERS.search(candidate):
        return False
    return candidate.count("\n") <= 8


def _pick_mode(rng: random.Random, *, topic: str | None) -> VoiceMode:
    """Randomly choose the voice mode for one row.

    When *topic* is truthy, a first draw below 0.4 selects ``"lesson"``
    outright. Otherwise (or when there is no topic, in which case that first
    draw is not made) a further draw selects ``"explain"`` if it is below 0.6
    and ``"lesson"`` if not.

    Args:
        rng: Random source; one or two ``random()`` draws are consumed.
        topic: Optional topic attached to the row.
    """
    # `and` short-circuits, so the RNG is untouched here when there is no topic.
    lesson_from_topic = bool(topic) and rng.random() < 0.4
    if lesson_from_topic:
        return "lesson"
    if rng.random() < 0.6:
        return "explain"
    return "lesson"


def _wrap_row(
    *,
    language: str,
    mode: VoiceMode,
    user_text: str,
    assistant_text: str,
    topic: str | None = None,
) -> dict[str, Any]:
    """Assemble one chat row of the form ``{"messages": [system, user, assistant]}``.

    The system message is the result of ``system_prompt_for_mode(mode,
    language=language)``; if ``topic_context_block(topic, mode)`` returns a
    truthy value it is appended after a blank line. The user message is the
    stripped *user_text* followed by a blank line and ``_VOICE_USER_SUFFIX``.
    The assistant message is the stripped *assistant_text*.
    """
    system_content = system_prompt_for_mode(mode, language=language)
    topic_extra = topic_context_block(topic, mode)
    if topic_extra:
        system_content = f"{system_content}\n\n{topic_extra}"

    turns = [
        ("system", system_content),
        ("user", f"{user_text.strip()}\n\n{_VOICE_USER_SUFFIX}"),
        ("assistant", assistant_text.strip()),
    ]
    return {"messages": [{"role": role, "content": content} for role, content in turns]}


def _load_seeds(path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Load hand-written seed conversations from a YAML file.

    The file is read as a mapping with optional ``fr`` and ``ar`` keys, each a
    list of items carrying ``user`` and ``assistant`` (required) plus optional
    ``mode`` (defaults to ``"explain"``) and ``topic``. Every item is turned
    into a chat row with ``_wrap_row``.

    Args:
        path: Location of the seeds YAML file.

    Returns:
        ``(fr_rows, ar_rows)``. Both lists are empty when *path* is not a file.

    Raises:
        KeyError: If an item lacks ``user`` or ``assistant``.
    """
    if not path.is_file():
        return [], []
    # safe_load returns None for an empty document; treat that as "no seeds".
    raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    rows_by_lang: dict[str, list[dict[str, Any]]] = {"fr": [], "ar": []}
    for lang, bucket in rows_by_lang.items():
        # A key written with no entries (`fr:`) parses as None, not as a
        # missing key, so `.get(lang, [])` alone would hand None to the loop.
        for item in raw.get(lang) or []:
            # An explicit `mode: null` falls back to the default as well.
            mode = item.get("mode") or "explain"
            topic = item.get("topic")
            # The literal string "null" is accepted as "no topic" too.
            if topic in (None, "null", ""):
                topic = None
            bucket.append(
                _wrap_row(
                    language=lang,
                    mode=mode,  # type: ignore[arg-type]
                    user_text=str(item["user"]),
                    assistant_text=str(item["assistant"]),
                    topic=str(topic) if topic else None,
                )
            )
    return rows_by_lang["fr"], rows_by_lang["ar"]


def _iter_english_french(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield EN→FR translation pairs from ``FrancophonIA/english_french``.

    The English sentence is wrapped in a "Translate the following to French"
    request and the French sentence becomes the assistant reply. A row is kept
    only when the English side is non-empty and the French side passes
    ``_assistant_ok``.

    Args:
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not open the dataset.

    Yields:
        ``(user_text, assistant_text, topic)`` with ``topic`` always ``None``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("FrancophonIA/english_french", split="train", streaming=True)
    count = 0
    for row in ds:
        # Reads the `english` / `french` fields; missing or None counts as empty.
        english = (row.get("english") or "").strip()
        french = (row.get("french") or "").strip()
        if english and _assistant_ok(french):
            user = f"Translate the following to French:\n{english}"
            yield user, french, None
            count += 1
            if count >= max_rows:
                break


def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield one (user, assistant) exchange per ``angeluriot/french_instruct`` row.

    Each row's message list is read from ``messages`` or ``conversation``. The
    exchange yielded is the last assistant message together with the user
    message that most recently preceded it, so a trailing unanswered user turn
    can no longer be matched with an earlier reply. Rows are kept only when the
    assistant side passes ``_assistant_ok``.

    Args:
        max_rows: Maximum number of exchanges to yield; zero or negative yields
            nothing and does not open the dataset.

    Yields:
        ``(user_text, assistant_text, topic)`` with ``topic`` always ``None``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("angeluriot/french_instruct", split="train", streaming=True)
    count = 0
    for row in ds:
        messages = row.get("messages") or row.get("conversation")
        if not messages:
            continue
        user_text = ""
        assistant_text = ""
        pending_user = ""
        for msg in messages:
            role = (msg.get("role") or msg.get("from") or "").lower()
            # NOTE(review): the dataset card appears to store the message body
            # under `text`; it is accepted as a further fallback — confirm
            # against the hub revision in use.
            content = (
                msg.get("content") or msg.get("value") or msg.get("text") or ""
            ).strip()
            if role in ("user", "human"):
                pending_user = content
            elif role in ("assistant", "gpt", "bot") and content and pending_user:
                # Commit the pair only when a reply actually follows a user turn.
                user_text, assistant_text = pending_user, content
        if user_text and _assistant_ok(assistant_text):
            yield user_text, assistant_text, None
            count += 1
            if count >= max_rows:
                break


def _iter_aya(language_code: str, max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield (inputs, targets) pairs for one language from ``CohereLabs/aya_dataset``.

    A row is selected when *language_code* equals either its ``language_code``
    or its ``language`` field, and kept when ``inputs`` is non-empty and
    ``targets`` passes ``_assistant_ok``.

    Args:
        language_code: Language identifier to match (this file passes
            ``"fra"`` and ``"arb"``).
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not load the dataset.

    Yields:
        ``(user_text, assistant_text, topic)`` with ``topic`` always ``None``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("CohereLabs/aya_dataset", split="train")
    count = 0
    for row in ds:
        # NOTE(review): the Aya dataset card appears to keep ISO-style codes
        # ("fra", "arb") in `language_code` and full names in `language`;
        # comparing a code against `language` alone would then match nothing.
        # Matching either column keeps the previous behaviour — confirm
        # against the hub revision in use.
        if language_code not in (row.get("language_code"), row.get("language")):
            continue
        user_text = (row.get("inputs") or "").strip()
        assistant_text = (row.get("targets") or "").strip()
        if user_text and _assistant_ok(assistant_text):
            yield user_text, assistant_text, None
            count += 1
            if count >= max_rows:
                break


def _iter_alpaca_fr(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield instruction/output pairs from ``pinzhenchen/alpaca-cleaned-fr``.

    The user text is the ``instruction`` field, followed on a new line by the
    ``input`` field when that is non-empty. A row is kept when the user text
    is non-empty and ``output`` passes ``_assistant_ok``.

    Args:
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not load the dataset.

    Yields:
        ``(user_text, assistant_text, topic)`` with ``topic`` always ``None``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("pinzhenchen/alpaca-cleaned-fr", split="train")
    count = 0
    for row in ds:
        instruction = (row.get("instruction") or "").strip()
        inp = (row.get("input") or "").strip()
        output = (row.get("output") or "").strip()
        # The outer strip drops the leading newline when only `input` is set.
        user_text = f"{instruction}\n{inp}".strip() if inp else instruction
        if user_text and _assistant_ok(output):
            yield user_text, output, None
            count += 1
            if count >= max_rows:
                break


def _iter_french_alpaca_110k(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield instruction/output pairs from ``jpacifico/French-Alpaca-dataset-Instruct-110K``.

    The dataset is opened in streaming mode. The user text is the
    ``instruction`` field, followed on a new line by the ``input`` field when
    that is non-empty. A row is kept when the user text is non-empty and
    ``output`` passes ``_assistant_ok``.

    Args:
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not open the dataset.

    Yields:
        ``(user_text, assistant_text, topic)`` with ``topic`` always ``None``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset(
        "jpacifico/French-Alpaca-dataset-Instruct-110K", split="train", streaming=True
    )
    count = 0
    for row in ds:
        instruction = (row.get("instruction") or "").strip()
        inp = (row.get("input") or "").strip()
        output = (row.get("output") or "").strip()
        # The outer strip drops the leading newline when only `input` is set.
        user_text = f"{instruction}\n{inp}".strip() if inp else instruction
        if user_text and _assistant_ok(output):
            yield user_text, output, None
            count += 1
            if count >= max_rows:
                break


def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield instruction/output pairs from ``arbml/CIDAR``.

    The user text is the ``instruction`` field, followed on a new line by the
    ``input`` field when that is non-empty. The topic is the first 80
    characters of the instruction (``None`` when the instruction is empty).
    A row is kept when the user text is non-empty and ``output`` passes
    ``_assistant_ok``.

    Args:
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not load the dataset.

    Yields:
        ``(user_text, assistant_text, topic)``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("arbml/CIDAR", split="train")
    count = 0
    for row in ds:
        instruction = (row.get("instruction") or "").strip()
        inp = (row.get("input") or "").strip()
        output = (row.get("output") or "").strip()
        # The outer strip drops the leading newline when only `input` is set.
        user_text = f"{instruction}\n{inp}".strip() if inp else instruction
        # No separate topic field is read here; a prefix of the instruction
        # stands in for one.
        topic = instruction[:80] if instruction else None
        if user_text and _assistant_ok(output):
            yield user_text, output, topic
            count += 1
            if count >= max_rows:
                break


def _iter_instar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
    """Yield instruction/output pairs from ``ClusterlabAi/InstAr-500k``.

    The dataset is opened in streaming mode. Only rows whose ``task`` is in
    ``_INSTAR_GOOD_TASKS`` are considered; of those, a row is kept when
    ``instruction`` is non-empty and ``output`` passes ``_assistant_ok``. The
    topic is the stripped ``topic`` field, or ``None`` when it is empty.

    Args:
        max_rows: Maximum number of pairs to yield; zero or negative yields
            nothing and does not open the dataset.

    Yields:
        ``(user_text, assistant_text, topic)``.
    """
    if max_rows <= 0:
        # The cap below is only checked after a yield, so without this guard a
        # cap of 0 would still emit one row.
        return

    from datasets import load_dataset

    ds = load_dataset("ClusterlabAi/InstAr-500k", split="train", streaming=True)
    count = 0
    for row in ds:
        task = row.get("task") or ""
        if task not in _INSTAR_GOOD_TASKS:
            continue
        instruction = (row.get("instruction") or "").strip()
        output = (row.get("output") or "").strip()
        topic = (row.get("topic") or "").strip() or None
        if instruction and _assistant_ok(output):
            yield instruction, output, topic
            count += 1
            if count >= max_rows:
                break


# Dispatch table: dataset id -> language ("fr" / "ar") -> loader callable.
# Every loader takes the row cap as its single argument and yields
# (user_text, assistant_text, topic) tuples. `_collect_from_source` looks
# loaders up here and skips any (source, language) pair that has no entry.
_SOURCE_LOADERS: dict[str, dict[str, Any]] = {
    "FrancophonIA/english_french": {"fr": _iter_english_french},
    "angeluriot/french_instruct": {"fr": _iter_french_instruct},
    # The Aya loader serves both languages, so each entry binds the language
    # code that `_iter_aya` filters on.
    "CohereLabs/aya_dataset": {
        "fr": lambda n: _iter_aya("fra", n),
        "ar": lambda n: _iter_aya("arb", n),
    },
    "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
    "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": _iter_french_alpaca_110k},
    "arbml/CIDAR": {"ar": _iter_cidar},
    "ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
}


def _collect_from_source(
    source: str,
    language: str,
    max_rows: int,
    rng: random.Random,
) -> list[dict[str, Any]]:
    """Pull up to *max_rows* rows from one source and wrap them as chat rows.

    The loader registered in ``_SOURCE_LOADERS`` for ``(source, language)`` is
    iterated; each yielded triple gets a mode from ``_pick_mode`` and is
    formatted with ``_wrap_row``.

    This is best-effort: if the loader raises part-way through, a warning is
    printed and the rows gathered so far are returned, so one failing source
    does not abort the whole build.

    Args:
        source: Hugging Face dataset id.
        language: ``"fr"`` or ``"ar"``.
        max_rows: Row cap handed to the loader; zero or negative returns an
            empty list without touching the loader or *rng*.
        rng: Random source used for mode selection.

    Returns:
        The wrapped rows (possibly empty).
    """
    if max_rows <= 0:
        # Loaders check their cap only after yielding, so a non-positive cap
        # must be stopped here or it would still produce one row per source.
        print(f"  skip {source} (cap={max_rows})")
        return []
    loader = _SOURCE_LOADERS.get(source, {}).get(language)
    if loader is None:
        print(f"  skip {source} (no loader for {language})")
        return []
    rows: list[dict[str, Any]] = []
    try:
        for user_text, assistant_text, topic in loader(max_rows):
            mode = _pick_mode(rng, topic=topic)
            rows.append(
                _wrap_row(
                    language=language,
                    mode=mode,
                    user_text=user_text,
                    assistant_text=assistant_text,
                    topic=topic,
                )
            )
    except Exception as exc:
        # Deliberately broad: loader failures surface as many different
        # exception types. Report the type too — str() of e.g. a KeyError is
        # just the key — and how many rows were kept.
        print(
            f"  warning: {source} failed for {language} after {len(rows)} rows: "
            f"{type(exc).__name__}: {exc}"
        )
    return rows


def _split_eval(
    rows: list[dict[str, Any]], rng: random.Random
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split *rows* into ``(train, eval)`` with a shuffled holdout.

    With fewer than 20 rows nothing is held out: *rows* is returned as-is with
    an empty eval list. Otherwise a copy is shuffled with *rng* and the first
    ``EVAL_HOLDOUT_RATIO`` share (rounded down, at least one row) becomes the
    eval set; the remainder is the train set. *rows* itself is not modified.
    """
    total = len(rows)
    if total < 20:
        return rows, []

    pool = list(rows)
    rng.shuffle(pool)
    holdout = max(1, int(total * EVAL_HOLDOUT_RATIO))
    eval_rows = pool[:holdout]
    train_rows = pool[holdout:]
    return train_rows, eval_rows


def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines, one object per line.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written as-is, not ``\\u``-escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (f"{json.dumps(row, ensure_ascii=False)}\n" for row in rows)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(serialized)


def build_language_datasets(
    *,
    french_sources: tuple[str, ...],
    arabic_sources: tuple[str, ...],
    max_per_source: int,
    seeds_path: Path,
    skip_hub: bool,
    seed: int,
) -> None:
    """Build the FR and AR train/eval JSONL files next to this script.

    Seed rows are loaded first. Unless *skip_hub* is set, every French source
    and then every Arabic source is fetched, each capped at the smaller of
    *max_per_source* and its ``SOURCE_CAPS`` entry. Each language's rows are
    then split with ``_split_eval`` and the four files are written into
    ``_DATA_DIR``. Progress and a final summary are printed.

    Args:
        french_sources: Dataset ids to pull French rows from.
        arabic_sources: Dataset ids to pull Arabic rows from.
        max_per_source: Upper bound on rows taken from any single source.
        seeds_path: YAML file with hand-written seed rows.
        skip_hub: When true, only the seed rows are used.
        seed: Seed for the RNG shared by mode selection and the eval split.
    """
    rng = random.Random(seed)
    fr_rows, ar_rows = _load_seeds(seeds_path)
    print(f"Loaded {len(fr_rows)} FR + {len(ar_rows)} AR seed rows from {seeds_path.name}")

    if not skip_hub:
        # French is processed before Arabic so the shared RNG is consumed in a
        # fixed order.
        fetch_plan = (
            ("fr", "FR", french_sources, fr_rows),
            ("ar", "AR", arabic_sources, ar_rows),
        )
        for lang, label, sources, bucket in fetch_plan:
            for source in sources:
                source_cap = SOURCE_CAPS.get(source, {}).get(lang, max_per_source)
                cap = min(max_per_source, source_cap)
                print(f"Fetching {label} from {source} (cap={cap})...")
                bucket.extend(_collect_from_source(source, lang, cap, rng))

    fr_train, fr_eval = _split_eval(fr_rows, rng)
    ar_train, ar_eval = _split_eval(ar_rows, rng)

    out_fr = _DATA_DIR / "language-lesson-fr.jsonl"
    out_ar = _DATA_DIR / "language-lesson-ar.jsonl"
    eval_fr = _DATA_DIR / "language-lesson-eval-fr.jsonl"
    eval_ar = _DATA_DIR / "language-lesson-eval-ar.jsonl"

    for target, payload in (
        (out_fr, fr_train),
        (out_ar, ar_train),
        (eval_fr, fr_eval),
        (eval_ar, ar_eval),
    ):
        _write_jsonl(target, payload)

    print(
        f"Wrote FR train={len(fr_train)} eval={len(fr_eval)} -> {out_fr.name}, {eval_fr.name}\n"
        f"Wrote AR train={len(ar_train)} eval={len(ar_eval)} -> {out_ar.name}, {eval_ar.name}"
    )


def main() -> None:
    """Parse command-line options and run ``build_language_datasets``.

    The two ``--*-sources`` options take comma-separated dataset ids;
    surrounding whitespace is trimmed and empty entries are dropped.
    """

    def _split_ids(raw: str) -> tuple[str, ...]:
        """Turn a comma-separated option value into a tuple of non-empty ids."""
        return tuple(part.strip() for part in raw.split(",") if part.strip())

    parser = argparse.ArgumentParser(description=__doc__)
    source_options = (
        (
            "--french-sources",
            DEFAULT_FR_SOURCES,
            "Comma-separated Hugging Face dataset ids for French",
        ),
        (
            "--arabic-sources",
            DEFAULT_AR_SOURCES,
            "Comma-separated Hugging Face dataset ids for Arabic",
        ),
    )
    for flag, default_ids, help_text in source_options:
        parser.add_argument(flag, default=",".join(default_ids), help=help_text)
    parser.add_argument("--max-per-source", type=int, default=5000)
    parser.add_argument(
        "--custom-seeds",
        type=Path,
        default=_DATA_DIR / "language-lesson-seeds.yaml",
    )
    parser.add_argument(
        "--skip-hub",
        action="store_true",
        help="Only write seed rows (offline / smoke)",
    )
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    build_language_datasets(
        french_sources=_split_ids(args.french_sources),
        arabic_sources=_split_ids(args.arabic_sources),
        max_per_source=args.max_per_source,
        seeds_path=args.custom_seeds,
        skip_hub=args.skip_hub,
        seed=args.seed,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()