"""Convert indexed document chunks into a training dataset. Two output formats: • text mode : plain-LM continued pretraining ({"text": "..."} per row). Best for vocabulary/language acquisition (e.g. Creole). No teacher model required. • instruction : chat-style (system/user/assistant) where the user asks for content and the assistant emits the chunk text. Better for domain Q&A; still no teacher needed — the chunk content itself is the target. The pipeline writes JSONL because that's what trl.SFTTrainer ingests directly. """ from __future__ import annotations import json from pathlib import Path from typing import Iterable, Literal DatasetMode = Literal["text", "instruction"] def build_dataset( chunks: Iterable[str], mode: DatasetMode = "text", system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.", ) -> list[dict]: chunks = [c.strip() for c in chunks if c and c.strip()] if mode == "text": return [{"text": c} for c in chunks] if mode == "instruction": return [ { "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": "Continue the passage in the style of the source material."}, {"role": "assistant", "content": c}, ] } for c in chunks ] raise ValueError(f"Unknown dataset mode: {mode}") def write_jsonl(rows: list[dict], path: str | Path) -> Path: path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) with path.open("w", encoding="utf-8") as f: for row in rows: f.write(json.dumps(row, ensure_ascii=False) + "\n") return path