Spaces:
Running
Running
"""Convert indexed document chunks into a training dataset.

Two output formats:

  • text mode   : plain-LM continued pretraining ({"text": "..."} per row).
                  Best for vocabulary/language acquisition (e.g. Creole).
                  No teacher model required.
  • instruction : chat-style (system/user/assistant) where the user asks
                  for content and the assistant emits the chunk text.
                  Better for domain Q&A; still no teacher needed —
                  the chunk content itself is the target.

The pipeline writes JSONL because that's what trl.SFTTrainer ingests
directly.
"""
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| from typing import Iterable, Literal | |
# Output formats accepted by build_dataset().
DatasetMode = Literal["text", "instruction"]


def build_dataset(
    chunks: Iterable[str],
    mode: DatasetMode = "text",
    system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.",
) -> list[dict]:
    """Turn document chunks into training rows in the requested format.

    Args:
        chunks: Chunk texts. Falsy entries (``None``, ``""``) and
            whitespace-only entries are dropped; every other entry is
            stripped of surrounding whitespace. A bare ``str`` is treated as
            a single chunk instead of being iterated character by character.
        mode: ``"text"`` produces ``{"text": chunk}`` rows. ``"instruction"``
            produces ``{"messages": [...]}`` rows holding a system turn, a
            fixed user turn, and an assistant turn containing the chunk.
        system_prompt: Content of the system turn; only used in
            ``"instruction"`` mode.

    Returns:
        One row per surviving chunk, in input order.

    Raises:
        ValueError: If ``mode`` is neither ``"text"`` nor ``"instruction"``.
    """
    # Reject a bad mode before touching ``chunks`` so that a one-shot iterator
    # is not consumed by a call that is going to fail anyway.
    if mode not in ("text", "instruction"):
        raise ValueError(f"Unknown dataset mode: {mode}")

    # ``str`` is itself an iterable of 1-character strings; without this guard
    # a single passage passed by mistake would yield one row per character.
    if isinstance(chunks, str):
        chunks = [chunks]

    # Strip each chunk once, then discard the ones that end up empty.
    cleaned = [text for text in (c.strip() for c in chunks if c) if text]

    if mode == "text":
        return [{"text": text} for text in cleaned]

    user_prompt = "Continue the passage in the style of the source material."
    return [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": text},
            ]
        }
        for text in cleaned
    ]
def write_jsonl(rows: Iterable[dict], path: str | Path) -> Path:
    """Write ``rows`` to ``path`` as JSON Lines and return the path.

    Each row is serialized with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line, so non-ASCII text is stored verbatim as UTF-8
    rather than as ``\\uXXXX`` escapes.

    Args:
        rows: JSON-serializable mappings, written in iteration order.
        path: Destination file. Missing parent directories are created and
            an existing file is overwritten.

    Returns:
        ``path`` as a ``Path`` object.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="\n" turns off platform newline translation, so the file is
    # byte-identical on every OS instead of gaining "\r\n" on Windows.
    with path.open("w", encoding="utf-8", newline="\n") as f:
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    return path