# EvoTransformerV11 / knowledge / dataset_builder.py
# github-actions
# Sync from GitHub @ fa87356
# 638084e
"""Convert indexed document chunks into a training dataset.

Two output formats:

  • text mode   : plain-LM continued pretraining ({"text": "..."} per row).
                  Best for vocabulary/language acquisition (e.g. Creole).
                  No teacher model required.
  • instruction : chat-style (system/user/assistant) where the user asks
                  for content and the assistant emits the chunk text.
                  Better for domain Q&A; still no teacher needed —
                  the chunk content itself is the target.

The pipeline writes JSONL because that's what trl.SFTTrainer ingests
directly.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Iterable, Literal
# Closed set of dataset formats accepted by build_dataset's `mode` argument.
DatasetMode = Literal["text", "instruction"]
def build_dataset(
    chunks: Iterable[str],
    mode: DatasetMode = "text",
    system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.",
) -> list[dict]:
    """Turn document chunks into training rows for the requested format.

    Every chunk is stripped of surrounding whitespace; chunks that are falsy
    or contain only whitespace are dropped.

    Args:
        chunks: Chunk texts to convert. The iterable is consumed in full
            before the mode is checked.
        mode: ``"text"`` yields one ``{"text": chunk}`` row per chunk;
            ``"instruction"`` yields one ``{"messages": [...]}`` row per
            chunk holding a system, user and assistant message.
        system_prompt: Content of the system message in instruction mode.
            Unused in text mode.

    Returns:
        One row per surviving chunk, in input order.

    Raises:
        ValueError: If ``mode`` is neither ``"text"`` nor ``"instruction"``.
    """
    cleaned = []
    for raw in chunks:
        if not raw:
            continue
        body = raw.strip()
        if body:
            cleaned.append(body)

    if mode == "text":
        return [{"text": body} for body in cleaned]

    if mode != "instruction":
        raise ValueError(f"Unknown dataset mode: {mode}")

    rows = []
    for body in cleaned:
        # The user turn is a fixed prompt; the chunk itself is the target
        # the assistant is trained to emit.
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "Continue the passage in the style of the source material."},
            {"role": "assistant", "content": body},
        ]
        rows.append({"messages": conversation})
    return rows
def write_jsonl(rows: list[dict], path: str | Path) -> Path:
    """Write ``rows`` to ``path`` as JSON Lines and return the path.

    Missing parent directories are created. Any existing file at ``path``
    is overwritten. Each row becomes one JSON document followed by a
    newline; non-ASCII characters are written as-is (UTF-8), not escaped.

    Args:
        rows: JSON-serializable dictionaries, one per output line.
        path: Destination file, as a string or ``Path``.

    Returns:
        The destination as a ``Path``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as sink:
        # Rows are serialized lazily, one at a time, while the file is open.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
    return target