Spaces:
Running
Running
File size: 1,857 Bytes
"""Convert indexed document chunks into a training dataset.

Two output formats:

  • text mode   : plain-LM continued pretraining ({"text": "..."} per row).
                  Best for vocabulary/language acquisition (e.g. Creole).
                  No teacher model required.
  • instruction : chat-style (system/user/assistant) where the user asks
                  for content and the assistant emits the chunk text.
                  Better for domain Q&A; still no teacher needed —
                  the chunk content itself is the target.

The pipeline writes JSONL because that's what trl.SFTTrainer ingests
directly.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Iterable, Literal
# Output formats accepted by build_dataset: "text" yields {"text": ...} rows,
# "instruction" yields chat-style {"messages": [...]} rows.
DatasetMode = Literal["text", "instruction"]
def build_dataset(
    chunks: Iterable[str],
    mode: DatasetMode = "text",
    system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.",
    user_prompt: str = "Continue the passage in the style of the source material.",
) -> list[dict]:
    """Turn document chunks into training rows for the requested mode.

    Each chunk is stripped of surrounding whitespace; chunks that are falsy
    (``None``, ``""``) or contain only whitespace are dropped.

    Args:
        chunks: Text chunks to convert. May be any iterable, including a
            one-shot generator.
        mode: ``"text"`` produces one ``{"text": chunk}`` row per chunk.
            ``"instruction"`` produces one ``{"messages": [...]}`` row per
            chunk with a system, a user and an assistant message, where the
            assistant content is the chunk itself.
        system_prompt: Content of the system message (instruction mode only).
        user_prompt: Content of the user message (instruction mode only).

    Returns:
        A list of row dicts, in the same order as the surviving chunks.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Validate up front so an unsupported mode fails before the (possibly
    # one-shot) iterable is consumed.
    if mode not in ("text", "instruction"):
        raise ValueError(f"Unknown dataset mode: {mode}")

    # Strip each chunk exactly once, then discard the ones left empty.
    stripped = (c.strip() for c in chunks if c)
    cleaned = [c for c in stripped if c]

    if mode == "text":
        return [{"text": c} for c in cleaned]

    return [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": c},
            ]
        }
        for c in cleaned
    ]
def write_jsonl(rows: list[dict], path: str | Path) -> Path:
    """Write ``rows`` to ``path`` as JSON Lines and return the path.

    Missing parent directories are created. The file is opened in write mode
    with UTF-8 encoding, so an existing file at ``path`` is overwritten. Each
    row becomes one line of JSON; non-ASCII characters are written as-is
    rather than as ``\\uXXXX`` escapes.

    Args:
        rows: Row dicts to serialize, one per output line.
        path: Destination file, as a string or ``Path``.

    Returns:
        The destination as a ``Path``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
    return target
|