Spaces:

HemanM
/

EvoTransformerV11

Running

github-actions

Sync from GitHub @ fa87356

638084e 5 days ago

1.86 kB

	"""Convert indexed document chunks into a training dataset.

	Two output formats:
	  • text mode   : plain-LM continued pretraining ({"text": "..."} per row).
	                  Best for vocabulary/language acquisition (e.g. Creole).
	                  No teacher model required.
	  • instruction : chat-style (system/user/assistant) where the user asks
	                  for content and the assistant emits the chunk text.
	                  Better for domain Q&A; still no teacher needed —
	                  the chunk content itself is the target.

	The pipeline writes JSONL because that's what trl.SFTTrainer ingests
	directly.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path
	from typing import Iterable, Literal

	# Allowed values for the ``mode`` argument of ``build_dataset``.
	DatasetMode = Literal["text", "instruction"]


	def build_dataset(
	    chunks: Iterable[str],
	    mode: DatasetMode = "text",
	    system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.",
	) -> list[dict]:
	    """Turn raw text chunks into training rows for the requested format.

	    Args:
	        chunks: Text chunks. Falsy and whitespace-only entries are dropped;
	            the remaining ones are stripped of surrounding whitespace.
	        mode: ``"text"`` yields ``{"text": chunk}`` rows. ``"instruction"``
	            yields ``{"messages": [...]}`` rows holding a system, a user and
	            an assistant turn, where the assistant turn is the chunk itself.
	        system_prompt: Content of the system turn (used in instruction mode).

	    Returns:
	        One row per surviving chunk, in input order.

	    Raises:
	        ValueError: If ``mode`` is neither ``"text"`` nor ``"instruction"``.
	    """
	    # Clean the input first; this also materializes one-shot iterables.
	    passages = []
	    for raw in chunks:
	        if not raw:
	            continue
	        trimmed = raw.strip()
	        if trimmed:
	            passages.append(trimmed)

	    if mode == "text":
	        return [{"text": passage} for passage in passages]

	    if mode == "instruction":
	        rows = []
	        for passage in passages:
	            conversation = [
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": "Continue the passage in the style of the source material."},
	                {"role": "assistant", "content": passage},
	            ]
	            rows.append({"messages": conversation})
	        return rows

	    raise ValueError(f"Unknown dataset mode: {mode}")


	def write_jsonl(rows: Iterable[dict], path: str | Path) -> Path:
	    """Write *rows* to *path* as JSON Lines, one JSON object per line.

	    Missing parent directories are created. An existing file is overwritten.
	    Non-ASCII characters are written as-is (UTF-8) rather than ``\\u``-escaped.

	    Args:
	        rows: JSON-serializable dicts; any iterable is accepted.
	        path: Destination file, as a string or ``Path``.

	    Returns:
	        The destination as a ``Path``.
	    """
	    path = Path(path)
	    path.parent.mkdir(parents=True, exist_ok=True)
	    # newline="\n" disables platform newline translation so the file is
	    # byte-identical everywhere (no "\r\n" line endings on Windows).
	    with path.open("w", encoding="utf-8", newline="\n") as f:
	        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
	    return path