Spaces:

HemanM
/

EvoTransformerV11

Running

File size: 1,857 Bytes

638084e

"""Convert indexed document chunks into a training dataset.

Two output formats, selected by ``mode``:
  • "text"         : plain-LM continued pretraining ({"text": "..."} per row).
                     Best for vocabulary/language acquisition (e.g. Creole).
                     No teacher model required.
  • "instruction"  : chat-style (system/user/assistant) where the user turn is
                     a fixed "continue the passage" request and the assistant
                     emits the chunk text.
                     Better for domain Q&A; still no teacher needed —
                     the chunk content itself is the target.

The pipeline writes JSONL because that's what trl.SFTTrainer ingests
directly.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Iterable, Literal

# Values accepted by the ``mode`` argument of build_dataset: "text" produces
# {"text": ...} rows, "instruction" produces {"messages": [...]} rows.
DatasetMode = Literal["text", "instruction"]


def build_dataset(
    chunks: Iterable[str],
    mode: DatasetMode = "text",
    system_prompt: str = "You are EvoLLM, fine-tuned on user-provided documents.",
) -> list[dict]:
    """Turn document chunks into training rows.

    Args:
        chunks: Chunk strings. Falsy entries and entries that are empty
            after stripping are dropped; the rest are stripped of
            surrounding whitespace.
        mode: ``"text"`` yields one ``{"text": chunk}`` row per chunk.
            ``"instruction"`` yields one ``{"messages": [...]}`` row per
            chunk holding a system, a user and an assistant message, with
            the chunk as the assistant content.
        system_prompt: Content of the system message in instruction mode;
            unused in text mode.

    Returns:
        One row per surviving chunk, in input order.

    Raises:
        ValueError: If ``mode`` is neither ``"text"`` nor ``"instruction"``.
    """
    # Validate first so a bad mode fails fast and never drains a one-shot
    # iterator (e.g. a generator) that the caller may still want to use.
    if mode not in ("text", "instruction"):
        raise ValueError(f"Unknown dataset mode: {mode}")

    # Strip each chunk once; skip falsy entries before calling .strip().
    stripped = (c.strip() for c in chunks if c)
    cleaned = [text for text in stripped if text]

    if mode == "text":
        return [{"text": text} for text in cleaned]

    return [
        {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "Continue the passage in the style of the source material."},
                {"role": "assistant", "content": text},
            ]
        }
        for text in cleaned
    ]


def write_jsonl(rows: list[dict], path: str | Path) -> Path:
    """Serialize *rows* to *path* as JSON Lines and return the path.

    Missing parent directories are created first. The file is opened in
    write mode with UTF-8 encoding, so existing content is replaced. Each
    row becomes one newline-terminated JSON document, with non-ASCII
    characters written as-is rather than ``\\u``-escaped.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
    return target