sage / data /bootstrap.py
sage002's picture
feat: add authenticated remote control UI and ngrok launcher
1e799aa verified
"""Bootstrap small raw corpora for tokenizer and smoke-training flows."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
# Seed texts for the bootstrap corpora, keyed by source name.
# bootstrap_raw_corpora() writes one ``<source_name>.jsonl`` file per key and
# one JSON record per list entry. Each source currently holds six samples.
BOOTSTRAP_CORPORA: dict[str, list[str]] = {
    # Plain English prose about training, evaluating, and serving language models.
    "general_web": [
        "Large language models learn by predicting the next token in a sequence, but useful systems depend just as much on data quality as on architecture size.",
        "A good training corpus mixes clean prose, documentation, dialogue, and reference material so the model sees multiple ways humans structure information.",
        "When you build a local model, start with small smoke runs, measure loss curves, and only then scale sequence length, batch size, and parameter count.",
        "The fastest way to waste compute is to train on noisy duplicated text without checking tokenization, filtering, and validation splits first.",
        "Evaluation should include both regression tests and qualitative prompts because perplexity alone does not tell you whether a model follows instructions well.",
        "A serving stack usually needs checkpoint loading, tokenization, generation settings, and telemetry before it is practical for iterative experiments.",
    ],
    # Short Python snippets stored as single strings with escaped newlines.
    "code": [
        "def running_mean(values):\n total = 0.0\n result = []\n for index, value in enumerate(values, start=1):\n total += value\n result.append(total / index)\n return result",
        "class TextBatch:\n def __init__(self, items):\n self.items = list(items)\n\n def join(self, sep='\\n'):\n return sep.join(self.items)",
        "from pathlib import Path\n\ndef read_text(path):\n return Path(path).read_text(encoding='utf-8')",
        "def clamp(value, lo, hi):\n if value < lo:\n return lo\n if value > hi:\n return hi\n return value",
        "def format_metrics(step, loss):\n return f'step={step} loss={loss:.4f}'",
        "def greedy_decode(logits):\n import torch\n return int(torch.argmax(logits, dim=-1).item())",
    ],
    # One-sentence statements about calculus, probability, and transformer math.
    "math_science": [
        "The derivative of x squared is 2x, and gradient-based optimization uses derivatives to decide how to update model parameters.",
        "Perplexity is the exponential of average negative log likelihood; lower perplexity means the model assigns higher probability to the observed sequence.",
        "If a batch contains B sequences of length T, then the number of next-token predictions is roughly B times T.",
        "Matrix multiplication is central to transformer inference because projections for queries, keys, values, and feed-forward layers are all linear maps.",
        "Softmax converts raw logits into a probability distribution by exponentiating each value and dividing by the sum of exponentials.",
        "The context window bounds how many previous tokens the decoder can attend to while producing the next token.",
    ],
    # One sentence per language, each prefixed with the language name in English.
    # NOTE(review): the Spanish and French samples are written without
    # diacritics (e.g. "deduplicacion", "modele"); confirm that is intentional.
    "multilingual": [
        "English: Training data should be filtered, deduplicated, and documented before long runs begin.",
        "Hindi: अच्छे मॉडल के लिए साफ और विविध डेटा उतना ही जरूरी है जितना अच्छा आर्किटेक्चर।",
        "Arabic: جودة البيانات تؤثر على جودة النموذج بقدر تأثير حجم النموذج نفسه.",
        "Chinese: 在开始长时间训练之前,先做小规模验证可以节省大量计算资源。",
        "Spanish: Un buen flujo de datos incluye limpieza, deduplicacion y particiones reproducibles.",
        "French: Un modele utile demande des donnees propres, des tests et une boucle d'evaluation simple.",
    ],
    # Instruction/response pairs wrapped in [INST] ... [/INST] markers.
    "synthetic": [
        "[INST] Explain why deduplication matters before tokenizer training. [/INST] Deduplication prevents repeated passages from dominating merge statistics and reduces wasted compute during later model training.",
        "[INST] Write a short checklist for a smoke training run. [/INST] Verify shards exist, verify tokenizer loads, run a short job, inspect metrics, and confirm checkpoints are written.",
        "[INST] How do you know a dataset is too noisy? [/INST] Look for low alpha ratios, malformed markup, repeated content, excessive boilerplate, or corrupted encoding.",
        "[INST] What is the purpose of a validation split? [/INST] It gives you held-out data for tracking generalization and for catching regressions during training.",
        "[INST] Summarize the role of the tokenizer. [/INST] The tokenizer maps raw text into stable token ids the model can consume during training and generation.",
        "[INST] Why keep metadata with each record? [/INST] Metadata helps audit provenance, quality, language mix, and filtering decisions across the pipeline.",
    ],
}
def _pad_sample(text: str, minimum_chars: int = 240) -> str:
"""Extend short bootstrap samples so they survive the default filters."""
trailer = (
" This bootstrap record is intentionally longer so the repo's default "
"quality filters keep it during smoke-test data preparation and tokenizer training."
)
padded = text.strip()
while len(padded) < minimum_chars:
padded += trailer
return padded
def bootstrap_raw_corpora(
    output_dir: str = "data/raw",
    overwrite: bool = False,
    *,
    corpora: dict[str, list[str]] | None = None,
) -> dict[str, int]:
    """Write one small JSONL corpus file per registered source.

    Args:
        output_dir: Directory that receives the ``<source_name>.jsonl`` files.
            It is created (including parents) when it does not exist.
        overwrite: When false, a file that already exists is left untouched
            and its current line count is reported instead.
        corpora: Optional mapping of source name to sample texts. When omitted,
            the module-level ``BOOTSTRAP_CORPORA`` is used.

    Returns:
        A mapping of source name to a count: the number of samples written for
        files produced by this call, or the number of lines found in a file
        that was kept as-is.
    """
    if corpora is None:
        corpora = BOOTSTRAP_CORPORA
    root = Path(output_dir)
    root.mkdir(parents=True, exist_ok=True)
    counts: dict[str, int] = {}
    for source_name, samples in corpora.items():
        path = root / f"{source_name}.jsonl"
        if path.exists() and not overwrite:
            # Count lines inside a context manager so the read handle is
            # closed right away instead of lingering until garbage collection.
            with path.open("r", encoding="utf-8") as handle:
                counts[source_name] = sum(1 for _ in handle)
            continue
        with path.open("w", encoding="utf-8") as handle:
            for index, text in enumerate(samples, start=1):
                payload = {
                    "id": f"{source_name}-{index:04d}",
                    "text": _pad_sample(text),
                    "source_name": source_name,
                }
                # ensure_ascii=False keeps non-ASCII samples readable on disk.
                handle.write(json.dumps(payload, ensure_ascii=False) + "\n")
        counts[source_name] = len(samples)
    return counts
def build_argparser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the corpus bootstrap script.

    The parser accepts ``--output-dir`` (defaults to ``data/raw``) and the
    boolean switch ``--overwrite`` (off unless given).
    """
    cli = argparse.ArgumentParser(
        description="Create small JSONL corpora for SAGE smoke runs.",
    )
    # Each entry pairs a flag with the keyword arguments for add_argument.
    option_specs = (
        (
            "--output-dir",
            {"default": "data/raw", "help": "Directory for raw JSONL corpus files."},
        ),
        (
            "--overwrite",
            {"action": "store_true", "help": "Replace any existing bootstrap corpus files."},
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
def main() -> None:
    """Run the bootstrap from the command line and print a JSON summary.

    Parses the process arguments, writes the corpora, and prints an object
    with the chosen output directory and the per-source counts.
    """
    cli_args = build_argparser().parse_args()
    per_source = bootstrap_raw_corpora(
        output_dir=cli_args.output_dir,
        overwrite=cli_args.overwrite,
    )
    report = {"output_dir": cli_args.output_dir, "sources": per_source}
    print(json.dumps(report, indent=2, ensure_ascii=False))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()