File size: 6,857 Bytes

a216fa7

"""Prepare GLM-5.1-Reasoning (main subset prefix) for the analytic model.

1. build a text corpus sample and train a 16k BPE SentencePiece tokenizer
   (keeps the V x V PMI cooccurrence + SVD feasible, unlike GPT-2's 50k);
2. tokenize every record as   input + "\n\n" + output   with <eos> between
   documents, into train.bin / valid.bin (uint16).
"""
import os, sys, json, time, numpy as np
import sentencepiece as spm

DATA = os.environ.get("ONESHOT_DATA", "/workspace/ts")
SRC = os.path.join(DATA, "main_prefix.jsonl")
CORPUS = os.path.join(DATA, "glm_corpus.txt")
SPM_PREFIX = os.path.join(DATA, "glm16k")
VOCAB = 16384
HF_DATASET = "Jackrong/GLM-5.1-Reasoning-1M-Cleaned"

def log(*a): print(f"[{time.strftime('%H:%M:%S')}]", *a, flush=True)

def first_present(record, names, default=""):
    for name in names:
        if name in record and record[name] is not None:
            return record[name]
    return default

def normalize_record(record):
    inp = first_present(record, ["input", "prompt", "instruction", "question", "query"])
    out = first_present(record, ["output", "response", "answer", "completion"])
    if isinstance(inp, (list, dict)):
        inp = json.dumps(inp, ensure_ascii=False)
    if isinstance(out, (list, dict)):
        out = json.dumps(out, ensure_ascii=False)
    inp = str(inp).strip()
    out = str(out).strip()
    if not inp or not out:
        return None
    return {"input": inp, "output": out}

def download_jsonl(dataset=HF_DATASET, split="train", subset=None, max_records=0):
    from datasets import load_dataset
    kwargs = {"split": split, "streaming": True}
    ds = load_dataset(dataset, subset, **kwargs) if subset else load_dataset(dataset, **kwargs)
    n = 0
    os.makedirs(DATA, exist_ok=True)
    with open(SRC, "w", encoding="utf-8") as out:
        for row in ds:
            rec = normalize_record(row)
            if rec is None:
                continue
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n += 1
            if n % 10000 == 0:
                log(f"downloaded {n:,} records -> {SRC}")
            if max_records and n >= max_records:
                break
    log(f"download done: {n:,} records -> {SRC}")

def answer_of(r):
    """The actual English response: the <think> reasoning dump is stripped,
    keep the final answer."""
    o = r.get("output", "")
    if "</think>" in o:
        o = o.split("</think>")[-1]
    return o.strip()

def is_english_answer(a):
    """Keep natural-language answers; drop code/math/LaTeX-dominated ones so the
    model learns to answer in plain English (the 'answer English' goal)."""
    if not (40 <= len(a) <= 4000):
        return False
    if "```" in a:                       # code fence
        return False
    alpha = sum(c.isalpha() or c.isspace() for c in a) / len(a)
    if alpha < 0.93:
        return False
    sym = sum(a.count(c) for c in "{}\\$=#|<>_~^")
    if sym / len(a) > 0.02:              # LaTeX / code punctuation density
        return False
    return True

def set_paths(data):
    global DATA, SRC, CORPUS, SPM_PREFIX
    DATA = data
    SRC = os.path.join(DATA, "main_prefix.jsonl")
    CORPUS = os.path.join(DATA, "glm_corpus.txt")
    SPM_PREFIX = os.path.join(DATA, "glm16k")

def build_corpus(max_records=120_000, max_bytes=400_000_000):
    n = 0; b = 0
    with open(SRC, "r", encoding="utf-8", errors="ignore") as f, \
         open(CORPUS, "w", encoding="utf-8") as out:
        for line in f:
            line = line.strip()
            if not line: continue
            try: r = json.loads(line)
            except Exception: continue
            txt = r["input"] + "\n" + r["output"] + "\n"
            out.write(txt); b += len(txt); n += 1
            if n >= max_records or b >= max_bytes: break
    log(f"corpus: {n:,} records, {b/1e6:.1f} MB -> {CORPUS}")

def train_spm():
    spm.SentencePieceTrainer.train(
        input=CORPUS, model_prefix=SPM_PREFIX, vocab_size=VOCAB,
        model_type="bpe", character_coverage=0.9995,
        input_sentence_size=3_000_000, shuffle_input_sentence=True,
        max_sentence_length=100000, num_threads=32,
        unk_id=0, bos_id=1, eos_id=2, pad_id=-1,
        byte_fallback=True,
    )
    log(f"trained SP -> {SPM_PREFIX}.model (vocab={VOCAB})")

def tokenize(val_frac=0.04, english_only=True):
    sp = spm.SentencePieceProcessor(model_file=SPM_PREFIX + ".model")
    eos = sp.eos_id()
    log("scanning + filtering records...")
    docs = []; seen = 0; t0 = time.time()
    with open(SRC, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line: continue
            try: r = json.loads(line)
            except Exception: continue
            seen += 1
            a = answer_of(r)
            if english_only and not is_english_answer(a):
                continue
            docs.append(r["input"].strip() + "\n\n" + a)
    log(f"{seen:,} records -> {len(docs):,} kept "
        f"({100*len(docs)/max(seen,1):.1f}%) english_only={english_only}")
    n_val = int(len(docs) * val_frac)
    splits = {"glm_train.bin": docs[:len(docs) - n_val],
              "glm_valid.bin": docs[len(docs) - n_val:]}
    counts = {}
    for fname, dlist in splits.items():
        nt = 0
        with open(os.path.join(DATA, fname), "wb") as fo:
            for b in range(0, len(dlist), 1000):
                for ids in sp.encode(dlist[b:b + 1000]):
                    arr = np.array(ids + [eos], dtype=np.uint16)
                    arr.tofile(fo); nt += len(arr)
        counts[fname] = nt
    log(f"DONE train={counts['glm_train.bin']:,} tokens, "
        f"valid={counts['glm_valid.bin']:,} tokens ({time.time()-t0:.0f}s)")

if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("cmd", nargs="?", default="all",
                    choices=["download", "corpus", "spm", "tok", "all"])
    ap.add_argument("--data", default=DATA)
    ap.add_argument("--src", default=None)
    ap.add_argument("--vocab", type=int, default=VOCAB)
    ap.add_argument("--dataset", default=HF_DATASET)
    ap.add_argument("--subset", default=None)
    ap.add_argument("--split", default="train")
    ap.add_argument("--max_records", type=int, default=0)
    ap.add_argument("--english_only", type=int, default=1)
    ap.add_argument("--val_frac", type=float, default=0.04)
    args = ap.parse_args()
    set_paths(args.data)
    if args.src:
        SRC = args.src
    VOCAB = args.vocab
    cmd = args.cmd
    if cmd in ("download",):
        download_jsonl(args.dataset, args.split, args.subset, args.max_records)
    if cmd in ("corpus", "all"): build_corpus(max_records=args.max_records or 120_000)
    if cmd in ("spm", "all"): train_spm()
    if cmd in ("tok", "all"): tokenize(args.val_frac, bool(args.english_only))