# NOTE: removed non-Python artifacts left over from the file-hosting page
# header ("pt / v2", "Roman190928's picture", "Create v2", "d187d2d verified").
# train_babygpt_optimized.py
"""
Optimized training script for small GPT on GPU (Tesla T4) with:
- FP16 mixed precision (autocast + GradScaler) when CUDA is available
- pinned memory + non_blocking transfers
- optional dataset pre-tokenization -> tokens.pt
- torch.compile try/except (won't break saving/loading)
- minimal GC, set_to_none=True for zero_grad
- gradient-checkpointing opt-in (disabled by default for speed)
- safer checkpoint saving (saved as float32 on CPU)
- robust autocast context that handles older/newer PyTorch signatures
"""
import os
import io
import time
import math
import gc
import traceback
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
# Enable cuDNN autotuning and TF32 matmul paths: trades bit-exact FP32
# reproducibility for throughput on Ampere+ GPUs (harmless no-ops elsewhere).
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
# CPU thread settings (safe): intra-op pool sized to all cores, inter-op to half.
try:
    ncpu = os.cpu_count() or 2  # os.cpu_count() may return None
    torch.set_num_threads(ncpu)
    torch.set_num_interop_threads(max(1, ncpu // 2))
except Exception:
    # set_num_interop_threads raises if parallel work has already started;
    # this is best-effort tuning only, so swallow the error.
    pass
try:
import sentencepiece as spm
except ImportError:
raise RuntimeError("Please install sentencepiece: pip install sentencepiece")
try:
import matplotlib.pyplot as plt
except Exception:
plt = None
# ---------- Paths / defaults ----------
DATA_PATH = "worldsim.txt"                 # training corpus (plain UTF-8 text)
SP_MODEL_PREFIX = "tokenizer"
SP_MODEL_FILE = f"{SP_MODEL_PREFIX}.model"
SP_VOCAB_DEFAULT = 14000
TOKENS_PT = "tokens.pt"  # pre-tokenized cached file
# Device default: set to 'cuda' to use the GPU; will auto-check availability
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# GPU-optimized defaults (tuned for T4: speed + reasonable model capacity)
DEFAULTS = dict(
    BLOCK_SIZE=512,      # context length; shorter sequence = more steps/sec
    BATCH_SIZE=8,        # bigger batch to saturate GPU (tune down if OOM)
    EMBED_DIM=448,       # embed size (must divide by NUM_HEADS: 448 / 7 = 64)
    NUM_HEADS=7,
    NUM_LAYERS=10,       # fewer layers => much faster
    DROPOUT=0.05,
    EPOCHS=6,
    LR=2e-5,
    PRINT_EVERY=50,      # steps between log/plot yields
    ACCUM_STEPS=1,       # don't accumulate unless needed
    SP_VOCAB=SP_VOCAB_DEFAULT,
    GRADIENT_CHECKPOINT=True,  # NOTE: enabled here (memory over speed), despite the module docstring calling it opt-in
    PRETOKENIZE=True,    # create tokens.pt to avoid tokenization overhead during training
)
# ---------- stop flag helpers ----------
_TRAIN_STOP_REQUESTED = False

def request_stop():
    """Ask the training loop to exit at its next stop-flag check."""
    global _TRAIN_STOP_REQUESTED
    _TRAIN_STOP_REQUESTED = True

def clear_stop_request():
    """Reset the stop flag (called at the start of a training run)."""
    global _TRAIN_STOP_REQUESTED
    _TRAIN_STOP_REQUESTED = False

def stop_requested():
    """Return True if request_stop() has been called since the last clear."""
    return _TRAIN_STOP_REQUESTED
# ---------- sentencepiece helpers ----------
def ensure_sp_model(data_path=DATA_PATH, model_prefix=SP_MODEL_PREFIX, vocab_size=SP_VOCAB_DEFAULT):
if os.path.exists(f"{model_prefix}.model") and os.path.exists(f"{model_prefix}.vocab"):
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")
return sp
# train sentencepiece
spm.SentencePieceTrainer.train(
input=data_path,
model_prefix=model_prefix,
vocab_size=vocab_size,
model_type="bpe",
character_coverage=1.0,
unk_id=0, bos_id=-1, eos_id=-1,
)
sp = spm.SentencePieceProcessor()
sp.load(f"{model_prefix}.model")
return sp
# ---------- model ----------
# ---------- model ----------
class BabyGPT(nn.Module):
    """Minimal GPT-style decoder-only LM built from TransformerEncoderLayers.

    FIX: a causal (upper-triangular) attention mask is now applied in
    forward(). Without it each position could attend to future tokens, so the
    next-token training objective was able to read its own targets.
    """

    def __init__(self, vocab_size, embed_dim, block_size, num_heads, num_layers, dropout, use_checkpoint=False):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(block_size, embed_dim)  # learned absolute positions
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=embed_dim, nhead=num_heads, dim_feedforward=embed_dim * 4,
                dropout=dropout, batch_first=True
            ) for _ in range(num_layers)
        ])
        self.ln = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)
        self._use_gradient_checkpointing = use_checkpoint

    def enable_gradient_checkpointing(self):
        self._use_gradient_checkpointing = True

    def disable_gradient_checkpointing(self):
        self._use_gradient_checkpointing = False

    def forward(self, idx, targets=None):
        """Return (logits, loss); loss is None unless `targets` is given.

        idx / targets: (B, T) long tensors, T <= block_size.
        logits: (B, T, vocab_size).
        """
        B, T = idx.shape
        device = idx.device
        pos = torch.arange(0, T, device=device).unsqueeze(0)
        x = self.tok_emb(idx) + self.pos_emb(pos)
        # Causal mask: True above the diagonal marks disallowed (future) keys.
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=device), diagonal=1
        )
        for layer in self.layers:
            if self._use_gradient_checkpointing and self.training:
                # must pass tensors only; explicit use_reentrant for PyTorch >=2.5 compatibility
                def run_layer(x_local, layer_local=layer, mask_local=causal_mask):
                    return layer_local(x_local, src_mask=mask_local)
                # pass use_reentrant explicitly to avoid PyTorch warning in 2.5+
                x = checkpoint.checkpoint(run_layer, x, use_reentrant=False)
            else:
                x = layer(x, src_mask=causal_mask)
        x = self.ln(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=50, temperature=1.0, top_k=50):
        """Autoregressively sample max_new_tokens ids; returns the extended idx.

        NOTE: top_k must not exceed vocab_size (torch.topk would raise);
        pass top_k=0 to disable top-k filtering.
        """
        self.eval()
        device = next(self.parameters()).device
        idx = idx.to(device)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:].to(device)
            logits, _ = self(idx_cond)
            last_logits = logits[:, -1, :].to(torch.float32)
            # temperature <= 0 would divide by zero; treat it as 1.0
            last_logits = last_logits / (temperature if temperature > 0 else 1.0)
            if top_k > 0:
                v, _ = torch.topk(last_logits, top_k)
                last_logits[last_logits < v[:, [-1]]] = -float("Inf")
            probs = F.softmax(last_logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_id.to(idx.device)), dim=1)
        return idx
# ---------- data streaming & token caching ----------
# ---------- data streaming & token caching ----------
def stream_text_chunks(path=DATA_PATH, chunk_size=128_000):
    """Lazily yield the text file at `path` in pieces of chunk_size characters."""
    with open(path, "r", encoding="utf-8", errors="ignore") as fh:
        # iter(callable, sentinel): stop when read() returns the empty string
        for piece in iter(lambda: fh.read(chunk_size), ""):
            yield piece
def build_or_load_token_cache(sp, path=DATA_PATH, tokens_pt=TOKENS_PT, chunk_size=128_000):
    """
    Return a 1-D LongTensor of token ids for the dataset at `path`.

    Loads the cached tensor from `tokens_pt` when present and readable;
    otherwise tokenizes the file in streaming chunks with `sp` and writes
    the cache so subsequent runs start fast.
    """
    if os.path.exists(tokens_pt):
        try:
            cached = torch.load(tokens_pt, map_location="cpu")
        except Exception:
            cached = None  # corrupt/unreadable cache: fall through and rebuild
        if isinstance(cached, torch.Tensor):
            return cached
    print("Pre-tokenizing dataset to", tokens_pt, " — this may take a while (one-time).")
    ids = []
    for piece in stream_text_chunks(path, chunk_size):
        ids.extend(sp.encode(piece, out_type=int))
    tokens = torch.tensor(ids, dtype=torch.long)
    torch.save(tokens, tokens_pt)
    return tokens
def get_batch_from_ids(ids, block_size, batch_size, device="cpu"):
    """
    Sample a random batch of contiguous (input, target) windows from a token stream.

    ids: 1-D list or 1-D torch.Tensor of token ids.
    Returns (x, y): each a (batch_size, block_size) long tensor on `device`,
    where y is x shifted left by one token. Uses pinned memory + non_blocking
    copies when targeting CUDA.

    Raises ValueError if fewer than block_size + 1 ids are available.
    """
    dev = torch.device(device)
    use_cuda = (dev.type == "cuda")
    # normalize to a CPU long tensor once so window gathering is a single index op
    if isinstance(ids, torch.Tensor):
        ids_t = ids.to(dtype=torch.long, device="cpu")
    else:
        ids_t = torch.tensor(ids, dtype=torch.long)
    L = ids_t.numel()
    if L < block_size + 1:
        raise ValueError("ids too short for block size")
    # Valid start offsets are [0, L - block_size - 1]; randint's high is
    # exclusive, so high = L - block_size. (The previous high of
    # L - block_size - 1 skipped the last valid window and crashed with
    # randint(0, 0) when L == block_size + 1.)
    ix = torch.randint(0, L - block_size, (batch_size,))
    # offsets[b, t] = ix[b] + t  -> gather all windows in one vectorized index
    offsets = ix.unsqueeze(1) + torch.arange(block_size + 1)
    windows = ids_t[offsets]  # (batch_size, block_size + 1)
    x = windows[:, :-1].contiguous()
    y = windows[:, 1:].contiguous()
    if use_cuda:
        # pinned staging buffers allow asynchronous host-to-device copies
        return (x.pin_memory().to(dev, non_blocking=True),
                y.pin_memory().to(dev, non_blocking=True))
    return x.to(dev), y.to(dev)
# ---------- plotting ----------
def plot_loss(history):
if not plt:
return b""
plt.switch_backend("Agg")
fig, ax = plt.subplots(figsize=(5,3))
ax.plot(history)
ax.set_title("Loss")
buf = io.BytesIO()
fig.savefig(buf, format="png")
buf.seek(0)
plt.close(fig)
return buf.read()
# ---------- checkpoint utils ----------
def _unwrap_model_for_saving(model: nn.Module) -> nn.Module:
"""Return underlying module if a compiled wrapper added attributes like _orig_mod."""
if hasattr(model, "_orig_mod"):
return model._orig_mod
# some wrappers use .module
if hasattr(model, "module"):
return model.module
return model
def save_checkpoint(model, cfg, path):
    """Save model weights + config dict to `path`.

    Weights are stored on CPU with floating-point tensors cast to float32 so
    the checkpoint loads on any device regardless of training dtype.
    FIX: integer/bool state tensors (e.g. BatchNorm's num_batches_tracked)
    are kept as-is — the previous unconditional .to(torch.float32) silently
    corrupted them.
    """
    try:
        real_model = _unwrap_model_for_saving(model)
        cpu_state = {}
        for k, v in real_model.state_dict().items():
            t = v.detach().cpu()
            if t.is_floating_point():
                t = t.to(torch.float32)
            cpu_state[k] = t
        torch.save({'model_state_dict': cpu_state, 'config': cfg}, path)
    except Exception:
        # fallback: try a plain save (best effort) before giving up
        try:
            torch.save({'model_state_dict': model.state_dict(), 'config': cfg}, path)
        except Exception as e2:
            print("Failed to save checkpoint:", e2)
            raise
def latest_checkpoint():
    """Return the newest epoch checkpoint filename in the cwd.

    Falls back to "baby_gpt_final.pth" when no epoch checkpoints exist;
    returns None when nothing has been saved yet.
    """
    epoch_ckpts = [
        name for name in os.listdir(".")
        if name.startswith("baby_gpt_epoch") and name.endswith(".pth")
    ]
    if epoch_ckpts:
        # newest by modification time
        return sorted(epoch_ckpts, key=os.path.getmtime)[-1]
    if os.path.exists("baby_gpt_final.pth"):
        return "baby_gpt_final.pth"
    return None
def _strip_orig_mod_prefix(state_dict: dict) -> dict:
    """Strip a leading '_orig_mod.' (torch.compile) or 'module.' (DataParallel)
    prefix from state_dict keys, returning a new dict when stripping applies.

    FIX: str.replace was previously used, which would also mangle any key
    containing the prefix text mid-string; only the leading occurrence is
    removed now.
    """
    keys = list(state_dict.keys())
    if any(k.startswith("_orig_mod.") for k in keys):
        prefix = "_orig_mod."
    elif any(k.startswith("module.") for k in keys) and not any(k.startswith("tok_emb") for k in keys):
        # 'module.' is only stripped when keys don't already look native
        prefix = "module."
    else:
        return state_dict
    n = len(prefix)
    return {(k[n:] if k.startswith(prefix) else k): v for k, v in state_dict.items()}
def load_model_for_inference(checkpoint=None, device=DEVICE):
    """Load a saved BabyGPT checkpoint for generation.

    checkpoint: path to a .pth file; defaults to the newest checkpoint on disk.
    Returns (model, sp, cfg): the eval-mode model on `device`, the
    SentencePiece processor, and the config dict stored in the checkpoint
    (falling back to DEFAULTS when the checkpoint carries no config).
    Raises FileNotFoundError when no checkpoint exists.
    """
    ck = checkpoint or latest_checkpoint()
    if not ck:
        raise FileNotFoundError("No checkpoint.")
    # checkpoints are written as CPU float32 (see save_checkpoint), so map to CPU first
    data = torch.load(ck, map_location="cpu")
    sp = ensure_sp_model(DATA_PATH, SP_MODEL_PREFIX, DEFAULTS["SP_VOCAB"])
    cfg = data.get("config", DEFAULTS)
    model = BabyGPT(sp.get_piece_size(), cfg["EMBED_DIM"], cfg["BLOCK_SIZE"],
                    cfg["NUM_HEADS"], cfg["NUM_LAYERS"], cfg["DROPOUT"],
                    use_checkpoint=cfg.get("GRADIENT_CHECKPOINT", False)).to(device)
    state = data["model_state_dict"]
    state = _strip_orig_mod_prefix(state)
    # try load, try forgiving missing/unexpected keys by strict=False first, then stricter load
    # NOTE(review): strict=False rarely raises (only on shape mismatch), so the
    # strict fallback below is mostly unreachable — consider reversing the order.
    try:
        model.load_state_dict(state, strict=False)
    except Exception as e:
        # final fallback: try to match only exact keys
        try:
            model.load_state_dict(state)
        except Exception as e2:
            raise RuntimeError(f"Failed to load state_dict: {e2}")
    model.eval()
    return model, sp, cfg
# ---------- autocast helper ----------
def autocast_ctx():
"""
Return a context manager for autocast. Robust to PyTorch versions which may accept
different signatures for torch.cuda.amp.autocast.
"""
if not (torch.cuda.is_available() and hasattr(torch.cuda, "amp")):
# dummy context
class _NopCtx:
def __enter__(self): return None
def __exit__(self, exc_type, exc, tb): return False
return _NopCtx()
# try modern signature with dtype
try:
return torch.cuda.amp.autocast(dtype=torch.float16)
except TypeError:
# older signatures may accept no args or device_type
try:
return torch.cuda.amp.autocast()
except Exception:
class _NopCtx:
def __enter__(self): return None
def __exit__(self, exc_type, exc, tb): return False
return _NopCtx()
# ---------- train generator ----------
def train_generator(**params):
"""
Yields (log_text, plot_png_bytes) pairs (same behavior as your Gradio app expects).
Accepts overrides for any DEFAULTS keys by passing them into train_generator(...).
"""
clear_stop_request()
cfg = {**DEFAULTS, **{k: v for k, v in params.items() if v is not None}}
# SentencePiece
sp = ensure_sp_model(DATA_PATH, SP_MODEL_PREFIX, cfg.get("SP_VOCAB", SP_VOCAB_DEFAULT))
# Optional pre-tokenize
tokens_cache = None
if cfg.get("PRETOKENIZE", True):
tokens_cache = build_or_load_token_cache(sp, DATA_PATH, TOKENS_PT, chunk_size=128_000)
# Build model (vocab from sp)
vocab_size = sp.get_piece_size()
model = BabyGPT(vocab_size, cfg["EMBED_DIM"], cfg["BLOCK_SIZE"],
cfg["NUM_HEADS"], cfg["NUM_LAYERS"], cfg["DROPOUT"],
use_checkpoint=cfg.get("GRADIENT_CHECKPOINT", False))
# gradient checkpointing opt-in
if cfg.get("GRADIENT_CHECKPOINT", False):
try:
model.enable_gradient_checkpointing()
print("Gradient checkpointing enabled.")
except Exception as e:
print("Could not enable gradient checkpointing:", e)
# Choose dtype/device strategy
# On CUDA: keep params float32 and use FP16 autocast + GradScaler (T4 prefers FP16).
chosen_dtype = torch.float32
if DEVICE.startswith("cuda") and torch.cuda.is_available():
model = model.to(torch.float32) # keep params float32
chosen_dtype = torch.float32
print("Using float32 params on CUDA. Mixed FP16 autocast will be used during forward.")
else:
# On CPU, try bfloat16 if available (rare)
try:
_ = torch.empty(1, dtype=torch.bfloat16)
model = model.to(dtype=torch.bfloat16)
chosen_dtype = torch.bfloat16
print("Using bfloat16 on CPU.")
except Exception:
model = model.to(dtype=torch.float32)
chosen_dtype = torch.float32
print("Using float32 on CPU.")
# Move model to device
model = model.to(DEVICE)
try:
dev = next(model.parameters()).device
device_info = str(dev)
if "cuda" in device_info:
dev_idx = torch.cuda.current_device()
dev_name = torch.cuda.get_device_name(dev_idx)
mem_alloc = torch.cuda.memory_allocated(dev_idx) / 1024**2
mem_reserved = torch.cuda.memory_reserved(dev_idx) / 1024**2
device_info = f"{device_info} ({dev_name}) alloc={mem_alloc:.1f}MB reserved={mem_reserved:.1f}MB"
print("Model moved to device:", device_info)
except Exception:
pass
# Build optimizer
opt = torch.optim.AdamW(model.parameters(), lr=cfg["LR"])
# CUDA-specific optimizations
use_amp = (DEVICE.startswith("cuda") and torch.cuda.is_available())
scaler = torch.cuda.amp.GradScaler() if use_amp else None
if use_amp:
torch.backends.cudnn.benchmark = True
# optional TF32 speedup
try:
torch.backends.cuda.matmul.allow_tf32 = True
except Exception:
pass
# Try torch.compile for speedups (best effort)
try:
model = torch.compile(model, mode="reduce-overhead")
print("torch.compile applied.")
torch.cuda.synchronize()
t_start = time.time()
last_print_time = t_start
steps_since = 0
except Exception as e:
print("torch.compile unavailable:", e)
# training prep
loss_hist = []
total_params_m = sum(p.numel() for p in model.parameters())/1e6
log = f"🚀 Training started | {total_params_m:.2f}M params | dtype={chosen_dtype}\n"
yield (log, plot_loss(loss_hist))
model.train()
global_step = 0
# timing
t_start = time.time()
last_print_time = t_start
steps_since = 0
# read ids source: prefer pre-tokenized cache
ids_source = tokens_cache if tokens_cache is not None else None
for epoch in range(cfg["EPOCHS"]):
log += f"\n=== Epoch {epoch+1}/{cfg['EPOCHS']} ===\n"
# If we have tokens cache: iterate over chunks of the token stream
if ids_source is not None:
L = ids_source.numel()
chunk_length = cfg["BLOCK_SIZE"] * max(1, cfg["BATCH_SIZE"]) * 2
num_chunks = max(1, L // chunk_length)
for ci in range(num_chunks):
if stop_requested(): return
start = (ci * chunk_length) % max(1, L - chunk_length)
chunk_ids = ids_source[start:start + chunk_length].tolist()
steps = max(1, len(chunk_ids) // (cfg["BLOCK_SIZE"] * max(1, cfg["BATCH_SIZE"])))
for step in range(steps):
if stop_requested(): return
xb, yb = get_batch_from_ids(chunk_ids, cfg["BLOCK_SIZE"], cfg["BATCH_SIZE"], device=DEVICE)
# forward/backward with AMP if available
try:
if use_amp:
with autocast_ctx():
logits, loss = model(xb, yb)
scaler.scale(loss / cfg["ACCUM_STEPS"]).backward()
else:
logits, loss = model(xb, yb)
(loss / cfg["ACCUM_STEPS"]).backward()
except RuntimeError as e:
# handle occasional OOM gracefully by reducing batch
if "out of memory" in str(e).lower():
torch.cuda.empty_cache()
print("OOM during training step - skipping step (reduce BATCH_SIZE).")
continue
else:
raise
if (step + 1) % cfg["ACCUM_STEPS"] == 0:
if use_amp:
try:
scaler.step(opt)
scaler.update()
except Exception as e:
print("Scaler step failed:", e)
opt.step()
else:
opt.step()
opt.zero_grad(set_to_none=True)
loss_hist.append(loss.item())
global_step += 1
steps_since += 1
if step % cfg["PRINT_EVERY"] == 0:
torch.cuda.synchronize()
now = time.time()
elapsed = now - last_print_time
overall_elapsed = now - t_start
sps = steps_since / elapsed if elapsed > 0 else 0.0
avg_sps = global_step / overall_elapsed if overall_elapsed > 0 else 0.0
# device stats
dev_stats = ""
try:
dev = next(model.parameters()).device
if "cuda" in str(dev):
dev_idx = torch.cuda.current_device()
mem_alloc = torch.cuda.memory_allocated(dev_idx) / 1024**2
mem_reserved = torch.cuda.memory_reserved(dev_idx) / 1024**2
dev_name = torch.cuda.get_device_name(dev_idx)
dev_stats = f" | {dev_name} alloc={mem_alloc:.1f}MB reserved={mem_reserved:.1f}MB"
else:
dev_stats = f" | device={dev}"
except Exception:
dev_stats = ""
log += (f"[E{epoch+1}C{ci}] step {step}/{steps} loss={loss.item():.4f} "
f"| steps/s={sps:.2f} (avg {avg_sps:.2f}){dev_stats}\n")
last_print_time = now
steps_since = 0
yield (log, plot_loss(loss_hist))
# free references but avoid overusing gc.collect()
del xb, yb, logits, loss
# end steps loop
# end chunk loop
else:
# tokenization-on-the-fly (slower) - fallback to streaming chunks
for ci, chunk in enumerate(stream_text_chunks(DATA_PATH, chunk_size=128_000)):
ids = sp.encode(chunk, out_type=int)
if len(ids) < cfg["BLOCK_SIZE"]:
continue
steps = min(1000, max(1, len(ids)//(cfg["BLOCK_SIZE"] * max(1, cfg["BATCH_SIZE"]))))
for step in range(steps):
if stop_requested(): return
xb, yb = get_batch_from_ids(ids, cfg["BLOCK_SIZE"], cfg["BATCH_SIZE"], device=DEVICE)
try:
if use_amp:
with autocast_ctx():
logits, loss = model(xb, yb)
scaler.scale(loss / cfg["ACCUM_STEPS"]).backward()
else:
logits, loss = model(xb, yb)
(loss / cfg["ACCUM_STEPS"]).backward()
except RuntimeError as e:
if "out of memory" in str(e).lower():
torch.cuda.empty_cache()
print("OOM during training step - skipping step (reduce BATCH_SIZE).")
continue
else:
raise
if (step + 1) % cfg["ACCUM_STEPS"] == 0:
if use_amp:
try:
scaler.step(opt)
scaler.update()
except Exception as e:
print("Scaler step failed:", e)
opt.step()
else:
opt.step()
opt.zero_grad(set_to_none=True)
loss_hist.append(loss.item())
global_step += 1
steps_since += 1
if step % cfg["PRINT_EVERY"] == 0:
now = time.time()
elapsed = now - last_print_time
overall_elapsed = now - t_start
sps = steps_since / elapsed if elapsed > 0 else 0.0
avg_sps = global_step / overall_elapsed if overall_elapsed > 0 else 0.0
dev_stats = ""
try:
dev = next(model.parameters()).device
if "cuda" in str(dev):
dev_idx = torch.cuda.current_device()
mem_alloc = torch.cuda.memory_allocated(dev_idx) / 1024**2
mem_reserved = torch.cuda.memory_reserved(dev_idx) / 1024**2
dev_name = torch.cuda.get_device_name(dev_idx)
dev_stats = f" | {dev_name} alloc={mem_alloc:.1f}MB reserved={mem_reserved:.1f}MB"
else:
dev_stats = f" | device={dev}"
except Exception:
dev_stats = ""
log += (f"[E{epoch+1}C{ci}] step {step}/{steps} loss={loss.item():.4f} "
f"| steps/s={sps:.2f} (avg {avg_sps:.2f}){dev_stats}\n")
last_print_time = now
steps_since = 0
yield (log, plot_loss(loss_hist))
del xb, yb, logits, loss
# epoch end: save checkpoint
ck = f"baby_gpt_epoch{epoch+1}.pth"
try:
save_checkpoint(model, {**cfg}, ck)
log += f"💾 Saved checkpoint {ck}\n"
except Exception as e:
log += f"❌ Failed saving checkpoint {ck}: {e}\n"
yield (log, plot_loss(loss_hist))
# final save
try:
save_checkpoint(model, {**cfg}, "baby_gpt_final.pth")
log += "\n🎉 Training complete! Saved baby_gpt_final.pth\n"
except Exception as e:
log += f"\n❌ Failed final save: {e}\n"
yield (log, plot_loss(loss_hist))
# ---------- minimal CLI ----------
# ---------- minimal CLI ----------
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=None)
    parser.add_argument("--vocab", type=int, default=None)
    parser.add_argument("--batch", type=int, default=None)
    parser.add_argument("--bs", type=int, default=None)
    parser.add_argument("--layers", type=int, default=None)
    parser.add_argument("--pretok", action="store_true", help="Force pretokenization")
    args = parser.parse_args()

    # Map provided CLI flags onto DEFAULTS override keys.
    # (Note: --batch sets BATCH_SIZE while --bs sets BLOCK_SIZE.)
    overrides = {}
    for value, key in (
        (args.epochs, "EPOCHS"),
        (args.vocab, "SP_VOCAB"),
        (args.batch, "BATCH_SIZE"),
        (args.bs, "BLOCK_SIZE"),
        (args.layers, "NUM_LAYERS"),
    ):
        if value:
            overrides[key] = value
    if args.pretok:
        overrides["PRETOKENIZE"] = True

    gen = train_generator(**overrides)
    try:
        # drain the generator: stream log text to stdout, dump the latest plot
        for text, img in gen:
            os.write(1, text.encode("utf-8"))
            if img:
                with open("latest_loss.png", "wb") as f:
                    f.write(img)
        print("\nDone.")
    except KeyboardInterrupt:
        request_stop()
        print("\nInterrupted and requested stop.")