AniFileBERT / colab_train.py

colab: onnx导出改为非阻塞+补全onnxscript

e34dc04 13 days ago

5.26 kB

	# -*- coding: utf-8 -*-
	"""AniFileBERT — Google Colab Training Script
	=============================================

	How to use:
	1. Open https://colab.research.google.com/
	2. File → Upload notebook → select this file, OR
	Copy the entire content into a new code cell
	3. Runtime → Change runtime type → T4 GPU
	4. Run all

	What it does:
	- Mounts Google Drive (for persistent checkpoints)
	- Clones AniFileBERT repo + AnimeName dataset submodule
	- Installs PyTorch + Transformers dependencies
	- Runs training: fine-tune from current checkpoint with 8000-token vocab
	- Saves final model to Drive

	Output:
	- Checkpoints saved to: MyDrive/AniFileBERT/checkpoints/
	- Final model at: MyDrive/AniFileBERT/checkpoints/dmhy-finetune/final/
	"""

	import os
	import sys
	import subprocess
	import time


	def run(cmd: str, echo: bool = True) -> int:
	    """Run a shell command, printing its combined output as it arrives.

	    The command is executed through the shell with stderr merged into
	    stdout, and every output line is printed as soon as it is read.

	    Args:
	        cmd: Shell command line to execute.
	        echo: When true, print the command (prefixed with ``$``) first.

	    Returns:
	        The process exit code; always 0, because failures raise instead.

	    Raises:
	        RuntimeError: If the command exits with a non-zero status.
	    """
	    if echo:
	        print(f"\n$ {cmd}")
	    # The context manager closes the pipe and waits for the child even if
	    # reading or printing is interrupted, so no descriptor or zombie leaks.
	    with subprocess.Popen(
	        cmd,
	        shell=True,
	        stdout=subprocess.PIPE,
	        stderr=subprocess.STDOUT,
	        text=True,
	        bufsize=1,  # line-buffered so output shows up in real time
	    ) as proc:
	        for line in proc.stdout:
	            print(line, end="")
	    # Leaving the with-block has waited for the process, so returncode is set.
	    if proc.returncode != 0:
	        raise RuntimeError(f"Command failed (exit code {proc.returncode}): {cmd}")
	    return proc.returncode


	# ── 1. Mount Google Drive ──────────────────────────────────────
	# Drive is mounted before anything else because the checkpoint directory
	# created below lives inside it.
	print("=" * 60, "STEP 1: Mount Google Drive", "=" * 60, sep="\n")
	from google.colab import drive  # Colab-provided module
	drive.mount("/content/drive")

	# Root folder on Drive under which this script stores its outputs.
	DRIVE_ROOT = "/content/drive/MyDrive/AniFileBERT"
	os.makedirs(DRIVE_ROOT, exist_ok=True)
	print(f"Checkpoints will be saved to: {DRIVE_ROOT}")

	# ── 2. Clone repositories ──────────────────────────────────────
	print("", "=" * 60, "STEP 2: Clone AniFileBERT repository", "=" * 60, sep="\n")

	REPO_DIR = "/content/AniFileBERT"
	if os.path.isdir(REPO_DIR):
	    # A checkout from an earlier run exists: refresh it and its submodules.
	    print("Repository already exists, pulling latest...")
	    os.chdir(REPO_DIR)
	    run("git pull")
	    run("git submodule update --init --recursive")
	else:
	    # No checkout yet: clone into /content; --recursive also fetches submodules.
	    os.chdir("/content")
	    run("git clone --recursive https://huggingface.co/ModerRAS/AniFileBERT")

	# The commands that follow use paths relative to the repository root.
	os.chdir(REPO_DIR)

	# ── 3. Install dependencies ────────────────────────────────────
	print("", "=" * 60, "STEP 3: Install dependencies", "=" * 60, sep="\n")
	# PyTorch with CUDA already ships with Colab, so only the additional
	# packages are installed here.
	pip_extras = [
	    "transformers",
	    "accelerate",
	    "seqeval",
	    "onnx",
	    "onnxruntime",
	    "onnxscript",
	]
	run("pip install " + " ".join(pip_extras))

	# ── 4. Verify GPU ──────────────────────────────────────────────
	print("\n" + "=" * 60)
	print("STEP 4: Verify GPU")
	print("=" * 60)
	# `||` is the shell OR operator: when nvidia-smi is missing or fails, the
	# echo runs instead, so a CPU-only runtime prints a warning rather than
	# making run() raise. (It must not be backslash-escaped, otherwise the
	# shell passes literal "|" arguments to nvidia-smi.)
	run("nvidia-smi 2>/dev/null || echo 'No GPU found — training will be slow on CPU'")
	# The -c program is wrapped in single quotes so the shell hands it to
	# Python verbatim; the inner double quotes delimit the f-string.
	run("python -c 'import torch; print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")'")

	# ── 5. Verify vocab ────────────────────────────────────────────
	print("", "=" * 60, "STEP 5: Verify vocabulary", "=" * 60, sep="\n")
	# Loads vocab.json from the current directory (the repository root) in a
	# child interpreter and prints how many entries it contains.
	# NOTE(review): the training step passes datasets/AnimeName/vocab.json —
	# confirm the root-level vocab.json is the file meant to be checked here.
	vocab_check = 'import json; v=json.load(open("vocab.json")); print(f"Vocab size: {len(v)} tokens")'
	run(f"python -c '{vocab_check}'")

	# ── 6. Run training ────────────────────────────────────────────
	print("", "=" * 60, "STEP 6: Train model", "=" * 60, sep="\n")

	# Checkpoints are written under the Drive folder set up in step 1.
	SAVE_DIR = os.path.join(DRIVE_ROOT, "checkpoints", "dmhy-finetune")

	# datasets/AnimeName/vocab.json already holds the 8000-token vocab; the
	# older checkpoint's 3000-token embedding is expected to be resized
	# automatically when training starts from it.
	train_command = [
	    "python train.py",
	    "--data-file datasets/AnimeName/dmhy_weak.jsonl",
	    "--vocab-file datasets/AnimeName/vocab.json",
	    f"--save-dir {SAVE_DIR}",
	    "--init-model-dir .",  # start from the checkpoint in the repo root
	    "--epochs 10 --batch-size 128",
	    "--learning-rate 0.0003 --warmup-steps 300",
	    "--seed 42",
	    "--no-shuffle",
	]
	run(" ".join(train_command))

	# ── 7. Export ONNX (optional) ──────────────────────────────────
	print("\n" + "=" * 60)
	print("STEP 7: Export ONNX (optional — skip if it fails)")
	print("=" * 60)
	# The ONNX file is placed one level above SAVE_DIR, next to the run folder.
	ONNX_OUT = os.path.join(SAVE_DIR, "..", "anime_filename_parser.onnx")
	# Export is best-effort: training has already finished at this point, so a
	# failed export is reported as a warning instead of aborting the script.
	try:
	    run(
	        f"python export_onnx.py "
	        f"--model-dir {SAVE_DIR}/final "
	        f"--output {ONNX_OUT}"
	    )
	except Exception as e:
	    onnx_exported = False
	    print(f"[WARN] ONNX export skipped: {e}")
	else:
	    onnx_exported = True

	# ── 8. Summary ─────────────────────────────────────────────────
	print("\n" + "=" * 60)
	print("DONE!")
	print("=" * 60)
	print(f"\nCheckpoints: {SAVE_DIR}/")
	print(f"Final model: {SAVE_DIR}/final/")
	# Only advertise the ONNX file when the export actually succeeded, so the
	# summary never points at a file that was not written.
	if onnx_exported:
	    print(f"ONNX export: {ONNX_OUT}")
	else:
	    print("ONNX export: skipped (export failed — see warning above)")
	print("\nAll files are on Google Drive — they persist across Colab sessions.")
	print("You can also download them from the Drive web UI.")