File size: 5,260 Bytes

# -*- coding: utf-8 -*-
"""AniFileBERT — Google Colab Training Script
=============================================

How to use:
  1. Open https://colab.research.google.com/
  2. File → Upload notebook → select this file, OR
     Copy the entire content into a new code cell
  3. Runtime → Change runtime type → T4 GPU
  4. Run all

What it does:
  - Mounts Google Drive (for persistent checkpoints)
  - Clones AniFileBERT repo + AnimeName dataset submodule
  - Installs the extra dependencies (Transformers, Accelerate, seqeval, ONNX tooling); PyTorch itself ships with Colab
  - Runs training: fine-tune from current checkpoint with 8000-token vocab
  - Saves final model to Drive

Output:
  - Checkpoints saved to: MyDrive/AniFileBERT/checkpoints/
  - Final model at:       MyDrive/AniFileBERT/checkpoints/dmhy-finetune/final/
"""

import os
import shlex
import subprocess
import sys
import time


def run(cmd, echo=True):
    """Run a shell command, printing its combined output as it arrives.

    Args:
        cmd: Command line handed to the shell (``shell=True``).
        echo: When true, print the command (prefixed with ``$``) before
            running it.

    Returns:
        The command's exit code. This is always 0, because any other exit
        code raises instead of returning.

    Raises:
        RuntimeError: If the command exits with a non-zero code.
    """
    if echo:
        print(f"\n$ {cmd}")
    # The context manager closes the pipe and waits for the child on every
    # exit path, so no file handle or zombie process is left behind.
    with subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout
        text=True,
        errors="replace",  # an undecodable byte must not abort the streaming
        bufsize=1,  # line-buffered so output shows up line by line
    ) as proc:
        try:
            for line in proc.stdout:
                print(line, end="")
        except BaseException:
            # Interrupted while streaming (e.g. KeyboardInterrupt): stop the
            # shell rather than leaving it running, then propagate.
            proc.kill()
            raise
    if proc.returncode != 0:
        raise RuntimeError(f"Command failed (exit code {proc.returncode}): {cmd}")
    return proc.returncode


# ── 1. Mount Google Drive ──────────────────────────────────────
# Drive is mounted before anything else so the later steps can write their
# output to it.
print("=" * 60, "STEP 1: Mount Google Drive", "=" * 60, sep="\n")
from google.colab import drive

drive.mount("/content/drive")

# Folder on the mounted Drive that receives all training output.
DRIVE_ROOT = "/content/drive/MyDrive/AniFileBERT"
os.makedirs(DRIVE_ROOT, exist_ok=True)
print(f"Checkpoints will be saved to: {DRIVE_ROOT}")

# ── 2. Clone repositories ──────────────────────────────────────
print("\n" + "=" * 60, "STEP 2: Clone AniFileBERT repository", "=" * 60, sep="\n")

# Local checkout; every later step runs with this as the working directory.
REPO_DIR = "/content/AniFileBERT"
if os.path.isdir(REPO_DIR):
    # A checkout already exists: update it and its submodules in place.
    print("Repository already exists, pulling latest...")
    os.chdir(REPO_DIR)
    run("git pull")
    run("git submodule update --init --recursive")
else:
    # No checkout yet: clone from inside /content so the new directory is
    # created at REPO_DIR.
    os.chdir("/content")
    run("git clone --recursive https://huggingface.co/ModerRAS/AniFileBERT")

os.chdir(REPO_DIR)

# ── 3. Install dependencies ────────────────────────────────────
print("\n" + "=" * 60, "STEP 3: Install dependencies", "=" * 60, sep="\n")
# PyTorch and CUDA are already part of the Colab image, so only the extra
# packages are installed here.
run(
    "pip install "
    "transformers accelerate seqeval "
    "onnx onnxruntime onnxscript"
)

# ── 4. Verify GPU ──────────────────────────────────────────────
print("\n" + "=" * 60, "STEP 4: Verify GPU", "=" * 60, sep="\n")
# If nvidia-smi fails, the fallback echo keeps the exit status at zero, so a
# missing GPU only prints a notice instead of aborting the script.
run(
    "nvidia-smi 2>/dev/null "
    "|| echo 'No GPU found — training will be slow on CPU'"
)
# The Python snippet is single-quoted for the shell so the double quotes and
# braces inside it reach the interpreter unchanged.
run(
    "python -c "
    "'import torch; "
    'print(f"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")'
    "'"
)

# ── 5. Verify vocab ────────────────────────────────────────────
print("\n" + "=" * 60)
print("STEP 5: Verify vocabulary")
print("=" * 60)
# Report the size of the vocabulary file that step 6 passes to train.py via
# --vocab-file. The vocab.json in the repository root is presumably the
# vocabulary of the checkpoint being fine-tuned — TODO confirm.
# The Python snippet is single-quoted for the shell so the double quotes and
# braces inside it reach the interpreter unchanged.
run("python -c 'import json; v=json.load(open(\"datasets/AnimeName/vocab.json\")); print(f\"Vocab size: {len(v)} tokens\")'")

# ── 6. Run training ────────────────────────────────────────────
print("\n" + "=" * 60)
print("STEP 6: Train model")
print("=" * 60)

# The 8000-token vocab is already in datasets/AnimeName/vocab.json.
# The old checkpoint (3000-token embedding) gets resized automatically.
# Checkpoints go to Drive so they outlive the Colab runtime.
SAVE_DIR = os.path.join(DRIVE_ROOT, "checkpoints", "dmhy-finetune")

# SAVE_DIR is shell-quoted so the command still parses if DRIVE_ROOT is
# changed to a path containing spaces or other shell metacharacters.
# --init-model-dir "." starts from the checkpoint in the repository root,
# which is the working directory at this point.
run(
    "python train.py "
    "--data-file datasets/AnimeName/dmhy_weak.jsonl "
    "--vocab-file datasets/AnimeName/vocab.json "
    f"--save-dir {shlex.quote(SAVE_DIR)} "
    "--init-model-dir . "
    "--epochs 10 --batch-size 128 "
    "--learning-rate 0.0003 --warmup-steps 300 "
    "--seed 42 "
    "--no-shuffle"
)

# ── 7. Export ONNX (optional) ──────────────────────────────────
print("\n" + "=" * 60)
print("STEP 7: Export ONNX (optional — skip if it fails)")
print("=" * 60)
# The ONNX file is placed next to SAVE_DIR (one level up). The path is
# normalised so it does not carry a literal ".." into the command line and
# the summary.
ONNX_OUT = os.path.normpath(
    os.path.join(SAVE_DIR, "..", "anime_filename_parser.onnx")
)
try:
    # Paths are shell-quoted so they survive spaces or shell metacharacters.
    run(
        "python export_onnx.py "
        f"--model-dir {shlex.quote(os.path.join(SAVE_DIR, 'final'))} "
        f"--output {shlex.quote(ONNX_OUT)}"
    )
except Exception as e:
    # Best-effort step: a failed export only produces a warning and the
    # script continues to the summary.
    onnx_exported = False
    print(f"[WARN] ONNX export skipped: {e}")
else:
    onnx_exported = True

# ── 8. Summary ─────────────────────────────────────────────────
print("\n" + "=" * 60)
print("DONE!")
print("=" * 60)
print(f"\nCheckpoints:  {SAVE_DIR}/")
print(f"Final model:  {SAVE_DIR}/final/")
# Only advertise the ONNX file when the export actually succeeded.
if onnx_exported:
    print(f"ONNX export:  {ONNX_OUT}")
else:
    print("ONNX export:  skipped (export failed, see warning above)")
print("\nAll files are on Google Drive — they persist across Colab sessions.")
print("You can also download them from the Drive web UI.")