Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
- Notebooks
- Google Colab
- Kaggle
File size: 5,260 Bytes
410e000 e34dc04 410e000 4df510c 410e000 4df510c 410e000 e34dc04 410e000 e34dc04 410e000 e34dc04 410e000 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | # -*- coding: utf-8 -*-
"""AniFileBERT — Google Colab Training Script
=============================================
How to use:
1. Open https://colab.research.google.com/
2. File → Upload notebook → select this file, OR
   Copy the entire content into a new code cell
3. Runtime → Change runtime type → T4 GPU
4. Run all
What it does:
- Mounts Google Drive (for persistent checkpoints)
- Clones AniFileBERT repo + AnimeName dataset submodule
- Installs PyTorch + Transformers dependencies
- Runs training: fine-tune from current checkpoint with 8000-token vocab
- Saves final model to Drive
Output:
- Checkpoints saved to: MyDrive/AniFileBERT/checkpoints/
- Final model at: MyDrive/AniFileBERT/checkpoints/dmhy-finetune/final/
"""
import os
import shlex
import subprocess
import sys
import time
def run(cmd: str, echo: bool = True) -> int:
    """Run a shell command and print its output in real time.

    Args:
        cmd: Command line handed to the shell (``shell=True``).
        echo: When true, print the command prefixed with ``$`` before
            running it.

    Returns:
        The exit status, which is always 0 because any other status raises.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    if echo:
        print(f"\n$ {cmd}")
    # The context manager closes the stdout pipe and waits for the child even
    # when the print loop is interrupted, so no pipe or zombie is left behind.
    with subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout
        text=True,
        errors="replace",  # undecodable output bytes must not abort the run
        bufsize=1,  # line-buffered so lines are forwarded as they arrive
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
    if proc.returncode != 0:
        raise RuntimeError(f"Command failed (exit code {proc.returncode}): {cmd}")
    return proc.returncode
# ── 1. Mount Google Drive ──────────────────────────────────────
print("=" * 60, "STEP 1: Mount Google Drive", "=" * 60, sep="\n")

# Only importable inside a Colab runtime, hence the import at point of use.
from google.colab import drive

drive.mount("/content/drive")

# Checkpoints are written under this Drive folder; create it on first use.
DRIVE_ROOT = "/content/drive/MyDrive/AniFileBERT"
os.makedirs(DRIVE_ROOT, exist_ok=True)
print("Checkpoints will be saved to: " + DRIVE_ROOT)
# ── 2. Clone repositories ──────────────────────────────────────
print("\n" + "=" * 60, "STEP 2: Clone AniFileBERT repository", "=" * 60, sep="\n")

REPO_DIR = "/content/AniFileBERT"

if os.path.isdir(REPO_DIR):
    # A checkout already exists: refresh it and its submodules in place.
    print("Repository already exists, pulling latest...")
    os.chdir(REPO_DIR)
    run("git pull")
    run("git submodule update --init --recursive")
else:
    # No checkout yet: clone from /content so the repo lands at REPO_DIR.
    os.chdir("/content")
    run("git clone --recursive https://huggingface.co/ModerRAS/AniFileBERT")

# Later steps use paths relative to the repository root.
os.chdir(REPO_DIR)
# ── 3. Install dependencies ────────────────────────────────────
print("\n" + "=" * 60, "STEP 3: Install dependencies", "=" * 60, sep="\n")

# Colab comes with PyTorch + CUDA pre-installed. Just install the extras.
run(
    "pip install "
    + " ".join(
        [
            "transformers",
            "accelerate",
            "seqeval",
            "onnx",
            "onnxruntime",
            "onnxscript",
        ]
    )
)
# ── 4. Verify GPU ──────────────────────────────────────────────
print("\n" + "=" * 60)
print("STEP 4: Verify GPU")
print("=" * 60)

# `|| echo` keeps the step from failing when nvidia-smi is missing or errors;
# the script then only warns that training will run on CPU.
run("nvidia-smi 2>/dev/null || echo 'No GPU found — training will be slow on CPU'")
# Single-quote the shell command to avoid bash expanding {torch...}
run("python -c 'import torch; print(f\"PyTorch {torch.__version__}, CUDA available: {torch.cuda.is_available()}\")'")
# ── 5. Verify vocab ────────────────────────────────────────────
print("\n" + "=" * 60, "STEP 5: Verify vocabulary", "=" * 60, sep="\n")

# NOTE(review): this reads vocab.json from the current directory, while the
# training step passes datasets/AnimeName/vocab.json — confirm which file
# this check is meant to cover.
run(
    "python -c '"
    "import json; "
    "v=json.load(open(\"vocab.json\")); "
    "print(f\"Vocab size: {len(v)} tokens\")"
    "'"
)
# ── 6. Run training ────────────────────────────────────────────
print("\n" + "=" * 60)
print("STEP 6: Train model")
print("=" * 60)

# The 8000-token vocab is already in datasets/AnimeName/vocab.json.
# The old checkpoint (3000-token embedding) gets resized automatically.
SAVE_DIR = os.path.join(DRIVE_ROOT, "checkpoints", "dmhy-finetune")

# SAVE_DIR is spliced into a shell command line, so it is shell-quoted: a
# Drive path containing spaces or shell metacharacters would otherwise be
# split into several arguments. For the default path the command is unchanged.
run(
    "python train.py "
    "--data-file datasets/AnimeName/dmhy_weak.jsonl "
    "--vocab-file datasets/AnimeName/vocab.json "
    f"--save-dir {shlex.quote(SAVE_DIR)} "
    "--init-model-dir . "  # start from the checkpoint in the repo root (cwd)
    "--epochs 10 --batch-size 128 "
    "--learning-rate 0.0003 --warmup-steps 300 "
    "--seed 42 "
    "--no-shuffle"
)
# ── 7. Export ONNX (optional) ──────────────────────────────────
print("\n" + "=" * 60)
print("STEP 7: Export ONNX (optional — skip if it fails)")
print("=" * 60)

# The export goes next to SAVE_DIR. dirname() names the same location as
# joining "..", but leaves no ".." component in the path that gets printed.
ONNX_OUT = os.path.join(os.path.dirname(SAVE_DIR), "anime_filename_parser.onnx")
final_model_dir = os.path.join(SAVE_DIR, "final")
try:
    # Both paths are shell-quoted because they are spliced into a shell
    # command line; for the default Drive layout the command is unchanged.
    run(
        "python export_onnx.py "
        f"--model-dir {shlex.quote(final_model_dir)} "
        f"--output {shlex.quote(ONNX_OUT)}"
    )
except Exception as e:
    # Deliberately best-effort: a failed export only warns, so the summary
    # for the finished training run is still printed.
    print(f"[WARN] ONNX export skipped: {e}")
# ── 8. Summary ─────────────────────────────────────────────────
print("\n" + "=" * 60)
print("DONE!")
print("=" * 60)

print(f"\nCheckpoints: {SAVE_DIR}/")
print(f"Final model: {SAVE_DIR}/final/")
# The ONNX export is allowed to fail, so only report the file if it exists.
if os.path.isfile(ONNX_OUT):
    print(f"ONNX export: {ONNX_OUT}")
else:
    print(f"ONNX export: not created (expected at {ONNX_OUT})")
print("\nAll files are on Google Drive — they persist across Colab sessions.")
print("You can also download them from the Drive web UI.")
|