Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Instructions to use chivehao/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use chivehao/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="chivehao/AniFileBERT")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("chivehao/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("chivehao/AniFileBERT")
```
- Notebooks
- Google Colab
- Kaggle
File size: 1,616 Bytes
f7b1036 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | """Smoke test for the full training pipeline."""
import json
import os
import torch
from config import Config
from tokenizer import AnimeTokenizer
from model import create_model, count_parameters
from dataset import AnimeDataset
# End-to-end smoke test: build the tokenizer and model from the project
# config, check the parameter budget, load a small slice of the synthetic
# dataset, and run a single forward pass with labels to confirm that a loss
# and logits come out.
cfg = Config()

# Load tokenizer and copy its vocabulary size into the config before the
# model is created from that config.
tok = AnimeTokenizer(vocab_file='data/vocab.json')
cfg.vocab_size = tok.vocab_size
print(f'Vocab: {tok.vocab_size}, Labels: {cfg.num_labels}')

# Create model and enforce the 5M-parameter budget.
model = create_model(cfg)
total_params = count_parameters(model)
print(f'Model params: {total_params:,} / 5M limit')
assert total_params < 5_000_000, f'Model too large: {total_params:,}'

# Load a tiny dataset: only the first `max_samples` lines are read and parsed,
# so the cost does not grow with the size of the full synthetic file.
source_file = 'data/synthetic.jsonl'
temp_file = 'data/test_smoke.jsonl'
max_samples = 100
with open(source_file, 'r', encoding='utf-8') as f:
    # range() is the first zip() argument so iteration stops without pulling
    # an extra line from the file once the limit is reached.
    samples = [json.loads(line) for _, line in zip(range(max_samples), f)]
if not samples:
    raise RuntimeError(f'No samples found in {source_file}')

try:
    # AnimeDataset takes a file path, so the slice is written to a scratch
    # file; the finally-clause below removes it again whatever happens.
    with open(temp_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(s, ensure_ascii=False) + '\n' for s in samples
        )

    ds = AnimeDataset(temp_file, tok, cfg.label2id, cfg.max_seq_length)
    print(f'Dataset: {len(ds)} samples')
    sample = ds[0]
    print(f'Input IDs shape: {sample["input_ids"].shape}')
    print(f'Labels shape: {sample["labels"].shape}')
    print(f'Attention mask shape: {sample["attention_mask"].shape}')

    # Forward pass on a single example; unsqueeze(0) adds the leading batch
    # dimension. torch.no_grad() only disables gradient tracking, so the
    # model is switched to eval mode as well to keep the printed loss
    # deterministic.
    # NOTE(review): assumes create_model() returns a torch.nn.Module — confirm.
    model.eval()
    with torch.no_grad():
        out = model(
            input_ids=sample['input_ids'].unsqueeze(0),
            attention_mask=sample['attention_mask'].unsqueeze(0),
            labels=sample['labels'].unsqueeze(0),
        )
    print(f'Loss: {out.loss.item():.4f}')
    print(f'Logits shape: {out.logits.shape}')
finally:
    # Do not leave the scratch dataset behind in data/.
    try:
        os.remove(temp_file)
    except FileNotFoundError:
        pass

print()
print('Smoke test PASSED!')
print(f'Model is ready for training: {total_params:,} params < 5M [OK]')
|