Token Classification
Transformers
ONNX
Safetensors
English
Japanese
Chinese
bert
anime
filename-parsing
Eval Results (legacy)
Instructions to use ModerRAS/AniFileBERT with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ModerRAS/AniFileBERT with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="ModerRAS/AniFileBERT")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ModerRAS/AniFileBERT")
model = AutoModelForTokenClassification.from_pretrained("ModerRAS/AniFileBERT")
```

- Notebooks
- Google Colab
- Kaggle
"""Smoke test for the full training pipeline.

Builds the tokenizer and model, checks the parameter budget, writes a small
slice of the synthetic data to a scratch file, loads it through
``AnimeDataset``, and runs one no-grad forward pass. The scratch file is
removed again when the run finishes, whether it passed or failed.
"""
import json
import os
from itertools import islice

import torch

from config import Config
from tokenizer import AnimeTokenizer
from model import create_model, count_parameters
from dataset import AnimeDataset

cfg = Config()

# Load tokenizer
tok = AnimeTokenizer(vocab_file='data/vocab.json')
cfg.vocab_size = tok.vocab_size
print(f'Vocab: {tok.vocab_size}, Labels: {cfg.num_labels}')

# Create model
model = create_model(cfg)
total_params = count_parameters(model)
print(f'Model params: {total_params:,} / 5M limit')
# Raise explicitly (instead of a bare ``assert``) so the budget check is
# still enforced when Python runs with -O, which strips assert statements.
if total_params >= 5_000_000:
    raise AssertionError(f'Model too large: {total_params:,}')

# Load a tiny dataset: only the first 100 lines are read and parsed, so the
# rest of the (potentially large) synthetic file is never touched.
with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in islice(f, 100)]

temp_file = 'data/test_smoke.jsonl'
try:
    with open(temp_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(s, ensure_ascii=False) + '\n' for s in samples
        )

    ds = AnimeDataset(temp_file, tok, cfg.label2id, cfg.max_seq_length)
    print(f'Dataset: {len(ds)} samples')
    # Fail with a clear message rather than an IndexError on ``ds[0]``.
    if len(ds) == 0:
        raise AssertionError('Smoke dataset is empty')

    sample = ds[0]
    print(f'Input IDs shape: {sample["input_ids"].shape}')
    print(f'Labels shape: {sample["labels"].shape}')
    print(f'Attention mask shape: {sample["attention_mask"].shape}')

    # Forward pass on a single sample; unsqueeze(0) adds a leading dimension
    # of size 1 to each tensor before it is passed to the model.
    with torch.no_grad():
        out = model(
            input_ids=sample['input_ids'].unsqueeze(0),
            attention_mask=sample['attention_mask'].unsqueeze(0),
            labels=sample['labels'].unsqueeze(0),
        )
    print(f'Loss: {out.loss.item():.4f}')
    print(f'Logits shape: {out.logits.shape}')
finally:
    # Do not leave the scratch file behind in the data directory.
    if os.path.exists(temp_file):
        os.remove(temp_file)

print()
print('Smoke test PASSED!')
print(f'Model is ready for training: {total_params:,} params < 5M [OK]')