ModerRAS
/

AniFileBERT

Token Classification

filename-parsing

Eval Results (legacy)

Model card Files Files and versions

AniFileBERT / config.py

ModerRAS's picture

Add AniFileBERT model and training project

be5f706 12 days ago

2.04 kB

	"""
	Configuration parameters for the anime filename parser pipeline.
	All hyperparameters are centralized here for easy tuning.
	"""


	from dataclasses import dataclass, field


	@dataclass
	class Config:
	    """Central configuration dataclass for all pipeline parameters.

	    Every field has a default, so ``Config()`` yields a complete
	    configuration. The two label mappings default to ``None`` and are
	    filled in by ``__post_init__`` so that each instance owns its own
	    dictionaries instead of sharing one mutable default.
	    """

	    # Data
	    synthetic_data_size: int = 100_000
	    train_split: float = 0.9
	    data_file: str = "data/synthetic.jsonl"

	    # Model architecture
	    hidden_size: int = 256
	    num_hidden_layers: int = 4
	    num_attention_heads: int = 8
	    intermediate_size: int = 1024
	    max_position_embeddings: int = 128
	    hidden_dropout_prob: float = 0.1
	    attention_probs_dropout_prob: float = 0.1

	    # Training hyperparameters
	    batch_size: int = 64
	    learning_rate: float = 1e-3
	    num_epochs: int = 8
	    weight_decay: float = 0.01
	    warmup_steps: int = 500

	    # System
	    device: str = "cpu"
	    num_workers: int = 0
	    save_dir: str = "./checkpoints"
	    log_interval: int = 100

	    # Sequence
	    max_seq_length: int = 64

	    # Vocabulary (set dynamically from tokenizer)
	    vocab_size: int = 3000  # placeholder, overridden after tokenizer vocab is built

	    # Special tokens
	    pad_token: str = "[PAD]"
	    unk_token: str = "[UNK]"
	    cls_token: str = "[CLS]"
	    sep_token: str = "[SEP]"

	    # BIO label scheme. The built-in default covers 7 entity types plus "O"
	    # (15 labels in total). None means "fill in during __post_init__".
	    label2id: dict = None
	    id2label: dict = None

	    def __post_init__(self):
	        """Populate whichever label mapping was left as ``None``.

	        * Neither mapping given: install the built-in BIO scheme and its
	          inverse.
	        * Only ``label2id`` given: derive ``id2label`` by inverting it.
	        * Only ``id2label`` given: derive ``label2id`` by inverting it, so
	          the two mappings (and ``num_labels``) describe the same scheme.
	        * Both given: leave them exactly as supplied.
	        """
	        if self.label2id is None:
	            if self.id2label is not None:
	                # Keep the pair consistent with the caller's custom scheme
	                # rather than mixing it with the built-in default.
	                self.label2id = {
	                    label: idx for idx, label in self.id2label.items()
	                }
	            else:
	                self.label2id = {
	                    "O": 0,
	                    "B-TITLE": 1, "I-TITLE": 2,
	                    "B-SEASON": 3, "I-SEASON": 4,
	                    "B-EPISODE": 5, "I-EPISODE": 6,
	                    "B-SPECIAL": 7, "I-SPECIAL": 8,
	                    "B-GROUP": 9, "I-GROUP": 10,
	                    "B-RESOLUTION": 11, "I-RESOLUTION": 12,
	                    "B-SOURCE": 13, "I-SOURCE": 14,
	                }
	        if self.id2label is None:
	            self.id2label = {v: k for k, v in self.label2id.items()}

	    @property
	    def num_labels(self) -> int:
	        """Return the number of distinct labels in ``label2id``."""
	        return len(self.label2id)