AniFileBERT / config.py
ModerRAS's picture
Add AniFileBERT model and training project
be5f706
raw
history blame
2.04 kB
"""
Configuration parameters for the anime filename parser pipeline.
All hyperparameters are centralized here for easy tuning.
"""
from dataclasses import dataclass, field
from typing import Dict, Optional
@dataclass
class Config:
    """Central configuration dataclass for all pipeline parameters.

    Every field has a default, so ``Config()`` yields a complete
    configuration. The two label maps default to ``None`` and are filled
    in by ``__post_init__``; this gives each instance its own dict objects
    instead of sharing one mutable default across instances.
    """

    # Data
    synthetic_data_size: int = 100_000
    train_split: float = 0.9
    data_file: str = "data/synthetic.jsonl"

    # Model architecture
    hidden_size: int = 256
    num_hidden_layers: int = 4
    num_attention_heads: int = 8
    intermediate_size: int = 1024
    max_position_embeddings: int = 128
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1

    # Training hyperparameters
    batch_size: int = 64
    learning_rate: float = 1e-3
    num_epochs: int = 8
    weight_decay: float = 0.01
    warmup_steps: int = 500

    # System
    device: str = "cpu"
    num_workers: int = 0
    save_dir: str = "./checkpoints"
    log_interval: int = 100

    # Sequence
    max_seq_length: int = 64

    # Vocabulary (set dynamically from tokenizer)
    vocab_size: int = 3000  # placeholder, overridden after tokenizer vocab is built

    # Special tokens
    pad_token: str = "[PAD]"
    unk_token: str = "[UNK]"
    cls_token: str = "[CLS]"
    sep_token: str = "[SEP]"

    # BIO label scheme: 7 entity types (B-/I- tag each) + "O" = 15 labels.
    # None means "use the defaults built in __post_init__".
    label2id: Optional[Dict[str, int]] = None
    id2label: Optional[Dict[int, str]] = None

    def __post_init__(self) -> None:
        """Populate the label maps that the caller left as ``None``.

        ``label2id`` falls back to the built-in BIO mapping. ``id2label``
        falls back to the inverse of whichever ``label2id`` is in effect,
        so a caller-supplied ``label2id`` is inverted automatically.
        """
        if self.label2id is None:
            self.label2id = {
                "O": 0,
                "B-TITLE": 1, "I-TITLE": 2,
                "B-SEASON": 3, "I-SEASON": 4,
                "B-EPISODE": 5, "I-EPISODE": 6,
                "B-SPECIAL": 7, "I-SPECIAL": 8,
                "B-GROUP": 9, "I-GROUP": 10,
                "B-RESOLUTION": 11, "I-RESOLUTION": 12,
                "B-SOURCE": 13, "I-SOURCE": 14,
            }
        if self.id2label is None:
            # Invert the (possibly caller-supplied) label -> id mapping.
            self.id2label = {v: k for k, v in self.label2id.items()}

    @property
    def num_labels(self) -> int:
        """Return the number of entries in ``label2id``."""
        return len(self.label2id)