# AniFileBERT / anifilebert/config.py
# ModerRAS: Organize parser modules and tools (8c50d16)
"""
Configuration parameters for the anime filename parser pipeline.
All hyperparameters are centralized here for easy tuning.
"""
from dataclasses import dataclass, field
from typing import Dict, Optional
@dataclass
class Config:
    """Central configuration dataclass for all pipeline parameters.

    Every field has a default, so ``Config()`` yields a usable configuration
    and individual values can be overridden by keyword.

    ``label2id`` and ``id2label`` default to ``None`` as a sentinel;
    ``__post_init__`` replaces a ``None`` ``label2id`` with the built-in BIO
    label map and a ``None`` ``id2label`` with the inverse of ``label2id``.
    Each instance therefore gets its own freshly built dictionaries.
    """

    # Data
    synthetic_data_size: int = 100_000
    train_split: float = 0.9
    data_file: str = "data/synthetic.jsonl"

    # Model architecture
    hidden_size: int = 256
    num_hidden_layers: int = 4
    num_attention_heads: int = 8
    intermediate_size: int = 1024
    max_position_embeddings: int = 128
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1

    # Training hyperparameters
    batch_size: int = 64
    learning_rate: float = 1e-3
    num_epochs: int = 8
    weight_decay: float = 0.01
    warmup_steps: int = 500

    # System
    device: str = "cpu"
    num_workers: int = 4
    save_dir: str = "./checkpoints"
    log_interval: int = 100

    # Sequence
    max_seq_length: int = 64

    # Vocabulary (set dynamically from tokenizer)
    vocab_size: int = 8000  # placeholder, overridden after tokenizer vocab is built

    # Special tokens
    pad_token: str = "[PAD]"
    unk_token: str = "[UNK]"
    cls_token: str = "[CLS]"
    sep_token: str = "[SEP]"

    # BIO label scheme: 7 entity types x (B-, I-) + "O" = 15 labels.
    # None means "use the defaults built in __post_init__".
    label2id: Optional[Dict[str, int]] = None
    id2label: Optional[Dict[int, str]] = None

    def __post_init__(self) -> None:
        """Populate the label maps that were left as ``None``."""
        if self.label2id is None:
            self.label2id = {
                "O": 0,
                "B-TITLE": 1, "I-TITLE": 2,
                "B-SEASON": 3, "I-SEASON": 4,
                "B-EPISODE": 5, "I-EPISODE": 6,
                "B-SPECIAL": 7, "I-SPECIAL": 8,
                "B-GROUP": 9, "I-GROUP": 10,
                "B-RESOLUTION": 11, "I-RESOLUTION": 12,
                "B-SOURCE": 13, "I-SOURCE": 14,
            }
        if self.id2label is None:
            # Invert whatever label2id ended up being, so a caller-supplied
            # label2id automatically gets a matching id2label.
            self.id2label = {idx: label for label, idx in self.label2id.items()}

    @property
    def num_labels(self) -> int:
        """Return the number of entries in ``label2id``."""
        return len(self.label2id)