| """ |
| Configuration for Finance-SLM: A Small Language Model for Finance & Banking domain. |
| LLaMA-style architecture with RoPE — supports up to 1M token context. |
| """ |
|
|
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Optional |
|
|
|
|
@dataclass
class SLMConfig:
    """Central configuration for the Finance-SLM project.

    Groups filesystem paths, tokenizer-training settings, model
    architecture, training hyperparameters, generation defaults, and
    Hugging Face Hub metadata in a single dataclass. Instantiating it
    resolves the directory defaults, creates the directories on disk,
    and resolves ``device="auto"`` to a concrete backend.
    """

    # --- Paths (None means "derive from project_dir"; resolved in __post_init__) ---
    project_dir: Path = Path(__file__).resolve().parent
    data_dir: Optional[Path] = None        # defaults to project_dir / "data"
    tokenizer_dir: Optional[Path] = None   # defaults to project_dir / "tokenizer"
    checkpoint_dir: Optional[Path] = None  # defaults to project_dir / "checkpoints"

    # --- Domain metadata ---
    domain_name: str = "Finance"
    domain_slug: str = "finance"
    tokenizer_filename: str = "finance_tokenizer.json"

    # --- Tokenizer training ---
    vocab_size: int = 16_000
    min_frequency: int = 2                 # minimum token frequency to enter the vocab
    special_tokens: list = field(default_factory=lambda: [
        "<pad>", "<unk>", "<bos>", "<eos>", "<|system|>", "<|user|>", "<|assistant|>",
    ])

    # --- Model architecture (LLaMA-style, per module docstring) ---
    n_layer: int = 8
    n_head: int = 8
    n_embd: int = 512
    block_size: int = 512                  # training context window (tokens per sample)
    dropout: float = 0.1
    bias: bool = False                     # no bias terms in linear layers
    ffn_multiplier: float = 2.667          # FFN width factor (~8/3) — TODO confirm against model code
    # NOTE(review): these magnitudes look extreme (1e11 positions, 5e10 theta)
    # next to the module docstring's "1M token context" claim — verify the
    # intended values before relying on them.
    max_position_embeddings: int = 100_000_000_000
    rope_theta: float = 50_000_000_000.0
    sliding_window: Optional[int] = None   # None disables sliding-window attention

    # --- Training hyperparameters ---
    batch_size: int = 4
    gradient_accumulation_steps: int = 4   # effective batch = batch_size * this
    learning_rate: float = 3e-4
    weight_decay: float = 0.1
    max_epochs: int = 20
    dataset_stride: int = 256              # overlap stride when chunking the corpus
    warmup_steps: int = 20
    grad_clip: float = 1.0
    eval_interval: int = 50                # steps between evaluations
    eval_samples: int = 20
    log_interval: int = 10
    device: str = "auto"                   # "auto" resolves to cuda / mps / cpu

    # --- Generation defaults ---
    max_new_tokens: int = 1_000_000
    temperature: float = 0.8
    top_k: int = 50
    top_p: float = 0.9

    # --- Hugging Face Hub metadata ---
    hf_repo_name: str = "finance-slm-1m"
    hf_model_card_tags: list = field(default_factory=lambda: [
        "finance", "banking", "fintech", "trading", "slm",
        "llama-style", "rope", "1m-context", "from-scratch",
    ])

    def __post_init__(self):
        """Resolve derived paths, create directories, and pick a device.

        Side effects: creates data/tokenizer/checkpoint directories on
        disk, and imports torch when device == "auto".
        """
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.tokenizer_dir is None:
            self.tokenizer_dir = self.project_dir / "tokenizer"
        if self.checkpoint_dir is None:
            self.checkpoint_dir = self.project_dir / "checkpoints"
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.tokenizer_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        if self.device == "auto":
            # Imported lazily so the config can be used without torch when
            # an explicit device string is supplied.
            import torch
            if torch.cuda.is_available():
                self.device = "cuda"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
|
|
|
|
| cfg = SLMConfig() |
|
|