# abpt/src/model/config.py
# Last commit: "auto: sync run_testformer_wikitext_combo_remote.py" (f37be5a)
from dataclasses import dataclass
@dataclass
class ModelConfig:
    """Single flat bundle of model and training hyperparameters.

    Every field has a default, so ``ModelConfig()`` is a valid configuration
    and callers override only what they need by keyword. Fields are grouped
    by subsystem with section comments; the declaration order also defines
    the positional order of the generated ``__init__``, so new fields should
    be appended to their section with care.
    """

    # ------------------------------------------------------------------
    # Backbone
    # ------------------------------------------------------------------
    vocab_size: int = 8192
    d_model: int = 256
    n_heads: int = 4
    n_layers: int = 6
    d_ff: int = 512
    max_seq_len: int = 512
    dropout: float = 0.1

    # ------------------------------------------------------------------
    # Attention Residuals
    # ------------------------------------------------------------------
    use_attn_res: bool = True

    # ------------------------------------------------------------------
    # Branching
    # ------------------------------------------------------------------
    use_branches: bool = True
    n_branches: int = 2
    diversity_weight: float = 0.1
    branch_diversity_target: float = 0.08

    # ------------------------------------------------------------------
    # Verifier
    # ------------------------------------------------------------------
    use_verifier: bool = True
    # The three verifier_*_weight defaults below sum to 1.0.
    # NOTE(review): whether consumers require that sum is not visible here.
    verifier_entropy_weight: float = 0.4
    verifier_agreement_weight: float = 0.4
    verifier_consistency_weight: float = 0.2
    verifier_temperature: float = 4.0

    # ------------------------------------------------------------------
    # Plastic Layer
    # ------------------------------------------------------------------
    use_plastic: bool = True
    plastic_lr: float = 1e-4
    plastic_decay: float = 0.99
    plastic_l2_weight: float = 0.01
    plastic_hidden: int = 64
    plastic_noise_scale: float = 0.05
    plastic_mask_ratio: float = 0.15
    plastic_train_updates: int = 1

    # ------------------------------------------------------------------
    # Equilibrium / Routing (Phase 0)
    # ------------------------------------------------------------------
    eq_momentum: float = 0.1
    eq_warmup_steps: int = 50
    router_lr: float = 3e-5
    router_warmup_steps: int = 500
    router_entropy_weight: float = 0.01
    route_temperature: float = 8.0
    route_threshold_momentum: float = 0.2
    route_threshold_offset_scale: float = 0.2
    # The four route_*_target defaults below sum to 1.0.
    # NOTE(review): presumably a target distribution over routes - confirm.
    route_forward_target: float = 0.55
    route_branch_target: float = 0.25
    route_backward_target: float = 0.15
    route_plastic_target: float = 0.05

    # ------------------------------------------------------------------
    # Anchor V1
    # ------------------------------------------------------------------
    # fog_* settings (disabled by default via use_fog_flow).
    use_fog_flow: bool = False
    fog_task_profile: str = "auto"
    fog_compare_ratio: float = 0.25
    fog_memory_ratio: float = 0.75
    fog_expand_ratio: float = 2.0
    fog_gate_ratio: float = 0.125

    # Core anchor_* settings.
    anchor_prior_weight: float = 1.0
    anchor_runtime_weight: float = 1.0
    anchor_threshold: float = 0.65
    anchor_domain_mode: str = "auto"
    anchor_max_candidates: int = 6
    anchor_ttl_init: float = 4.0
    anchor_support_decay: float = 0.9
    anchor_candidate_promote_threshold: float = 0.55
    anchor_confirm_threshold: float = 0.7
    anchor_revision_threshold: float = 0.35
    anchor_contradiction_threshold: float = 0.65
    anchor_dead_end_threshold: float = 0.85
    anchor_arbiter_beta: float = 8.0
    anchor_arbiter_revise_threshold: float = 0.45
    anchor_revision_temperature: float = 1.0
    anchor_viability_alpha: float = 2.0
    anchor_viability_beta: float = 2.5
    anchor_age_gamma: float = 0.25
    anchor_descendant_mass_delta: float = 0.75
    anchor_descendant_coherence_eta: float = 0.75
    anchor_detector_alignment_weight: float = 0.05
    anchor_context_stability_weight: float = 0.01

    # anchor_dependency_* settings.
    anchor_dependency_threshold: float = 0.55
    anchor_dependency_confirm_slope: float = 0.10
    anchor_dependency_temporal_window: float = 16.0
    # The four anchor_dependency_*_weight defaults below sum to 1.0.
    anchor_dependency_similarity_weight: float = 0.55
    anchor_dependency_temporal_weight: float = 0.20
    anchor_dependency_support_weight: float = 0.15
    anchor_dependency_viability_weight: float = 0.10
    anchor_dependency_max_predecessors: int = 4
    anchor_dependency_counterfactual_top_edges: int = 0
    anchor_dependency_future_window: int = 16
    anchor_context_min_viability: float = 0.30

    # anchor_future_proposal_* settings (enabled by default).
    anchor_use_future_proposal_head: bool = True
    anchor_future_proposal_trigger: float = 0.35
    anchor_future_proposal_hidden: int = 64
    anchor_future_proposal_threshold: float = 0.58
    anchor_future_proposal_temperature: float = 0.75
    anchor_future_proposal_horizon_scale: float = 4.0
    anchor_future_proposal_span_scale: float = 4.0
    anchor_future_proposal_max_horizon: int = 32
    anchor_future_proposal_max_windows: int = 48
    anchor_future_proposal_topk: int = 4
    anchor_future_proposal_residual_scale: float = 0.10

    # anchor_proposal_* weights, margins and targets.
    anchor_proposal_score_weight: float = 0.05
    anchor_proposal_margin_weight: float = 0.05
    anchor_proposal_alignment_weight: float = 0.02
    anchor_proposal_counterfactual_weight: float = 0.05
    anchor_proposal_margin_target: float = 0.05
    anchor_proposal_target_temperature: float = 0.15
    anchor_proposal_counterfactual_margin: float = 0.02
    anchor_proposal_counterfactual_window: int = 4

    # anchor_proposal_rollout_* settings (enabled by default).
    anchor_use_proposal_rollout: bool = True
    anchor_proposal_rollout_steps: int = 4
    anchor_proposal_rollout_hidden: int = 64
    anchor_proposal_rollout_weight: float = 0.05
    anchor_proposal_rollout_margin: float = 0.02
    anchor_proposal_rollout_residual_scale: float = 0.15
    anchor_proposal_rollout_pressure_trigger: float = 0.45
    anchor_proposal_rollout_score_trigger: float = 0.90

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 100
    max_steps: int = 5000
    batch_size: int = 32
    eval_interval: int = 100
    gradient_clip: float = 1.0
# ----------------------------------------------------------------------
# Named presets
#
# Each preset is a ModelConfig built once at import time; any field not
# listed keeps its dataclass default. The first five differ only in the
# four use_* component switches.
# ----------------------------------------------------------------------

# All four optional components switched off.
BASELINE_0 = ModelConfig(
    use_attn_res=False,
    use_branches=False,
    use_verifier=False,
    use_plastic=False,
)

# Attention residuals only.
BASELINE_1_ATTNRES = ModelConfig(
    use_attn_res=True,
    use_branches=False,
    use_verifier=False,
    use_plastic=False,
)

# Attention residuals plus branches and the verifier; no plastic layer.
BASELINE_2_BRANCHES = ModelConfig(
    use_attn_res=True,
    use_branches=True,
    use_verifier=True,
    use_plastic=False,
)

# Attention residuals plus the plastic layer; no branches or verifier.
BASELINE_3_PLASTIC = ModelConfig(
    use_attn_res=True,
    use_branches=False,
    use_verifier=False,
    use_plastic=True,
)

# Every component enabled (spelled out explicitly rather than relying on
# the defaults).
FULL_MODEL = ModelConfig(
    use_attn_res=True,
    use_branches=True,
    use_verifier=True,
    use_plastic=True,
)

# Smaller backbone than the defaults; component switches stay at their
# defaults.
TOY_CONFIG = ModelConfig(
    vocab_size=512,
    d_model=64,
    n_heads=2,
    n_layers=3,
    d_ff=128,
    max_seq_len=128,
    plastic_hidden=16,
)

# Wider backbone (d_model, n_heads, d_ff, plastic_hidden) with the same
# vocabulary and sequence length as TOY_CONFIG, plus lower anchor
# thresholds than the defaults. anchor_ttl_init repeats the default value.
SCALEUP_CONFIG = ModelConfig(
    vocab_size=512,
    d_model=512,
    n_heads=8,
    n_layers=4,
    d_ff=1024,
    max_seq_len=128,
    plastic_hidden=128,
    anchor_threshold=0.2,
    anchor_ttl_init=4.0,
    anchor_dead_end_threshold=0.5,
)

# Lookup table from preset name to the shared preset instance.
PRESETS = {
    "baseline-0": BASELINE_0,
    "baseline-1-attnres": BASELINE_1_ATTNRES,
    "baseline-2-branches": BASELINE_2_BRANCHES,
    "baseline-3-plastic": BASELINE_3_PLASTIC,
    "full": FULL_MODEL,
    "toy": TOY_CONFIG,
    "scaleup": SCALEUP_CONFIG,
}