Spaces:
Sleeping
Sleeping
| from dataclasses import dataclass | |
@dataclass
class ModelConfig:
    """Flat container of model and training hyperparameters.

    Every field has a default, so ``ModelConfig()`` is valid and individual
    values are overridden by keyword, e.g. ``ModelConfig(d_model=64)``.
    The ``@dataclass`` decorator is required: it generates the keyword
    ``__init__`` (plus ``__repr__`` and ``__eq__``) that the module-level
    presets rely on. Without it the annotated names are plain class
    attributes and ``ModelConfig(...)`` with arguments raises ``TypeError``.

    Fields are grouped by the section comments below. The ``use_*`` booleans
    are the switches toggled by the ablation presets defined in this module.
    The precise meaning of each numeric field is defined by the code that
    consumes this config, which is not part of this file.
    """

    # Backbone
    vocab_size: int = 8192
    d_model: int = 256
    n_heads: int = 4
    n_layers: int = 6
    d_ff: int = 512
    max_seq_len: int = 512
    dropout: float = 0.1

    # Attention Residuals
    use_attn_res: bool = True

    # Branching
    use_branches: bool = True
    n_branches: int = 2
    diversity_weight: float = 0.1
    branch_diversity_target: float = 0.08

    # Verifier
    use_verifier: bool = True
    # NOTE: the three default verifier_*_weight values nominally sum to 1.0.
    verifier_entropy_weight: float = 0.4
    verifier_agreement_weight: float = 0.4
    verifier_consistency_weight: float = 0.2
    verifier_temperature: float = 4.0

    # Plastic Layer
    use_plastic: bool = True
    plastic_lr: float = 1e-4
    plastic_decay: float = 0.99
    plastic_l2_weight: float = 0.01
    plastic_hidden: int = 64
    plastic_noise_scale: float = 0.05
    plastic_mask_ratio: float = 0.15
    plastic_train_updates: int = 1

    # Equilibrium / Routing (Phase 0)
    eq_momentum: float = 0.1
    eq_warmup_steps: int = 50
    router_lr: float = 3e-5
    router_warmup_steps: int = 500
    router_entropy_weight: float = 0.01
    route_temperature: float = 8.0
    route_threshold_momentum: float = 0.2
    route_threshold_offset_scale: float = 0.2
    # NOTE: the four default route_*_target values nominally sum to 1.0.
    route_forward_target: float = 0.55
    route_branch_target: float = 0.25
    route_backward_target: float = 0.15
    route_plastic_target: float = 0.05

    # Anchor V1
    use_fog_flow: bool = False
    fog_task_profile: str = "auto"
    fog_compare_ratio: float = 0.25
    fog_memory_ratio: float = 0.75
    fog_expand_ratio: float = 2.0
    fog_gate_ratio: float = 0.125
    anchor_prior_weight: float = 1.0
    anchor_runtime_weight: float = 1.0
    anchor_threshold: float = 0.65
    anchor_domain_mode: str = "auto"
    anchor_max_candidates: int = 6
    anchor_ttl_init: float = 4.0
    anchor_support_decay: float = 0.9
    anchor_candidate_promote_threshold: float = 0.55
    anchor_confirm_threshold: float = 0.7
    anchor_revision_threshold: float = 0.35
    anchor_contradiction_threshold: float = 0.65
    anchor_dead_end_threshold: float = 0.85
    anchor_arbiter_beta: float = 8.0
    anchor_arbiter_revise_threshold: float = 0.45
    anchor_revision_temperature: float = 1.0
    anchor_viability_alpha: float = 2.0
    anchor_viability_beta: float = 2.5
    anchor_age_gamma: float = 0.25
    anchor_descendant_mass_delta: float = 0.75
    anchor_descendant_coherence_eta: float = 0.75
    anchor_detector_alignment_weight: float = 0.05
    anchor_context_stability_weight: float = 0.01
    anchor_dependency_threshold: float = 0.55
    anchor_dependency_confirm_slope: float = 0.10
    anchor_dependency_temporal_window: float = 16.0
    anchor_dependency_similarity_weight: float = 0.55
    anchor_dependency_temporal_weight: float = 0.20
    anchor_dependency_support_weight: float = 0.15
    anchor_dependency_viability_weight: float = 0.10
    anchor_dependency_max_predecessors: int = 4
    anchor_dependency_counterfactual_top_edges: int = 0
    anchor_dependency_future_window: int = 16
    anchor_context_min_viability: float = 0.30
    anchor_use_future_proposal_head: bool = True
    anchor_future_proposal_trigger: float = 0.35
    anchor_future_proposal_hidden: int = 64
    anchor_future_proposal_threshold: float = 0.58
    anchor_future_proposal_temperature: float = 0.75
    anchor_future_proposal_horizon_scale: float = 4.0
    anchor_future_proposal_span_scale: float = 4.0
    anchor_future_proposal_max_horizon: int = 32
    anchor_future_proposal_max_windows: int = 48
    anchor_future_proposal_topk: int = 4
    anchor_future_proposal_residual_scale: float = 0.10
    anchor_proposal_score_weight: float = 0.05
    anchor_proposal_margin_weight: float = 0.05
    anchor_proposal_alignment_weight: float = 0.02
    anchor_proposal_counterfactual_weight: float = 0.05
    anchor_proposal_margin_target: float = 0.05
    anchor_proposal_target_temperature: float = 0.15
    anchor_proposal_counterfactual_margin: float = 0.02
    anchor_proposal_counterfactual_window: int = 4
    anchor_use_proposal_rollout: bool = True
    anchor_proposal_rollout_steps: int = 4
    anchor_proposal_rollout_hidden: int = 64
    anchor_proposal_rollout_weight: float = 0.05
    anchor_proposal_rollout_margin: float = 0.02
    anchor_proposal_rollout_residual_scale: float = 0.15
    anchor_proposal_rollout_pressure_trigger: float = 0.45
    anchor_proposal_rollout_score_trigger: float = 0.90

    # Training
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 100
    max_steps: int = 5000
    batch_size: int = 32
    eval_interval: int = 100
    gradient_clip: float = 1.0
| # Ablation presets | |
# Ablation: all four optional components switched off.
BASELINE_0 = ModelConfig(
    use_attn_res=False,
    use_branches=False,
    use_verifier=False,
    use_plastic=False,
)

# Ablation: attention residuals only.
BASELINE_1_ATTNRES = ModelConfig(
    use_attn_res=True,
    use_branches=False,
    use_verifier=False,
    use_plastic=False,
)

# Ablation: attention residuals plus branches and verifier; plastic layer off.
BASELINE_2_BRANCHES = ModelConfig(
    use_attn_res=True,
    use_branches=True,
    use_verifier=True,
    use_plastic=False,
)

# Ablation: attention residuals plus plastic layer; branches and verifier off.
BASELINE_3_PLASTIC = ModelConfig(
    use_attn_res=True,
    use_branches=False,
    use_verifier=False,
    use_plastic=True,
)

# All four component flags enabled, spelled out explicitly.
FULL_MODEL = ModelConfig(
    use_attn_res=True,
    use_branches=True,
    use_verifier=True,
    use_plastic=True,
)

# Reduced backbone dimensions; every other field keeps its default.
TOY_CONFIG = ModelConfig(
    vocab_size=512,
    d_model=64,
    n_heads=2,
    n_layers=3,
    d_ff=128,
    max_seq_len=128,
    plastic_hidden=16,
)

# Wider backbone with the small vocabulary and sequence length, plus
# overridden anchor thresholds.
SCALEUP_CONFIG = ModelConfig(
    vocab_size=512,
    d_model=512,
    n_heads=8,
    n_layers=4,
    d_ff=1024,
    max_seq_len=128,
    plastic_hidden=128,
    anchor_threshold=0.2,
    anchor_ttl_init=4.0,
    anchor_dead_end_threshold=0.5,
)

# Registry mapping preset name to its config instance.
PRESETS = {
    "baseline-0": BASELINE_0,
    "baseline-1-attnres": BASELINE_1_ATTNRES,
    "baseline-2-branches": BASELINE_2_BRANCHES,
    "baseline-3-plastic": BASELINE_3_PLASTIC,
    "full": FULL_MODEL,
    "toy": TOY_CONFIG,
    "scaleup": SCALEUP_CONFIG,
}