Spaces:

kharki
/

abpt

Sleeping

auto: sync run_testformer_wikitext_combo_remote.py

f37be5a about 1 month ago

5.96 kB

	from dataclasses import dataclass


	@dataclass
	class ModelConfig:
	    """Hyperparameter bundle with a default for every field.

	    ``ModelConfig()`` is valid on its own; callers override individual
	    fields by keyword. Declaration order below is also the positional
	    order of the generated ``__init__``, so fields must not be reordered.
	    The section comments only group related fields; they carry no runtime
	    meaning.
	    """

	    # ---- Backbone ----
	    vocab_size: int = 8192
	    d_model: int = 256
	    n_heads: int = 4
	    n_layers: int = 6
	    d_ff: int = 512
	    max_seq_len: int = 512
	    dropout: float = 0.1

	    # ---- Attention Residuals ----
	    use_attn_res: bool = True

	    # ---- Branching ----
	    use_branches: bool = True
	    n_branches: int = 2
	    diversity_weight: float = 0.1
	    branch_diversity_target: float = 0.08

	    # ---- Verifier ----
	    # The three verifier_*_weight defaults sum to 1.0.
	    use_verifier: bool = True
	    verifier_entropy_weight: float = 0.4
	    verifier_agreement_weight: float = 0.4
	    verifier_consistency_weight: float = 0.2
	    verifier_temperature: float = 4.0

	    # ---- Plastic Layer ----
	    use_plastic: bool = True
	    plastic_lr: float = 1e-4
	    plastic_decay: float = 0.99
	    plastic_l2_weight: float = 0.01
	    plastic_hidden: int = 64
	    plastic_noise_scale: float = 0.05
	    plastic_mask_ratio: float = 0.15
	    plastic_train_updates: int = 1

	    # ---- Equilibrium / Routing (Phase 0) ----
	    eq_momentum: float = 0.1
	    eq_warmup_steps: int = 50
	    router_lr: float = 3e-5
	    router_warmup_steps: int = 500
	    router_entropy_weight: float = 0.01
	    route_temperature: float = 8.0
	    route_threshold_momentum: float = 0.2
	    route_threshold_offset_scale: float = 0.2
	    # The four route_*_target defaults sum to 1.0.
	    route_forward_target: float = 0.55
	    route_branch_target: float = 0.25
	    route_backward_target: float = 0.15
	    route_plastic_target: float = 0.05

	    # ---- Anchor V1 ----
	    # fog_* settings; use_fog_flow is off by default.
	    use_fog_flow: bool = False
	    fog_task_profile: str = "auto"
	    fog_compare_ratio: float = 0.25
	    fog_memory_ratio: float = 0.75
	    fog_expand_ratio: float = 2.0
	    fog_gate_ratio: float = 0.125
	    # General anchor_* settings.
	    anchor_prior_weight: float = 1.0
	    anchor_runtime_weight: float = 1.0
	    anchor_threshold: float = 0.65
	    anchor_domain_mode: str = "auto"
	    anchor_max_candidates: int = 6
	    anchor_ttl_init: float = 4.0
	    anchor_support_decay: float = 0.9
	    anchor_candidate_promote_threshold: float = 0.55
	    anchor_confirm_threshold: float = 0.7
	    anchor_revision_threshold: float = 0.35
	    anchor_contradiction_threshold: float = 0.65
	    anchor_dead_end_threshold: float = 0.85
	    anchor_arbiter_beta: float = 8.0
	    anchor_arbiter_revise_threshold: float = 0.45
	    anchor_revision_temperature: float = 1.0
	    anchor_viability_alpha: float = 2.0
	    anchor_viability_beta: float = 2.5
	    anchor_age_gamma: float = 0.25
	    anchor_descendant_mass_delta: float = 0.75
	    anchor_descendant_coherence_eta: float = 0.75
	    anchor_detector_alignment_weight: float = 0.05
	    anchor_context_stability_weight: float = 0.01
	    # anchor_dependency_* settings; the four *_weight defaults sum to 1.0.
	    anchor_dependency_threshold: float = 0.55
	    anchor_dependency_confirm_slope: float = 0.10
	    anchor_dependency_temporal_window: float = 16.0
	    anchor_dependency_similarity_weight: float = 0.55
	    anchor_dependency_temporal_weight: float = 0.20
	    anchor_dependency_support_weight: float = 0.15
	    anchor_dependency_viability_weight: float = 0.10
	    anchor_dependency_max_predecessors: int = 4
	    anchor_dependency_counterfactual_top_edges: int = 0
	    anchor_dependency_future_window: int = 16
	    anchor_context_min_viability: float = 0.30
	    # anchor_future_proposal_* settings (head enabled by default).
	    anchor_use_future_proposal_head: bool = True
	    anchor_future_proposal_trigger: float = 0.35
	    anchor_future_proposal_hidden: int = 64
	    anchor_future_proposal_threshold: float = 0.58
	    anchor_future_proposal_temperature: float = 0.75
	    anchor_future_proposal_horizon_scale: float = 4.0
	    anchor_future_proposal_span_scale: float = 4.0
	    anchor_future_proposal_max_horizon: int = 32
	    anchor_future_proposal_max_windows: int = 48
	    anchor_future_proposal_topk: int = 4
	    anchor_future_proposal_residual_scale: float = 0.10
	    # anchor_proposal_* weights, targets and margins.
	    anchor_proposal_score_weight: float = 0.05
	    anchor_proposal_margin_weight: float = 0.05
	    anchor_proposal_alignment_weight: float = 0.02
	    anchor_proposal_counterfactual_weight: float = 0.05
	    anchor_proposal_margin_target: float = 0.05
	    anchor_proposal_target_temperature: float = 0.15
	    anchor_proposal_counterfactual_margin: float = 0.02
	    anchor_proposal_counterfactual_window: int = 4
	    # anchor_proposal_rollout_* settings (rollout enabled by default).
	    anchor_use_proposal_rollout: bool = True
	    anchor_proposal_rollout_steps: int = 4
	    anchor_proposal_rollout_hidden: int = 64
	    anchor_proposal_rollout_weight: float = 0.05
	    anchor_proposal_rollout_margin: float = 0.02
	    anchor_proposal_rollout_residual_scale: float = 0.15
	    anchor_proposal_rollout_pressure_trigger: float = 0.45
	    anchor_proposal_rollout_score_trigger: float = 0.90

	    # ---- Training ----
	    learning_rate: float = 3e-4
	    weight_decay: float = 0.01
	    warmup_steps: int = 100
	    max_steps: int = 5000
	    batch_size: int = 32
	    eval_interval: int = 100
	    gradient_clip: float = 1.0


	# Ablation presets.
	# Each preset spells out all four feature switches explicitly, so it does
	# not depend on the ModelConfig defaults for those flags.

	# All four switches off.
	BASELINE_0 = ModelConfig(
	    use_attn_res=False,
	    use_branches=False,
	    use_verifier=False,
	    use_plastic=False,
	)

	# Attention residuals only.
	BASELINE_1_ATTNRES = ModelConfig(
	    use_attn_res=True,
	    use_branches=False,
	    use_verifier=False,
	    use_plastic=False,
	)

	# Attention residuals + branches + verifier; plastic layer off.
	BASELINE_2_BRANCHES = ModelConfig(
	    use_attn_res=True,
	    use_branches=True,
	    use_verifier=True,
	    use_plastic=False,
	)

	# Attention residuals + plastic layer; branches and verifier off.
	BASELINE_3_PLASTIC = ModelConfig(
	    use_attn_res=True,
	    use_branches=False,
	    use_verifier=False,
	    use_plastic=True,
	)

	# All four switches on.
	FULL_MODEL = ModelConfig(
	    use_attn_res=True,
	    use_branches=True,
	    use_verifier=True,
	    use_plastic=True,
	)

	# Size presets: these override dimensions only and leave the feature
	# switches at their ModelConfig defaults.
	TOY_CONFIG = ModelConfig(
	    vocab_size=512,
	    d_model=64,
	    n_heads=2,
	    n_layers=3,
	    d_ff=128,
	    max_seq_len=128,
	    plastic_hidden=16,
	)

	# Same vocab_size and max_seq_len as TOY_CONFIG, with larger widths and
	# explicit anchor settings.
	SCALEUP_CONFIG = ModelConfig(
	    vocab_size=512,
	    d_model=512,
	    n_heads=8,
	    n_layers=4,
	    d_ff=1024,
	    max_seq_len=128,
	    plastic_hidden=128,
	    anchor_threshold=0.2,
	    anchor_ttl_init=4.0,
	    anchor_dead_end_threshold=0.5,
	)

	# Name -> preset lookup table. The values are the module-level instances
	# themselves (not copies), so mutating an entry mutates the constant too.
	PRESETS = {
	    "baseline-0": BASELINE_0,
	    "baseline-1-attnres": BASELINE_1_ATTNRES,
	    "baseline-2-branches": BASELINE_2_BRANCHES,
	    "baseline-3-plastic": BASELINE_3_PLASTIC,
	    "full": FULL_MODEL,
	    "toy": TOY_CONFIG,
	    "scaleup": SCALEUP_CONFIG,
	}