BRlkl
/

test

Model card Files Files and versions

test / sft_config.json

BRlkl's picture

Upload folder using huggingface_hub

5232381 verified 17 days ago

History Blame Contribute Delete

3 kB

	{
	"datasets": [
	{
	"name": "alpaca",
	"repo_id": "tatsu-lab/alpaca",
	"format": "alpaca_single_turn",
	"source_split": "train",
	"validation_ratio": 0.02,
	"split_seed": 17,
	"use_base_chat_template": true
	}
	],
	"model": {
	"base_model_name": "google/t5gemma-l-l-prefixlm-it",
	"dtype": "bfloat16",
	"attn_implementation": "sdpa",
	"magicnorm_eps": 1e-06,
	"z_slots": 256,
	"num_time_tokens": 0,
	"use_explicit_time_features": false,
	"gate_attention_heads": 4,
	"max_observation_tokens": 1024,
	"max_decoder_tokens": 1024,
	"thought_loop_proposal_mode": "observation_hidden_compression",
	"preserve_observation_encoder_manifold": true,
	"observation_encoder_use_state_context": true,
	"latent_attention_mask_mode": "full",
	"initial_update_gate_bias": 6.0
	},
	"training": {
	"seed": 17,
	"num_workers": 4,
	"gradient_checkpointing": true,
	"mixed_precision": "bf16",
	"max_grad_norm": 1.0,
	"weight_decay": 0.01,
	"backbone_learning_rate": 1e-05,
	"new_module_learning_rate": 0.0003,
	"adam_beta1": 0.9,
	"adam_beta2": 0.95,
	"adam_epsilon": 1e-08,
	"fused_adamw": true,
	"freeze_gate_head": true,
	"log_every_steps": 1,
	"eval_every_steps": 100,
	"checkpoint_every_steps": 1000,
	"max_train_examples": null,
	"max_validation_examples": null,
	"latent_bridge_mode": "thought_loop",
	"decoder_training_mode": "teacher_forced",
	"freeze_decoder_first_fraction": 1.0,
	"probe_extra_new_tokens": 16,
	"identity_control_target_slots": 256,
	"identity_control_use_chat_template": false
	},
	"phases": {
	"phase0a": {
	"enabled": false,
	"micro_batch_size": 32,
	"eval_batch_size": 32,
	"gradient_accumulation_steps": 2,
	"num_train_epochs": 1,
	"warmup_ratio": 0.03,
	"shuffle_train": true,
	"instruction_prefix": "Recover the original text from the corrupted text through the latent bottleneck.",
	"corruption": {
	"span_mask_fraction": 0.22,
	"min_span_words": 1,
	"max_span_words": 8,
	"max_mask_spans": 6,
	"word_dropout_prob": 0.12,
	"placeholder_text": "[blank]"
	}
	},
	"phase0b": {
	"enabled": true,
	"micro_batch_size": 32,
	"eval_batch_size": 32,
	"gradient_accumulation_steps": 4,
	"num_train_epochs": 1,
	"warmup_ratio": 0.03,
	"shuffle_train": true,
	"instruction_prefix": "You are a helpful assistant. Reply to the user with the best of your capabilities."
	}
	},
	"cache": {
	"preprocessed_root": "cache/preprocessed_pre_sft"
	},
	"paths": {
	"run_root": "runs_pre_sft",
	"export_root": "exports"
	},
	"inference": {
	"format": "alpaca_single_turn"
	},
	"wandb": {
	"enabled": true,
	"project": "samantha-pre-sft",
	"run_name": "t5gemma2-thoughtloop-pre-sft-alpaca"
	},
	"hub": {
	"model_repo_id": "BRlkl/test",
	"private": false
	}
	}