BRlkl
/

test_multiturn_simple_transcript1024

Model card Files Files and versions

test_multiturn_simple_transcript1024 / sft_b200.yaml

BRlkl's picture

Upload folder using huggingface_hub

48b5767 verified 2 days ago

History Blame Contribute Delete

3.49 kB

	# Dataset sources for this run, as a list with one entry.
	# NOTE(review): the original nesting was lost; every key below `name` is
	# assumed to belong to this single list entry — confirm against the loader.
	datasets:
	  - name: chatalpaca_multiturn_enriched
	    repo_id: BRlkl/chatalpaca-multiturn-enriched
	    source_split: train
	    format: messages_all_turns
	    validation_ratio: 0.02
	    split_seed: 17
	    min_turns: 2
	    max_turns: 6
	    max_message_chars: 6000
	    use_base_chat_template: true

	# Model configuration: base checkpoint, numeric precision, attention
	# backend switches, and architecture-specific options.
	model:
	  base_model_name: google/t5gemma-l-l-prefixlm-it
	  initial_model_path: BRlkl/test_1024
	  dtype: bfloat16
	  attn_implementation: sdpa
	  disable_cudnn_sdp: true
	  disable_mha_fastpath: true
	  magicnorm_eps: 1.0e-6
	  # z_slots and both token limits below are all set to 1024 in this config.
	  z_slots: 1024
	  num_time_tokens: 0
	  use_explicit_time_features: false
	  gate_attention_heads: 4
	  max_observation_tokens: 1024
	  max_decoder_tokens: 1024
	  thought_loop_proposal_mode: observation_hidden_compression
	  preserve_observation_encoder_manifold: true
	  observation_encoder_use_state_context: true
	  latent_attention_mask_mode: full
	  hard_state_replace: true

	# Training-loop settings: seeding, optimizer hyperparameters, logging and
	# evaluation cadence, loss weights, generation limits, tracked metric keys,
	# and checkpoint selection.
	training:
	  seed: 17
	  num_workers: 2
	  gradient_checkpointing: true
	  mixed_precision: bf16
	  max_grad_norm: 1.0
	  weight_decay: 0.01
	  # Two separate learning rates are configured: one for the backbone and
	  # a larger one for newly added modules.
	  backbone_learning_rate: 5.0e-6
	  new_module_learning_rate: 1.0e-4
	  adam_beta1: 0.9
	  adam_beta2: 0.95
	  adam_epsilon: 1.0e-8
	  fused_adamw: true
	  freeze_gate_head: true
	  assistant_feedback_mode: teacher_forced
	  log_every_steps: 1
	  eval_every_steps: 100
	  checkpoint_every_steps: 500
	  eval_max_batches: 16
	  validation_behavior_max_batches: 4
	  # Explicit null instead of a bare empty value (same loaded value).
	  # NOTE(review): presumably null means "no cap" — confirm in the trainer.
	  max_train_examples: null
	  max_validation_examples: null

	  # Three loss weights at 0.33 each (they sum to 0.99, not 1.0).
	  response_loss_weight: 0.33
	  current_user_reconstruction_loss_weight: 0.33
	  probe_loss_weight: 0.33
	  probe_question_text: "What is everything we have talked about so far? Give exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc"

	  # Generation length limits; each max_new_tokens key is paired with an
	  # extra_new_tokens key.
	  feedback_generation_max_new_tokens: 1024
	  feedback_generation_extra_new_tokens: 16
	  validation_response_max_new_tokens: 1024
	  validation_response_extra_new_tokens: 16
	  validation_probe_max_new_tokens: 1024
	  validation_probe_extra_new_tokens: 16

	  wandb_train_metric_keys:
	    - train/loss_total
	    - train/loss_response
	    - train/loss_current_user_reconstruction
	    - train/loss_probe
	    - train/response_first_token_exact_match
	    # NOTE(review): this key was spelled "current-user" (hyphen); it is
	    # normalized to "current_user" to match the sibling keys. Confirm
	    # against the metric names the trainer actually logs.
	    - train/current_user_reconstruction_first_token_exact_match
	    - train/probe_first_token_exact_match
	  wandb_validation_metric_keys:
	    - validation/loss_total
	    - validation/loss_response
	    - validation/loss_current_user_reconstruction
	    - validation/loss_probe
	    - validation/goal_loss
	    - validation/response_similarity
	    - validation/response_reconstruction_similarity
	    - validation/probe_transcript_similarity

	  # Checkpoint selection uses validation/goal_loss with mode "min".
	  checkpoint_selection_metric: validation/goal_loss
	  checkpoint_selection_mode: min
	  validation_response_exact_miss_penalty: 1.0
	  validation_reconstruction_similarity_miss_penalty: 1.0
	  validation_probe_exact_miss_penalty: 1.0
	  validation_probe_similarity_miss_penalty: 2.0

	# Batch sizing and schedule length for the training phase.
	# NOTE(review): the original nesting was lost; `phase` is kept at the same
	# level as the other sections — confirm it is not meant to sit under
	# `training`.
	phase:
	  micro_batch_size: 10
	  eval_batch_size: 10
	  gradient_accumulation_steps: 4
	  num_train_epochs: 5
	  warmup_ratio: 0.03
	  shuffle_train: true

	# Cache location for preprocessed data (a relative path).
	cache:
	  preprocessed_root: cache/preprocessed_pre_sft_multiturn_simple_transcript

	# Output directories for runs and exports (relative paths).
	paths:
	  run_root: runs_pre_sft_multiturn_simple_transcript
	  export_root: exports_multiturn_simple_transcript

	# Inference-time formatting options. `use_base_chat_template` is also set
	# to true in the dataset entry of this config.
	inference:
	  format: predictive_state_multiturn
	  use_base_chat_template: true

	# Weights & Biases logging: on/off switch, project, and run name.
	wandb:
	  enabled: true
	  project: samantha-pre-sft
	  run_name: t5gemma2-thoughtloop-pre-sft-simple-transcript

	# Hub settings: model repository id and visibility flag.
	# `private: false` — presumably the repository is public; confirm.
	hub:
	  model_repo_id: BRlkl/test_multiturn_simple_transcript1024_2
	  private: false