{
"model": {
"d_model": 128,
"n_layers": 4,
"n_heads": 4,
"d_ff": 512,
"dropout": 0.1,
"activation": "gelu",
"max_seq_len": 64,
"vocab_size": 2000,
"pos_encoding_type": "rotary",
"use_flash_attention": true,
"norm_type": "rmsnorm",
"norm_eps": 1e-06,
"init_std": 0.02
},
"diffusion": {
"n_timesteps": 200,
"n_inference_steps": 20,
"schedule_type": "cosine",
"beta_start": 0.0001,
"beta_end": 0.02,
"prediction_type": "epsilon",
"sampling_method": "ddim",
"eta_ddim": 0.0,
"clip_sample_max": 5.0,
"clip_sample_min": -5.0,
"loss_type": "mse",
"loss_weighting": "min_snr",
"p2_gamma": 1.0,
"p2_k": 1.0
},
"graph_encoder": {
"d_graph": 128,
"n_graph_layers": 2,
"n_graph_heads": 4,
"max_evidence_nodes": 50,
"max_compositions": 20,
"max_anomalies": 10,
"max_reasoning_steps": 15,
"conditioning_method": "cross_attention",
"embed_confidence": true,
"embed_temporal": true
},
"tokenizer": {
"bpe_vocab_size": 28000,
"max_sentences": 32,
"sentence_boundary_token": "<sent>",
"pad_token": "<pad>",
"bos_token": "<bos>",
"eos_token": "<eos>",
"mask_token": "<mask>",
"noise_token": "<noise>",
"evidence_token": "<evidence>",
"anomaly_token": "<anomaly>",
"confidence_token": "<confidence>",
"reasoning_token": "<reasoning>",
"composition_token": "<composition>",
"temporal_token": "<temporal>",
"min_frequency": 2,
"dropout_rate": 0.0
},
"training": {
"learning_rate": 0.0001,
"weight_decay": 0.01,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_eps": 1e-08,
"lr_schedule": "cosine",
"warmup_steps": 2000,
"batch_size": 32,
"gradient_accumulation_steps": 4,
"max_steps": 500000,
"max_epochs": 100,
"dropout": 0.1,
"grad_clip_norm": 1.0,
"use_amp": true,
"amp_dtype": "bf16",
"save_every_steps": 5000,
"eval_every_steps": 1000,
"keep_last_n_checkpoints": 3,
"use_ema": true,
"ema_decay": 0.9999,
"train_data_path": "",
"val_data_path": "",
"num_workers": 4,
"log_every_steps": 100,
"wandb_project": "aam-diffusion-llm",
"wandb_run_name": ""
},
"inference": {
"n_steps": 50,
"temperature": 1.0,
"top_k": 50,
"top_p": 0.95,
"repetition_penalty": 1.2,
"max_output_sentences": 16,
"language": "id"
},
"anchored_decoder": {
"d_model": 128,
"d_vocab": 2000,
"n_refine_steps": 3,
"d_refine": 64,
"use_evoformer_feedback": true,
"n_feedback_iterations": 2,
"disambiguation_heads": 8
},
"flow_matching": {
"d_model": 128,
"d_vocab": 2000,
"num_steps": 3
},
"evoformer": {
"d_model": 128,
"n_recycling_steps": 3,
"dropout": 0.0,
"use_layer_recycling": true,
"use_token_recycling": true,
"use_decoder_feedback": true,
"use_prediction_recycling": true,
"min_recycling_improvement": 0.0001
},
"dual_memory": {
"d_model": 128,
"working_memory_size": 512,
"long_term_memory_dim": 64,
"consolidation_method": "attention",
"retrieval_method": "attention",
"n_retrieval_heads": 4,
"dropout": 0.0
},
"mcts": {
"num_simulations": 4,
"c_puct": 1.5,
"temperature": 1.0,
"max_depth": 10,
"use_value_network": true,
"max_children": 8
},
"thinking_toggle": {
"d_model": 128,
"threshold": 0.5
},
"matryoshka": {
"d_model": 768,
"d_ff": 3072,
"granularity_factors": [
0.25,
0.5,
0.75,
1.0
],
"matryoshka_loss_weight": 0.1,
"use_adaptive": true
},
"use_anchored_decoder": true,
"use_flow_matching": true,
"use_evoformer": true,
"use_dual_memory": true,
"use_mcts": true,
"use_thinking_toggle": true,
"use_matryoshka": true,
"use_swiglu_ffn": true,
"model_name": "aam-diffusion-v2.0",
"output_dir": "./output",
"seed": 42,
"aam_mind_source": "rsvs_graph",
"aam_body_type": "specialized_diffusion"
}