{
"model": {
"d_model": 128,
"n_layers": 4,
"n_heads": 4,
"d_ff": 512,
"dropout": 0.1,
"activation": "gelu",
"max_seq_len": 64,
"vocab_size": 2000,
"pos_encoding_type": "rotary",
"use_flash_attention": true,
"norm_type": "rmsnorm",
"norm_eps": 1e-06,
"init_std": 0.02
},
"diffusion": {
"n_timesteps": 200,
"n_inference_steps": 20,
"schedule_type": "cosine",
"beta_start": 0.0001,
"beta_end": 0.02,
"prediction_type": "epsilon",
"sampling_method": "ddim",
"eta_ddim": 0.0,
"clip_sample_max": 5.0,
"clip_sample_min": -5.0,
"loss_type": "mse",
"loss_weighting": "min_snr",
"p2_gamma": 1.0,
"p2_k": 1.0
},
"graph_encoder": {
"d_graph": 128,
"n_graph_layers": 2,
"n_graph_heads": 4,
"max_evidence_nodes": 50,
"max_compositions": 20,
"max_anomalies": 10,
"max_reasoning_steps": 15,
"conditioning_method": "cross_attention",
"embed_confidence": true,
"embed_temporal": true
},
"tokenizer": {
"bpe_vocab_size": 28000,
"max_sentences": 32,
"sentence_boundary_token": "<sent>",
"pad_token": "<pad>",
"bos_token": "<bos>",
"eos_token": "<eos>",
"mask_token": "<mask>",
"noise_token": "<noise>",
"evidence_token": "<evidence>",
"anomaly_token": "<anomaly>",
"confidence_token": "<confidence>",
"reasoning_token": "<reasoning>",
"composition_token": "<composition>",
"temporal_token": "<temporal>",
"min_frequency": 2,
"dropout_rate": 0.0
},
"training": {
"learning_rate": 0.0001,
"weight_decay": 0.01,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_eps": 1e-08,
"lr_schedule": "cosine",
"warmup_steps": 2000,
"batch_size": 32,
"gradient_accumulation_steps": 4,
"max_steps": 500000,
"max_epochs": 100,
"dropout": 0.1,
"grad_clip_norm": 1.0,
"use_amp": true,
"amp_dtype": "bf16",
"save_every_steps": 5000,
"eval_every_steps": 1000,
"keep_last_n_checkpoints": 3,
"use_ema": true,
"ema_decay": 0.9999,
"train_data_path": "",
"val_data_path": "",
"num_workers": 4,
"log_every_steps": 100,
"wandb_project": "aam-diffusion-llm",
"wandb_run_name": ""
},
"inference": {
"n_steps": 50,
"temperature": 1.0,
"top_k": 50,
"top_p": 0.95,
"repetition_penalty": 1.2,
"max_output_sentences": 16,
"language": "id"
},
"anchored_decoder": {
"d_model": 128,
"d_vocab": 2000,
"n_refine_steps": 3,
"d_refine": 64,
"use_evoformer_feedback": true,
"n_feedback_iterations": 2,
"disambiguation_heads": 8
},
"flow_matching": {
"d_model": 128,
"d_vocab": 2000,
"num_steps": 3
},
"evoformer": {
"d_model": 128,
"n_recycling_steps": 3,
"dropout": 0.0,
"use_layer_recycling": true,
"use_token_recycling": true,
"use_decoder_feedback": true,
"use_prediction_recycling": true,
"min_recycling_improvement": 0.0001
},
"dual_memory": {
"d_model": 128,
"working_memory_size": 512,
"long_term_memory_dim": 64,
"consolidation_method": "attention",
"retrieval_method": "attention",
"n_retrieval_heads": 4,
"dropout": 0.0
},
"mcts": {
"num_simulations": 4,
"c_puct": 1.5,
"temperature": 1.0,
"max_depth": 10,
"use_value_network": true,
"max_children": 8
},
"thinking_toggle": {
"d_model": 128,
"threshold": 0.5
},
"matryoshka": {
"d_model": 768,
"d_ff": 3072,
"granularity_factors": [
0.25,
0.5,
0.75,
1.0
],
"matryoshka_loss_weight": 0.1,
"use_adaptive": true
},
"use_anchored_decoder": true,
"use_flow_matching": true,
"use_evoformer": true,
"use_dual_memory": true,
"use_mcts": true,
"use_thinking_toggle": true,
"use_matryoshka": true,
"use_swiglu_ffn": true,
"model_name": "aam-diffusion-v2.0",
"output_dir": "./output",
"seed": 42,
"aam_mind_source": "rsvs_graph",
"aam_body_type": "specialized_diffusion"
}