{
  "model": {
    "d_model": 128,
    "n_layers": 4,
    "n_heads": 4,
    "d_ff": 512,
    "dropout": 0.1,
    "activation": "gelu",
    "max_seq_len": 64,
    "vocab_size": 2000,
    "pos_encoding_type": "rotary",
    "use_flash_attention": true,
    "norm_type": "rmsnorm",
    "norm_eps": 1e-06,
    "init_std": 0.02
  },
  "diffusion": {
    "n_timesteps": 200,
    "n_inference_steps": 20,
    "schedule_type": "cosine",
    "beta_start": 0.0001,
    "beta_end": 0.02,
    "prediction_type": "epsilon",
    "sampling_method": "ddim",
    "eta_ddim": 0.0,
    "clip_sample_max": 5.0,
    "clip_sample_min": -5.0,
    "loss_type": "mse",
    "loss_weighting": "min_snr",
    "p2_gamma": 1.0,
    "p2_k": 1.0
  },
  "graph_encoder": {
    "d_graph": 128,
    "n_graph_layers": 2,
    "n_graph_heads": 4,
    "max_evidence_nodes": 50,
    "max_compositions": 20,
    "max_anomalies": 10,
    "max_reasoning_steps": 15,
    "conditioning_method": "cross_attention",
    "embed_confidence": true,
    "embed_temporal": true
  },
  "tokenizer": {
    "bpe_vocab_size": 28000,
    "max_sentences": 32,
    "sentence_boundary_token": "",
    "pad_token": "",
    "bos_token": "",
    "eos_token": "",
    "mask_token": "",
    "noise_token": "",
    "evidence_token": "",
    "anomaly_token": "",
    "confidence_token": "",
    "reasoning_token": "",
    "composition_token": "",
    "temporal_token": "",
    "min_frequency": 2,
    "dropout_rate": 0.0
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_eps": 1e-08,
    "lr_schedule": "cosine",
    "warmup_steps": 2000,
    "batch_size": 32,
    "gradient_accumulation_steps": 4,
    "max_steps": 500000,
    "max_epochs": 100,
    "dropout": 0.1,
    "grad_clip_norm": 1.0,
    "use_amp": true,
    "amp_dtype": "bf16",
    "save_every_steps": 5000,
    "eval_every_steps": 1000,
    "keep_last_n_checkpoints": 3,
    "use_ema": true,
    "ema_decay": 0.9999,
    "train_data_path": "",
    "val_data_path": "",
    "num_workers": 4,
    "log_every_steps": 100,
    "wandb_project": "aam-diffusion-llm",
    "wandb_run_name": ""
  },
  "inference": {
    "n_steps": 50,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
    "max_output_sentences": 16,
    "language": "id"
  },
  "anchored_decoder": {
    "d_model": 128,
    "d_vocab": 2000,
    "n_refine_steps": 3,
    "d_refine": 64,
    "use_evoformer_feedback": true,
    "n_feedback_iterations": 2,
    "disambiguation_heads": 8
  },
  "flow_matching": {
    "d_model": 128,
    "d_vocab": 2000,
    "num_steps": 3
  },
  "evoformer": {
    "d_model": 128,
    "n_recycling_steps": 3,
    "dropout": 0.0,
    "use_layer_recycling": true,
    "use_token_recycling": true,
    "use_decoder_feedback": true,
    "use_prediction_recycling": true,
    "min_recycling_improvement": 0.0001
  },
  "dual_memory": {
    "d_model": 128,
    "working_memory_size": 512,
    "long_term_memory_dim": 64,
    "consolidation_method": "attention",
    "retrieval_method": "attention",
    "n_retrieval_heads": 4,
    "dropout": 0.0
  },
  "mcts": {
    "num_simulations": 4,
    "c_puct": 1.5,
    "temperature": 1.0,
    "max_depth": 10,
    "use_value_network": true,
    "max_children": 8
  },
  "thinking_toggle": {
    "d_model": 128,
    "threshold": 0.5
  },
  "matryoshka": {
    "d_model": 768,
    "d_ff": 3072,
    "granularity_factors": [
      0.25,
      0.5,
      0.75,
      1.0
    ],
    "matryoshka_loss_weight": 0.1,
    "use_adaptive": true
  },
  "use_anchored_decoder": true,
  "use_flow_matching": true,
  "use_evoformer": true,
  "use_dual_memory": true,
  "use_mcts": true,
  "use_thinking_toggle": true,
  "use_matryoshka": true,
  "use_swiglu_ffn": true,
  "model_name": "aam-diffusion-v2.0",
  "output_dir": "./output",
  "seed": 42,
  "aam_mind_source": "rsvs_graph",
  "aam_body_type": "specialized_diffusion"
}