File size: 4,060 Bytes
3a49034
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
{
  "model": {
    "d_model": 128,
    "n_layers": 4,
    "n_heads": 4,
    "d_ff": 512,
    "dropout": 0.1,
    "activation": "gelu",
    "max_seq_len": 64,
    "vocab_size": 2000,
    "pos_encoding_type": "rotary",
    "use_flash_attention": true,
    "norm_type": "rmsnorm",
    "norm_eps": 1e-06,
    "init_std": 0.02
  },
  "diffusion": {
    "n_timesteps": 200,
    "n_inference_steps": 20,
    "schedule_type": "cosine",
    "beta_start": 0.0001,
    "beta_end": 0.02,
    "prediction_type": "epsilon",
    "sampling_method": "ddim",
    "eta_ddim": 0.0,
    "clip_sample_max": 5.0,
    "clip_sample_min": -5.0,
    "loss_type": "mse",
    "loss_weighting": "min_snr",
    "p2_gamma": 1.0,
    "p2_k": 1.0
  },
  "graph_encoder": {
    "d_graph": 128,
    "n_graph_layers": 2,
    "n_graph_heads": 4,
    "max_evidence_nodes": 50,
    "max_compositions": 20,
    "max_anomalies": 10,
    "max_reasoning_steps": 15,
    "conditioning_method": "cross_attention",
    "embed_confidence": true,
    "embed_temporal": true
  },
  "tokenizer": {
    "bpe_vocab_size": 28000,
    "max_sentences": 32,
    "sentence_boundary_token": "<sent>",
    "pad_token": "<pad>",
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "mask_token": "<mask>",
    "noise_token": "<noise>",
    "evidence_token": "<evidence>",
    "anomaly_token": "<anomaly>",
    "confidence_token": "<confidence>",
    "reasoning_token": "<reasoning>",
    "composition_token": "<composition>",
    "temporal_token": "<temporal>",
    "min_frequency": 2,
    "dropout_rate": 0.0
  },
  "training": {
    "learning_rate": 0.0001,
    "weight_decay": 0.01,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "adam_eps": 1e-08,
    "lr_schedule": "cosine",
    "warmup_steps": 2000,
    "batch_size": 32,
    "gradient_accumulation_steps": 4,
    "max_steps": 500000,
    "max_epochs": 100,
    "dropout": 0.1,
    "grad_clip_norm": 1.0,
    "use_amp": true,
    "amp_dtype": "bf16",
    "save_every_steps": 5000,
    "eval_every_steps": 1000,
    "keep_last_n_checkpoints": 3,
    "use_ema": true,
    "ema_decay": 0.9999,
    "train_data_path": "",
    "val_data_path": "",
    "num_workers": 4,
    "log_every_steps": 100,
    "wandb_project": "aam-diffusion-llm",
    "wandb_run_name": ""
  },
  "inference": {
    "n_steps": 50,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
    "max_output_sentences": 16,
    "language": "id"
  },
  "anchored_decoder": {
    "d_model": 128,
    "d_vocab": 2000,
    "n_refine_steps": 3,
    "d_refine": 64,
    "use_evoformer_feedback": true,
    "n_feedback_iterations": 2,
    "disambiguation_heads": 8
  },
  "flow_matching": {
    "d_model": 128,
    "d_vocab": 2000,
    "num_steps": 3
  },
  "evoformer": {
    "d_model": 128,
    "n_recycling_steps": 3,
    "dropout": 0.0,
    "use_layer_recycling": true,
    "use_token_recycling": true,
    "use_decoder_feedback": true,
    "use_prediction_recycling": true,
    "min_recycling_improvement": 0.0001
  },
  "dual_memory": {
    "d_model": 128,
    "working_memory_size": 512,
    "long_term_memory_dim": 64,
    "consolidation_method": "attention",
    "retrieval_method": "attention",
    "n_retrieval_heads": 4,
    "dropout": 0.0
  },
  "mcts": {
    "num_simulations": 4,
    "c_puct": 1.5,
    "temperature": 1.0,
    "max_depth": 10,
    "use_value_network": true,
    "max_children": 8
  },
  "thinking_toggle": {
    "d_model": 128,
    "threshold": 0.5
  },
  "matryoshka": {
    "d_model": 768,
    "d_ff": 3072,
    "granularity_factors": [
      0.25,
      0.5,
      0.75,
      1.0
    ],
    "matryoshka_loss_weight": 0.1,
    "use_adaptive": true
  },
  "use_anchored_decoder": true,
  "use_flow_matching": true,
  "use_evoformer": true,
  "use_dual_memory": true,
  "use_mcts": true,
  "use_thinking_toggle": true,
  "use_matryoshka": true,
  "use_swiglu_ffn": true,
  "model_name": "aam-diffusion-v2.0",
  "output_dir": "./output",
  "seed": 42,
  "aam_mind_source": "rsvs_graph",
  "aam_body_type": "specialized_diffusion"
}