{ "return_dict": true, "output_hidden_states": false, "torchscript": false, "dtype": null, "pruned_heads": {}, "tie_word_embeddings": false, "chunk_size_feed_forward": 0, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "architectures": ["RecursiveMaskedLM"], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "task_specific_params": null, "problem_type": null, "tokenizer_class": null, "prefix": null, "bos_token_id": null, "pad_token_id": null, "eos_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "num_beam_groups": 1, "diversity_penalty": 0.0, "_name_or_path": "", "transformers_version": "4.57.0", "tf_legacy_loss": false, "use_bfloat16": false, "base_model_config": { "return_dict": true, "output_hidden_states": false, "torchscript": false, "dtype": "bfloat16", "pruned_heads": {}, "tie_word_embeddings": false, "chunk_size_feed_forward": 0, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "architectures": [ "LLaDAModelLM" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "task_specific_params": null, "problem_type": null, "tokenizer_class": null, "prefix": null, "bos_token_id": null, "pad_token_id": 76, "eos_token_id": 76, "sep_token_id": null, "decoder_start_token_id": null, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "num_beam_groups": 1, "diversity_penalty": 0.0, "_name_or_path": "Fraser/LLaDA-8B-Base-gg2m", "transformers_version": "4.57.0", "d_model": 4096, "n_heads": 32, "n_kv_heads": 32, "n_layers": 32, "mlp_ratio": 4, "mlp_hidden_size": 12288, "activation_type": "silu", "block_type": "llama", "block_group_size": 1, "alibi": false, "alibi_bias_max": 8.0, "rope": true, "rope_full_precision": true, "flash_attention": false, "attention_dropout": 0.0, "multi_query_attention": null, "attention_layer_norm": false, "residual_dropout": 0.0, "embedding_dropout": 0.0, "input_emb_norm": false, "layer_norm_type": "rms", "layer_norm_with_affine": true, "rms_norm_eps": 1e-05, "attention_layer_norm_with_affine": true, "max_sequence_length": 4096, "rope_theta": 500000.0, "include_qkv_bias": false, "include_bias": false, "bias_for_layer_norm": false, "scale_logits": false, "vocab_size": 85, "embedding_size": 85, "weight_tying": false, "mask_token_id": 78, "init_device": "meta", "init_fn": "mitchell", "init_std": 0.02, "init_cutoff_factor": null, "precision": "amp_bf16", "auto_map": { "AutoConfig": "configuration_llada.LLaDAConfig", "AutoModelForCausalLM": "modeling_llada.LLaDAModelLM", "AutoModel": "modeling_llada.LLaDAModelLM" }, "model_type": "llada", "use_cache": false, "tf_legacy_loss": false, "use_bfloat16": false, "output_attentions": false }, "num_recursions": 4, "normalization": "softmax", "loss_weight": "linear", "mask_token_id": 78, "gradient_steps": null, "schedule": "linear", "causal_strength": 1.0, "temperature_max": 0.0, "entropy_target_max": 0.0, "entropy_floor_max": 0.0, "smear_sigma_max": 0.0, "noise_std_max": 0.0, "iteration_rope_dim_fraction": 0.0, "use_recursion_checkpointing": true, "soft_embedding_method": "softmax", "soft_embedding_ema_step": 1.0, "flow_matching_enabled": false, "flow_matching_lambda": 0.5, "flow_matching_t_distribution": "logit_normal", "flow_matching_t_logit_mean": -0.4, "flow_matching_t_logit_std": 1.0, "flow_matching_t_min": 0.01, "flow_matching_t_max": 0.99, "flow_matching_noise_scale": 2.0, "flow_matching_mask_scale": false, "self_distillation_enabled": false, "self_distillation_lambda": 0.5, "self_distillation_temperature_min": 1.5, "self_distillation_temperature_max": 10.0, "self_distillation_temperature_distribution": "log_uniform", "self_distillation_teacher": "first", "model_type": "recursive-mlm", "output_attentions": false, "auto_map": { "AutoConfig": "configuration_recursive.RecursiveMLMConfig", "AutoModel": "modeling_recursive.RecursiveMaskedLM" } }