Fraser committed on
Commit
f98df9d
·
verified ·
1 Parent(s): 1c92536

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. config.json +170 -117
  2. model.safetensors +1 -1
  3. modeling_llada.py +5 -63
  4. modeling_recursive.py +9 -10
config.json CHANGED
@@ -1,157 +1,210 @@
1
  {
2
- "architectures": [
3
- "RecursiveMaskedLM"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_recursive.RecursiveMLMConfig",
7
- "AutoModel": "modeling_recursive.RecursiveMaskedLM"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "base_model_config": {
10
- "_name_or_path": "Fraser/LLaDA-8B-Base-gg2m",
11
- "activation_type": "silu",
 
 
 
 
 
 
 
 
12
  "add_cross_attention": false,
13
- "alibi": false,
14
- "alibi_bias_max": 8.0,
15
  "architectures": [
16
  "LLaDAModelLM"
17
  ],
18
- "attention_dropout": 0.0,
19
- "attention_layer_norm": false,
20
- "attention_layer_norm_with_affine": true,
21
- "auto_map": {
22
- "AutoConfig": "configuration_llada.LLaDAConfig",
23
- "AutoModel": "modeling_llada.LLaDAModelLM",
24
- "AutoModelForCausalLM": "modeling_llada.LLaDAModelLM"
25
- },
26
- "bad_words_ids": null,
27
- "begin_suppress_tokens": null,
28
- "bias_for_layer_norm": false,
29
- "block_group_size": 1,
30
- "block_type": "llama",
31
- "bos_token_id": 75,
32
- "chunk_size_feed_forward": 0,
33
- "cross_attention_hidden_size": null,
34
- "d_model": 4096,
35
- "decoder_start_token_id": null,
36
- "diversity_penalty": 0.0,
37
- "do_sample": false,
38
- "dtype": "bfloat16",
39
- "early_stopping": false,
40
- "embedding_dropout": 0.0,
41
- "embedding_size": 85,
42
- "encoder_no_repeat_ngram_size": 0,
43
- "eos_token_id": 76,
44
- "exponential_decay_length_penalty": null,
45
  "finetuning_task": null,
46
- "flash_attention": false,
47
- "forced_bos_token_id": null,
48
- "forced_eos_token_id": null,
49
  "id2label": {
50
  "0": "LABEL_0",
51
  "1": "LABEL_1"
52
  },
53
- "include_bias": false,
54
- "include_qkv_bias": false,
55
- "init_cutoff_factor": null,
56
- "init_device": "meta",
57
- "init_fn": "mitchell",
58
- "init_std": 0.02,
59
- "input_emb_norm": false,
60
- "is_decoder": false,
61
- "is_encoder_decoder": false,
62
  "label2id": {
63
  "LABEL_0": 0,
64
  "LABEL_1": 1
65
  },
66
- "layer_norm_type": "rms",
67
- "layer_norm_with_affine": true,
68
- "length_penalty": 1.0,
69
- "mask_token_id": 78,
 
 
 
 
 
70
  "max_length": 20,
71
- "max_sequence_length": 4096,
72
  "min_length": 0,
73
- "mlp_hidden_size": 12288,
74
- "mlp_ratio": 4,
75
- "model_type": "llada",
76
- "multi_query_attention": null,
77
- "n_heads": 32,
78
- "n_kv_heads": 32,
79
- "n_layers": 32,
80
- "no_repeat_ngram_size": 0,
81
- "num_beam_groups": 1,
82
  "num_beams": 1,
 
 
 
 
 
 
 
 
 
83
  "num_return_sequences": 1,
84
- "output_attentions": false,
85
- "output_hidden_states": false,
86
  "output_scores": false,
87
- "pad_token_id": 76,
88
- "precision": "amp_bf16",
89
- "prefix": null,
90
- "problem_type": null,
91
- "pruned_heads": {},
92
- "remove_invalid_values": false,
93
- "repetition_penalty": 1.0,
94
- "residual_dropout": 0.0,
95
- "return_dict": true,
96
  "return_dict_in_generate": false,
97
- "rms_norm_eps": 1e-05,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  "rope": true,
99
  "rope_full_precision": true,
 
 
 
 
 
 
 
 
 
 
 
 
100
  "rope_theta": 500000.0,
 
 
 
101
  "scale_logits": false,
102
- "sep_token_id": null,
103
- "suppress_tokens": null,
104
- "task_specific_params": null,
105
- "temperature": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
106
  "tf_legacy_loss": false,
107
- "tie_encoder_decoder": false,
108
- "tie_word_embeddings": true,
109
- "tokenizer_class": null,
110
- "top_k": 50,
111
- "top_p": 1.0,
112
- "torchscript": false,
113
- "transformers_version": "4.57.0",
114
- "typical_p": 1.0,
115
  "use_bfloat16": false,
116
- "use_cache": false,
117
- "vocab_size": 85,
118
- "weight_tying": false
119
  },
120
- "bos_token_id": 75,
 
 
 
 
 
121
  "causal_strength": 1.0,
122
- "dtype": "bfloat16",
123
- "entropy_floor_max": 0.0,
124
  "entropy_target_max": 0.0,
125
- "eos_token_id": 76,
 
 
 
 
 
 
126
  "flow_matching_enabled": false,
127
  "flow_matching_lambda": 0.5,
128
- "flow_matching_mask_scale": false,
129
- "flow_matching_noise_scale": 2.0,
130
  "flow_matching_t_distribution": "logit_normal",
131
  "flow_matching_t_logit_mean": -0.4,
132
  "flow_matching_t_logit_std": 1.0,
133
- "flow_matching_t_max": 0.99,
134
  "flow_matching_t_min": 0.01,
135
- "gradient_steps": null,
136
- "iteration_rope_dim_fraction": 0.0,
137
- "loss_weight": "linear",
138
- "mask_token_id": 78,
139
- "model_type": "recursive-mlm",
140
- "noise_std_max": 0.0,
141
- "normalization": "softmax",
142
- "num_recursions": 4,
143
- "pad_token_id": 76,
144
- "schedule": "linear",
145
  "self_distillation_enabled": false,
146
  "self_distillation_lambda": 0.5,
147
- "self_distillation_teacher": "first",
148
- "self_distillation_temperature_distribution": "log_uniform",
149
- "self_distillation_temperature_max": 10.0,
150
  "self_distillation_temperature_min": 1.5,
151
- "smear_sigma_max": 0.0,
152
- "soft_embedding_ema_step": 1.0,
153
- "soft_embedding_method": "softmax",
154
- "temperature_max": 0.0,
155
- "transformers_version": "4.57.0",
156
- "use_recursion_checkpointing": true
157
- }
 
 
 
 
1
  {
2
+ "return_dict": true,
3
+ "output_hidden_states": false,
4
+ "torchscript": false,
5
+ "dtype": null,
6
+ "pruned_heads": {},
7
+ "tie_word_embeddings": false,
8
+ "chunk_size_feed_forward": 0,
9
+ "is_encoder_decoder": false,
10
+ "is_decoder": false,
11
+ "cross_attention_hidden_size": null,
12
+ "add_cross_attention": false,
13
+ "tie_encoder_decoder": false,
14
+ "architectures": ["RecursiveMaskedLM"],
15
+ "finetuning_task": null,
16
+ "id2label": {
17
+ "0": "LABEL_0",
18
+ "1": "LABEL_1"
19
+ },
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1
23
  },
24
+ "task_specific_params": null,
25
+ "problem_type": null,
26
+ "tokenizer_class": null,
27
+ "prefix": null,
28
+ "bos_token_id": null,
29
+ "pad_token_id": null,
30
+ "eos_token_id": null,
31
+ "sep_token_id": null,
32
+ "decoder_start_token_id": null,
33
+ "max_length": 20,
34
+ "min_length": 0,
35
+ "do_sample": false,
36
+ "early_stopping": false,
37
+ "num_beams": 1,
38
+ "temperature": 1.0,
39
+ "top_k": 50,
40
+ "top_p": 1.0,
41
+ "typical_p": 1.0,
42
+ "repetition_penalty": 1.0,
43
+ "length_penalty": 1.0,
44
+ "no_repeat_ngram_size": 0,
45
+ "encoder_no_repeat_ngram_size": 0,
46
+ "bad_words_ids": null,
47
+ "num_return_sequences": 1,
48
+ "output_scores": false,
49
+ "return_dict_in_generate": false,
50
+ "forced_bos_token_id": null,
51
+ "forced_eos_token_id": null,
52
+ "remove_invalid_values": false,
53
+ "exponential_decay_length_penalty": null,
54
+ "suppress_tokens": null,
55
+ "begin_suppress_tokens": null,
56
+ "num_beam_groups": 1,
57
+ "diversity_penalty": 0.0,
58
+ "_name_or_path": "",
59
+ "transformers_version": "4.57.0",
60
+ "tf_legacy_loss": false,
61
+ "use_bfloat16": false,
62
  "base_model_config": {
63
+ "return_dict": true,
64
+ "output_hidden_states": false,
65
+ "torchscript": false,
66
+ "dtype": "bfloat16",
67
+ "pruned_heads": {},
68
+ "tie_word_embeddings": false,
69
+ "chunk_size_feed_forward": 0,
70
+ "is_encoder_decoder": false,
71
+ "is_decoder": false,
72
+ "cross_attention_hidden_size": null,
73
  "add_cross_attention": false,
74
+ "tie_encoder_decoder": false,
 
75
  "architectures": [
76
  "LLaDAModelLM"
77
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  "finetuning_task": null,
 
 
 
79
  "id2label": {
80
  "0": "LABEL_0",
81
  "1": "LABEL_1"
82
  },
 
 
 
 
 
 
 
 
 
83
  "label2id": {
84
  "LABEL_0": 0,
85
  "LABEL_1": 1
86
  },
87
+ "task_specific_params": null,
88
+ "problem_type": null,
89
+ "tokenizer_class": null,
90
+ "prefix": null,
91
+ "bos_token_id": null,
92
+ "pad_token_id": 76,
93
+ "eos_token_id": 76,
94
+ "sep_token_id": null,
95
+ "decoder_start_token_id": null,
96
  "max_length": 20,
 
97
  "min_length": 0,
98
+ "do_sample": false,
99
+ "early_stopping": false,
 
 
 
 
 
 
 
100
  "num_beams": 1,
101
+ "temperature": 1.0,
102
+ "top_k": 50,
103
+ "top_p": 1.0,
104
+ "typical_p": 1.0,
105
+ "repetition_penalty": 1.0,
106
+ "length_penalty": 1.0,
107
+ "no_repeat_ngram_size": 0,
108
+ "encoder_no_repeat_ngram_size": 0,
109
+ "bad_words_ids": null,
110
  "num_return_sequences": 1,
 
 
111
  "output_scores": false,
 
 
 
 
 
 
 
 
 
112
  "return_dict_in_generate": false,
113
+ "forced_bos_token_id": null,
114
+ "forced_eos_token_id": null,
115
+ "remove_invalid_values": false,
116
+ "exponential_decay_length_penalty": null,
117
+ "suppress_tokens": null,
118
+ "begin_suppress_tokens": null,
119
+ "num_beam_groups": 1,
120
+ "diversity_penalty": 0.0,
121
+ "_name_or_path": "Fraser/LLaDA-8B-Base-gg2m",
122
+ "transformers_version": "4.57.0",
123
+ "d_model": 4096,
124
+ "n_heads": 32,
125
+ "n_kv_heads": 32,
126
+ "n_layers": 32,
127
+ "mlp_ratio": 4,
128
+ "mlp_hidden_size": 12288,
129
+ "activation_type": "silu",
130
+ "block_type": "llama",
131
+ "block_group_size": 1,
132
+ "alibi": false,
133
+ "alibi_bias_max": 8.0,
134
  "rope": true,
135
  "rope_full_precision": true,
136
+ "flash_attention": false,
137
+ "attention_dropout": 0.0,
138
+ "multi_query_attention": null,
139
+ "attention_layer_norm": false,
140
+ "residual_dropout": 0.0,
141
+ "embedding_dropout": 0.0,
142
+ "input_emb_norm": false,
143
+ "layer_norm_type": "rms",
144
+ "layer_norm_with_affine": true,
145
+ "rms_norm_eps": 1e-05,
146
+ "attention_layer_norm_with_affine": true,
147
+ "max_sequence_length": 4096,
148
  "rope_theta": 500000.0,
149
+ "include_qkv_bias": false,
150
+ "include_bias": false,
151
+ "bias_for_layer_norm": false,
152
  "scale_logits": false,
153
+ "vocab_size": 85,
154
+ "embedding_size": 85,
155
+ "weight_tying": false,
156
+ "mask_token_id": 78,
157
+ "init_device": "meta",
158
+ "init_fn": "mitchell",
159
+ "init_std": 0.02,
160
+ "init_cutoff_factor": null,
161
+ "precision": "amp_bf16",
162
+ "auto_map": {
163
+ "AutoConfig": "configuration_llada.LLaDAConfig",
164
+ "AutoModelForCausalLM": "modeling_llada.LLaDAModelLM",
165
+ "AutoModel": "modeling_llada.LLaDAModelLM"
166
+ },
167
+ "model_type": "llada",
168
+ "use_cache": false,
169
  "tf_legacy_loss": false,
 
 
 
 
 
 
 
 
170
  "use_bfloat16": false,
171
+ "output_attentions": false
 
 
172
  },
173
+ "num_recursions": 4,
174
+ "normalization": "softmax",
175
+ "loss_weight": "linear",
176
+ "mask_token_id": 78,
177
+ "gradient_steps": null,
178
+ "schedule": "linear",
179
  "causal_strength": 1.0,
180
+ "temperature_max": 0.0,
 
181
  "entropy_target_max": 0.0,
182
+ "entropy_floor_max": 0.0,
183
+ "smear_sigma_max": 0.0,
184
+ "noise_std_max": 0.0,
185
+ "iteration_rope_dim_fraction": 0.0,
186
+ "use_recursion_checkpointing": true,
187
+ "soft_embedding_method": "softmax",
188
+ "soft_embedding_ema_step": 1.0,
189
  "flow_matching_enabled": false,
190
  "flow_matching_lambda": 0.5,
 
 
191
  "flow_matching_t_distribution": "logit_normal",
192
  "flow_matching_t_logit_mean": -0.4,
193
  "flow_matching_t_logit_std": 1.0,
 
194
  "flow_matching_t_min": 0.01,
195
+ "flow_matching_t_max": 0.99,
196
+ "flow_matching_noise_scale": 2.0,
197
+ "flow_matching_mask_scale": false,
 
 
 
 
 
 
 
198
  "self_distillation_enabled": false,
199
  "self_distillation_lambda": 0.5,
 
 
 
200
  "self_distillation_temperature_min": 1.5,
201
+ "self_distillation_temperature_max": 10.0,
202
+ "self_distillation_temperature_distribution": "log_uniform",
203
+ "self_distillation_teacher": "first",
204
+ "model_type": "recursive-mlm",
205
+ "output_attentions": false,
206
+ "auto_map": {
207
+ "AutoConfig": "configuration_recursive.RecursiveMLMConfig",
208
+ "AutoModel": "modeling_recursive.RecursiveMaskedLM"
209
+ }
210
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b879d9aaefbb88ca28a3babf863b2ab1ec1ef00bbf82ef7f3d7ddf7284dd968
3
  size 13960604928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ebfe2c41adc724c0a164e5ea1efbaabc4c89a0da6c51e74e4d51502218219e0
3
  size 13960604928
modeling_llada.py CHANGED
@@ -1094,68 +1094,10 @@ class LLaDABlockGroup(nn.ModuleList):
1094
  block.set_activation_checkpointing(strategy)
1095
 
1096
 
1097
- class LLaDAPreTrainedModel(PreTrainedModel):
1098
- """
1099
- Minimal HF-compatible base to enable gradient checkpointing hooks and centralize
1100
- parameter initialization.
1101
- """
1102
-
1103
- config_class = LLaDAConfig
1104
- base_model_prefix = "model"
1105
- _no_split_modules = ["LLaDALlamaBlock"]
1106
- _supports_gradient_checkpointing = True # backward compat
1107
- supports_gradient_checkpointing = True # transformers >=4.38
1108
-
1109
- def __init__(self, config, *model_args, **model_kwargs):
1110
- hf_config = config
1111
- if not hasattr(hf_config, "to_dict"):
1112
- hf_config = LLaDAConfig(**config.__dict__)
1113
- super().__init__(hf_config, *model_args, **model_kwargs)
1114
-
1115
- def _init_weights(self, module):
1116
- if getattr(module, "_llada_params_initialized", False):
1117
- return
1118
- if hasattr(module, "reset_parameters"):
1119
- module.reset_parameters()
1120
- for child in module.modules():
1121
- setattr(child, "_llada_params_initialized", True)
1122
-
1123
- def _set_gradient_checkpointing(
1124
- self, enable: bool = True, gradient_checkpointing_func: Callable = None
1125
- ):
1126
- """
1127
- New-format hook expected by `PreTrainedModel.gradient_checkpointing_enable`.
1128
- Only LLaDAModel (the heavy transformer) actually toggles checkpointing.
1129
- """
1130
- from torch.utils.checkpoint import checkpoint
1131
-
1132
- if gradient_checkpointing_func is None:
1133
- gradient_checkpointing_func = checkpoint
1134
-
1135
- # When called on the HF wrapper (LLaDAModelLM), reach into the inner LLaDAModel.
1136
- target = self.model if isinstance(self, LLaDAModelLM) else self
1137
-
1138
- if isinstance(target, LLaDAModel):
1139
- target._gradient_checkpointing_func = gradient_checkpointing_func
1140
- target.gradient_checkpointing = enable
1141
- strategy = ActivationCheckpointingStrategy.whole_layer if enable else None
1142
- target.set_activation_checkpointing(strategy)
1143
- return
1144
-
1145
- # Fallback: walk modules to find the core model.
1146
- for module in self.modules():
1147
- if isinstance(module, LLaDAModel):
1148
- module._gradient_checkpointing_func = gradient_checkpointing_func
1149
- module.gradient_checkpointing = enable
1150
- strategy = ActivationCheckpointingStrategy.whole_layer if enable else None
1151
- module.set_activation_checkpointing(strategy)
1152
- break
1153
-
1154
-
1155
- class LLaDAModel(LLaDAPreTrainedModel):
1156
  def __init__(self, config: ModelConfig, init_params: bool = True):
1157
- super().__init__(config)
1158
- self.gradient_checkpointing: bool = False
1159
  self.__cache = BufferCache()
1160
 
1161
  # Validate config.
@@ -1224,7 +1166,7 @@ class LLaDAModel(LLaDAPreTrainedModel):
1224
  )
1225
  # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
1226
  if init_params and self.config.init_device != "meta":
1227
- self.post_init()
1228
  self.__num_fwd_flops: Optional[int] = None
1229
 
1230
  # Warm up cache.
@@ -1513,7 +1455,7 @@ def create_model_config_from_pretrained_config(config: LLaDAConfig):
1513
  return model_config
1514
 
1515
 
1516
- class LLaDAModelLM(LLaDAPreTrainedModel):
1517
  """
1518
  Extremely barebones HF model wrapper.
1519
  """
 
1094
  block.set_activation_checkpointing(strategy)
1095
 
1096
 
1097
+ class LLaDAModel(nn.Module):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1098
  def __init__(self, config: ModelConfig, init_params: bool = True):
1099
+ super().__init__()
1100
+ self.config = config
1101
  self.__cache = BufferCache()
1102
 
1103
  # Validate config.
 
1166
  )
1167
  # When `init_device="meta"` FSDP will call `reset_parameters()` to initialize weights.
1168
  if init_params and self.config.init_device != "meta":
1169
+ self.reset_parameters()
1170
  self.__num_fwd_flops: Optional[int] = None
1171
 
1172
  # Warm up cache.
 
1455
  return model_config
1456
 
1457
 
1458
+ class LLaDAModelLM(PreTrainedModel):
1459
  """
1460
  Extremely barebones HF model wrapper.
1461
  """
modeling_recursive.py CHANGED
@@ -13,14 +13,6 @@ from transformers.utils import ModelOutput
13
 
14
  from .configuration_recursive import RecursiveMLMConfig
15
 
16
- # Register the custom LLaDA model so AutoConfig.for_model("llada") works
17
- # when constructing the base model from base_model_config.
18
- from .configuration_llada import LLaDAConfig
19
- from .modeling_llada import LLaDAModelLM
20
-
21
- AutoConfig.register("llada", LLaDAConfig)
22
- AutoModelForMaskedLM.register(LLaDAConfig, LLaDAModelLM)
23
-
24
 
25
  @dataclass
26
  class IterationMetrics(ModelOutput):
@@ -75,8 +67,15 @@ class RecursiveMaskedLM(PreTrainedModel):
75
  # to avoid reinitializing the pre-trained weights via _init_weights()
76
  self.mlm = base_model
77
  elif config.base_model_config is not None:
78
- base_config = AutoConfig.for_model(**config.base_model_config)
79
- self.mlm = AutoModelForMaskedLM.from_config(base_config)
 
 
 
 
 
 
 
80
  # Only call post_init() for freshly created models (needs weight init)
81
  self.post_init()
82
  else:
 
13
 
14
  from .configuration_recursive import RecursiveMLMConfig
15
 
 
 
 
 
 
 
 
 
16
 
17
  @dataclass
18
  class IterationMetrics(ModelOutput):
 
67
  # to avoid reinitializing the pre-trained weights via _init_weights()
68
  self.mlm = base_model
69
  elif config.base_model_config is not None:
70
+ model_type = config.base_model_config.get("model_type", "")
71
+ if model_type == "llada":
72
+ from .configuration_llada import LLaDAConfig
73
+ from .modeling_llada import LLaDAModelLM
74
+ base_config = LLaDAConfig.from_dict(config.base_model_config)
75
+ self.mlm = LLaDAModelLM(base_config)
76
+ else:
77
+ base_config = AutoConfig.for_model(**config.base_model_config)
78
+ self.mlm = AutoModelForMaskedLM.from_config(base_config)
79
  # Only call post_init() for freshly created models (needs weight init)
80
  self.post_init()
81
  else: