# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script generates tiny models used in the TRL library for unit tests. It pushes them to the Hub under the
# `trl-internal-testing` organization.
# This script is meant to be run when adding a new tiny model to the TRL library.

import torch
from huggingface_hub import HfApi, ModelCard
from peft import LoraConfig, get_peft_model
from torch import nn
from transformers import (
    AutoConfig,
    AutoProcessor,
    AutoTokenizer,
    BartModel,
    Cohere2Config,
    Cohere2ForCausalLM,
    CohereConfig,
    CohereForCausalLM,
    DeepseekV3Config,
    DeepseekV3ForCausalLM,
    FalconMambaConfig,
    FalconMambaForCausalLM,
    Gemma2Config,
    Gemma2ForCausalLM,
    Gemma3ForConditionalGeneration,
    Gemma4ForConditionalGeneration,
    GemmaConfig,
    GemmaForCausalLM,
    GenerationConfig,
    Glm4MoeConfig,
    Glm4MoeForCausalLM,
    GPT2Config,
    GPT2LMHeadModel,
    GPTNeoXConfig,
    GPTNeoXForCausalLM,
    GPTNeoXForSequenceClassification,
    GptOssConfig,
    GptOssForCausalLM,
    Idefics2Config,
    Idefics2ForConditionalGeneration,
    Idefics3ForConditionalGeneration,
    InternVLForConditionalGeneration,
    LlamaConfig,
    LlamaForCausalLM,
    LlamaForSequenceClassification,
    LlavaForConditionalGeneration,
    LlavaNextForConditionalGeneration,
    MistralConfig,
    MistralForCausalLM,
    OPTConfig,
    OPTForCausalLM,
    PaliGemmaForConditionalGeneration,
    Phi3Config,
    Phi3ForCausalLM,
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2Config,
    Qwen2ForCausalLM,
    Qwen2ForSequenceClassification,
    Qwen2VLConfig,
    Qwen2VLForConditionalGeneration,
    Qwen3_5Config,
    Qwen3_5ForConditionalGeneration,
    Qwen3Config,
    Qwen3ForCausalLM,
    Qwen3ForSequenceClassification,
    Qwen3MoeConfig,
    Qwen3MoeForCausalLM,
    Qwen3MoeForSequenceClassification,
    Qwen3VLConfig,
    Qwen3VLForConditionalGeneration,
    SmolVLMForConditionalGeneration,
    T5ForConditionalGeneration,
)


# Hub organization that receives every generated repository.
ORGANIZATION = "trl-internal-testing"

# Model card template; `{model_class_name}` is filled with the (unprefixed) class name of the pushed model.
MODEL_CARD = """
---
library_name: transformers
tags: [trl]
---

# Tiny {model_class_name}

This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library.
"""


api = HfApi()


def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, force=False):
    """
    Push a model, its model card and (optionally) its tokenizer and generation config to the Hub.

    The repository id is built as `{ORGANIZATION}/{prefix}-{ModelClassName}-{suffix}`, where the `prefix` and `suffix`
    parts are omitted when they are `None`. If the repository already exists and `force` is `False`, nothing is pushed
    and a message is printed instead.

    Args:
        model: Object exposing `push_to_hub(repo_id)`; its class name is used for the model card and the repo name.
        tokenizer: Tokenizer or processor to push alongside the model, or `None` to skip it.
        generation_config: Generation config to push alongside the model, or `None` to skip it.
        prefix: Optional string prepended to the class name in the repo name (e.g. `"tiny"`).
        suffix: Optional string appended to the repo id (e.g. a model version such as `"3.2"`).
        force: When `True`, push even if the repository already exists.
    """
    model_class_name = model.__class__.__name__
    # The card is rendered before the prefix is applied, so its title shows the bare class name.
    content = MODEL_CARD.format(model_class_name=model_class_name)
    model_card = ModelCard(content)
    if prefix is not None:
        model_class_name = f"{prefix}-{model_class_name}"
    repo_id = f"{ORGANIZATION}/{model_class_name}"
    if suffix is not None:
        repo_id += f"-{suffix}"

    if api.repo_exists(repo_id) and not force:
        print(f"Model {repo_id} already exists, skipping")
        return

    model.push_to_hub(repo_id)
    model_card.push_to_hub(repo_id)
    if tokenizer is not None:
        tokenizer.push_to_hub(repo_id)
    if generation_config is not None:
        generation_config.push_to_hub(repo_id)


def init_weights_tiny_model(model):
    """
    Initialize tiny test models to avoid NaNs from uninitialized weights.

    Uses safe defaults:
    - Linear/Conv1d: Xavier uniform (weights), zero (biases)
    - Embedding: Normal(0, 0.02)
    - LayerNorm: Ones (weights), zero (biases)

    Modules of any other type are left untouched.

    Args:
        model: PyTorch model (modified in-place)
    """
    for module in model.modules():
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            # Projections and convolutions: zero bias (when present), Xavier-uniform weights
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.Embedding):
            # Token embeddings: GPT-style Normal(0, 0.02)
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm weights always 1, bias 0
            nn.init.ones_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)


# Decoder models
for model_id, config_class, model_class, dtype, suffix in [
    # ("bigscience/bloomz-560m", BloomConfig, BloomForCausalLM, None),  # loading fails with this model, see https://huggingface.co/bigscience/bloomz-560m/discussions/14
    ("CohereLabs/aya-expanse-8b", CohereConfig, CohereForCausalLM, torch.float16, None),
    ("CohereLabs/tiny-aya-earth", Cohere2Config, Cohere2ForCausalLM, torch.bfloat16, None),
    ("deepseek-ai/DeepSeek-R1", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, None),
    # It's important to have R1-0528 as it doesn't have the same chat template
    ("deepseek-ai/DeepSeek-R1-0528", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, "0528"),
    ("tiiuae/falcon-7b-instruct", FalconMambaConfig, FalconMambaForCausalLM, torch.bfloat16, None),
    ("google/gemma-2-2b-it", Gemma2Config, Gemma2ForCausalLM, torch.bfloat16, None),
    ("google/gemma-7b-it", GemmaConfig, GemmaForCausalLM, torch.bfloat16, None),
    ("openai-community/gpt2", GPT2Config, GPT2LMHeadModel, torch.float32, None),
    ("EleutherAI/pythia-14m", GPTNeoXConfig, GPTNeoXForCausalLM, torch.float16, None),
    ("meta-llama/Meta-Llama-3-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3"),
    ("meta-llama/Llama-3.1-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.1"),
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.2"),
    ("mistralai/Mistral-7B-Instruct-v0.1", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.1"),
    ("mistralai/Mistral-7B-Instruct-v0.2", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.2"),
    ("facebook/opt-1.3b", OPTConfig, OPTForCausalLM, torch.float16, None),
    ("microsoft/Phi-3-mini-4k-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3"),
    ("microsoft/Phi-3.5-mini-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3.5"),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5"),
    ("Qwen/Qwen2.5-Coder-0.5B", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5-Coder"),
    ("Qwen/Qwen3-8B", Qwen3Config, Qwen3ForCausalLM, torch.bfloat16, None),
]:
    revision = "refs/pr/14" if model_id == "Qwen/Qwen3-8B" else "main"  # chat template with {% generation %}
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)
    config = config_class(
        vocab_size=len(tokenizer.vocab),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
    )
    model = model_class(config).to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

# MoE models
for model_id, config_class, model_class, dtype, suffix in [
    ("Qwen/Qwen3-30B-A3B", Qwen3MoeConfig, Qwen3MoeForCausalLM, torch.bfloat16, None),
    ("openai/gpt-oss-20b", GptOssConfig, GptOssForCausalLM, torch.bfloat16, None),
    ("zai-org/GLM-4.5", Glm4MoeConfig, Glm4MoeForCausalLM, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)
    # Each config class is given its expert count under a different keyword
    kwargs = {}
    if model_id == "zai-org/GLM-4.5":
        kwargs["n_routed_experts"] = 4
    elif model_id == "Qwen/Qwen3-30B-A3B":
        kwargs["num_experts"] = 4
    elif model_id == "openai/gpt-oss-20b":
        kwargs["num_local_experts"] = 4

    config = config_class(
        vocab_size=len(tokenizer.vocab),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
        num_experts_per_tok=2,
        **kwargs,
    )
    model = model_class(config).to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

# Two slightly bigger models, required for vLLM testing
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
generation_config = GenerationConfig.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
config = Qwen2Config(
    vocab_size=len(tokenizer.vocab),
    hidden_size=128,  # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM
    num_attention_heads=4,
    num_key_value_heads=2,
    num_hidden_layers=2,
    intermediate_size=32,
)
model = Qwen2ForCausalLM(config).to(dtype=torch.bfloat16)
push_to_hub(model, tokenizer, generation_config, "small", "2.5")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
generation_config = GenerationConfig.from_pretrained("Qwen/Qwen3-4B")
config = Qwen3Config(
    vocab_size=len(tokenizer.vocab),
    hidden_size=128,  # increase hidden size so that hidden_size // num_attention_heads = 32, required for vLLM
    num_attention_heads=4,
    num_key_value_heads=2,
    num_hidden_layers=2,
    intermediate_size=32,
)
model = Qwen3ForCausalLM(config).to(dtype=torch.bfloat16)
push_to_hub(model, tokenizer, generation_config, "small")

# Reward models
for model_id, model_class, dtype, suffix in [
    ("EleutherAI/pythia-14m", GPTNeoXForSequenceClassification, torch.bfloat16, None),
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaForSequenceClassification, torch.bfloat16, "3.2"),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2ForSequenceClassification, torch.bfloat16, "2.5"),
    ("Qwen/Qwen3-4B", Qwen3ForSequenceClassification, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)
    kwargs = {
        "num_labels": 1,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "num_hidden_layers": 2,
        "intermediate_size": 32,
    }
    config = AutoConfig.from_pretrained(model_id, **kwargs)
    # Bug in transformers: it ignores num_hidden_layers to build layer_types
    if model_id in ("Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen3-4B"):
        config.layer_types = config.layer_types[:2]
    model = model_class(config).to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

# MoE Reward models
for model_id, model_class, dtype, suffix in [
    ("Qwen/Qwen3-30B-A3B", Qwen3MoeForSequenceClassification, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)
    kwargs = {
        "num_labels": 1,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "num_hidden_layers": 2,
        "intermediate_size": 32,
        "num_experts": 4,
        "num_experts_per_tok": 2,
    }
    config = AutoConfig.from_pretrained(model_id, **kwargs)
    model = model_class(config).to(dtype=dtype)
    push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

# Encoder-decoder models
for model_id, model_class, dtype, suffix in [
    ("facebook/bart-base", BartModel, torch.float32, None),
    ("google/flan-t5-small", T5ForConditionalGeneration, torch.float32, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # No generation config is loaded (or pushed) for facebook/bart-base
    generation_config = GenerationConfig.from_pretrained(model_id) if model_id != "facebook/bart-base" else None
    config = AutoConfig.from_pretrained(model_id)
    config.d_model = 24
    model = model_class(config).to(dtype=dtype)
    push_to_hub(model, tokenizer, generation_config, "tiny", suffix)

# Vision Language Models
for model_id, model_class, dtype in [
    ("google/gemma-3-4b-it", Gemma3ForConditionalGeneration, torch.bfloat16),
    ("google/gemma-4-E2B-it", Gemma4ForConditionalGeneration, torch.bfloat16),
    ("google/paligemma-3b-pt-224", PaliGemmaForConditionalGeneration, torch.float32),
    ("HuggingFaceM4/idefics2-8b", Idefics2ForConditionalGeneration, torch.float32),
    ("HuggingFaceM4/Idefics3-8B-Llama3", Idefics3ForConditionalGeneration, torch.bfloat16),
    ("HuggingFaceTB/SmolVLM2-2.2B-Instruct", SmolVLMForConditionalGeneration, torch.float32),
    ("llava-hf/llava-1.5-7b-hf", LlavaForConditionalGeneration, torch.float16),
    # Original model dtype is float16, but it triggers CUDA device side assert error (see GH-4741):
    ("llava-hf/llava-v1.6-mistral-7b-hf", LlavaNextForConditionalGeneration, torch.bfloat16),
    ("OpenGVLab/InternVL3-8B-hf", InternVLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen2-VL-2B-Instruct", Qwen2VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen2.5-VL-3B-Instruct", Qwen2_5_VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen3-VL-2B-Instruct", Qwen3VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen3.5-0.8B", Qwen3_5ForConditionalGeneration, torch.bfloat16),
]:
    processor = AutoProcessor.from_pretrained(model_id)
    # No generation config is loaded (or pushed) for Qwen/Qwen3.5-0.8B
    generation_config = GenerationConfig.from_pretrained(model_id) if model_id != "Qwen/Qwen3.5-0.8B" else None

    text_config = {
        "num_hidden_layers": 2,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "layer_types": None,  # Set it automatically from num_hidden_layers
    }
    vision_config = {
        "num_hidden_layers": 2,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "embed_dim": 64,
    }
    kwargs = {}

    if issubclass(model_class.config_class, (Qwen2VLConfig, Qwen2_5_VLConfig)):
        text_config["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}
        vision_config["depth"] = 2
        # Different dict object from text_config; see GH-4101 and transformers#41020
        kwargs["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}

    if issubclass(model_class.config_class, Qwen2_5_VLConfig):
        vision_config["out_hidden_size"] = 16
        # Different dict object at the config root; see GH-4101 and transformers#41020
        kwargs["num_hidden_layers"] = 2
        kwargs["hidden_size"] = 16
        kwargs["num_attention_heads"] = 4

    if issubclass(model_class.config_class, Idefics2Config):
        kwargs["perceiver_config"] = {"hidden_size": 16}

    if issubclass(model_class.config_class, Qwen3VLConfig):
        # So hasattr(config, "layer_types") is False
        # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L420
        del text_config["layer_types"]
        # "mrope_section" needs 3 elements: for dim, offset in enumerate((1, 2), start=1): mrope_section[dim]
        # See: https://github.com/huggingface/transformers/blob/fe5ca9ddaa07fac2872407e75c7a7661216ac956/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L361
        text_config["rope_scaling"] = {"mrope_interleaved": True, "mrope_section": [2, 2, 2], "rope_type": "default"}
        vision_config["depth"] = 2
        vision_config["out_hidden_size"] = 16

    if issubclass(model_class.config_class, Qwen3_5Config):
        # For tiny layer counts, default `layer_types` can end up with no full-attention layers (e.g. 2 layers and
        # default interval 4), which breaks Qwen3.5 dynamic cache logic. Keep one full-attention layer at the end.
        text_config["layer_types"] = ["linear_attention", "full_attention"]
        text_config["full_attention_interval"] = 2
        # Qwen3.5-VL vision config expects `depth`/`num_heads`, not `num_hidden_layers`/`num_attention_heads`.
        vision_config.pop("num_hidden_layers", None)
        vision_config.pop("num_attention_heads", None)
        vision_config.pop("num_key_value_heads", None)
        vision_config.pop("embed_dim", None)
        vision_config["depth"] = 2
        vision_config["num_heads"] = 4
        vision_config["intermediate_size"] = 32
        vision_config["out_hidden_size"] = 16

    if model_id == "llava-hf/llava-v1.6-mistral-7b-hf":
        # Hotfix: llava-hf/llava-v1.6-mistral-7b-hf mistakenly sets text_config.dtype to "bfloat16".
        # See https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf/discussions/46
        text_config["dtype"] = None

    if model_class is Gemma4ForConditionalGeneration:
        # Gemma4 rope validation fails when passing text_config as a dict, so we mutate the config directly.
        config = AutoConfig.from_pretrained(model_id)
        for k, v in text_config.items():
            setattr(config.text_config, k, v)
        for k, v in vision_config.items():
            setattr(config.vision_config, k, v)
        config.text_config.layer_types = ["sliding_attention", "full_attention"]
        config.text_config.num_kv_shared_layers = 0
        config.text_config.global_head_dim = 8
        config.text_config.hidden_size_per_layer_input = 16
        config.audio_config = None
    else:
        config = AutoConfig.from_pretrained(model_id, text_config=text_config, vision_config=vision_config, **kwargs)

    model = model_class(config).to(dtype=dtype)

    if issubclass(model_class.config_class, Qwen3_5Config):
        # Qwen3.5 models have some weights in float32, to mirror this in the tiny model we need to convert them to
        # float32 manually.
        for layer in model.model.language_model.layers:
            if hasattr(layer, "linear_attn"):  # applies to linear attention layers only
                layer.linear_attn.A_log.data = layer.linear_attn.A_log.data.float()
                layer.linear_attn.norm.weight.data = layer.linear_attn.norm.weight.data.float()

    push_to_hub(model, processor, generation_config, "tiny")

# PEFT models
# Only the PEFT-wrapped model is pushed: no tokenizer and no generation config are passed to `push_to_hub`, so
# neither is loaded here. The second iteration (suffix "2") is the same model, but with different weights.
for suffix in (None, "2"):
    model = Qwen3ForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM", dtype="auto")
    model = get_peft_model(model, LoraConfig())
    push_to_hub(model, None, None, "tiny", suffix)