|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import torch
|
| from huggingface_hub import HfApi, ModelCard
|
| from peft import LoraConfig, get_peft_model
|
| from torch import nn
|
| from transformers import (
|
| AutoConfig,
|
| AutoProcessor,
|
| AutoTokenizer,
|
| BartModel,
|
| Cohere2Config,
|
| Cohere2ForCausalLM,
|
| CohereConfig,
|
| CohereForCausalLM,
|
| DeepseekV3Config,
|
| DeepseekV3ForCausalLM,
|
| FalconMambaConfig,
|
| FalconMambaForCausalLM,
|
| Gemma2Config,
|
| Gemma2ForCausalLM,
|
| Gemma3ForConditionalGeneration,
|
| Gemma4ForConditionalGeneration,
|
| GemmaConfig,
|
| GemmaForCausalLM,
|
| GenerationConfig,
|
| Glm4MoeConfig,
|
| Glm4MoeForCausalLM,
|
| GPT2Config,
|
| GPT2LMHeadModel,
|
| GPTNeoXConfig,
|
| GPTNeoXForCausalLM,
|
| GPTNeoXForSequenceClassification,
|
| GptOssConfig,
|
| GptOssForCausalLM,
|
| Idefics2Config,
|
| Idefics2ForConditionalGeneration,
|
| Idefics3ForConditionalGeneration,
|
| InternVLForConditionalGeneration,
|
| LlamaConfig,
|
| LlamaForCausalLM,
|
| LlamaForSequenceClassification,
|
| LlavaForConditionalGeneration,
|
| LlavaNextForConditionalGeneration,
|
| MistralConfig,
|
| MistralForCausalLM,
|
| OPTConfig,
|
| OPTForCausalLM,
|
| PaliGemmaForConditionalGeneration,
|
| Phi3Config,
|
| Phi3ForCausalLM,
|
| Qwen2_5_VLConfig,
|
| Qwen2_5_VLForConditionalGeneration,
|
| Qwen2Config,
|
| Qwen2ForCausalLM,
|
| Qwen2ForSequenceClassification,
|
| Qwen2VLConfig,
|
| Qwen2VLForConditionalGeneration,
|
| Qwen3_5Config,
|
| Qwen3_5ForConditionalGeneration,
|
| Qwen3Config,
|
| Qwen3ForCausalLM,
|
| Qwen3ForSequenceClassification,
|
| Qwen3MoeConfig,
|
| Qwen3MoeForCausalLM,
|
| Qwen3MoeForSequenceClassification,
|
| Qwen3VLConfig,
|
| Qwen3VLForConditionalGeneration,
|
| SmolVLMForConditionalGeneration,
|
| T5ForConditionalGeneration,
|
| )
|
|
|
|
|
# Hub namespace used as the owner part of every repository id built by this script.
ORGANIZATION = "trl-internal-testing"

# Model card template; the `{model_class_name}` placeholder is filled in with `str.format`.
MODEL_CARD = """
---
library_name: transformers
tags: [trl]
---

# Tiny {model_class_name}

This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library.
"""


# Single Hub client shared by the whole script.
api = HfApi()
|
|
|
|
|
def push_to_hub(model, tokenizer, generation_config, prefix=None, suffix=None, force=False):
    """
    Publish a model, its model card and optional companion artifacts to the Hub.

    The target repository id is `{ORGANIZATION}/[{prefix}-]{ModelClassName}[-{suffix}]`, where the bracketed parts
    are only present when the corresponding argument is not `None`. If that repository already exists and `force`
    is falsy, a message is printed and nothing is pushed.

    Args:
        model: Object to publish; its class name is used in the repository id and in the model card.
        tokenizer: Tokenizer or processor pushed to the same repository, or `None` to skip it.
        generation_config: Generation config pushed to the same repository, or `None` to skip it.
        prefix: Optional string placed before the class name in the repository name.
        suffix: Optional string appended to the repository name.
        force: When truthy, push even if the repository already exists.
    """
    class_name = model.__class__.__name__
    # The card always carries the bare class name, without prefix or suffix.
    card = ModelCard(MODEL_CARD.format(model_class_name=class_name))

    repo_name = class_name if prefix is None else f"{prefix}-{class_name}"
    if suffix is not None:
        repo_name = f"{repo_name}-{suffix}"
    repo_id = f"{ORGANIZATION}/{repo_name}"

    already_published = api.repo_exists(repo_id)
    if already_published and not force:
        print(f"Model {repo_id} already exists, skipping")
        return

    model.push_to_hub(repo_id)
    card.push_to_hub(repo_id)
    # Companion artifacts are optional and pushed in this fixed order.
    for artifact in (tokenizer, generation_config):
        if artifact is not None:
            artifact.push_to_hub(repo_id)
|
|
|
|
|
def init_weights_tiny_model(model):
    """
    Initialize tiny test models to avoid NaNs from uninitialized weights.

    Uses safe defaults:
    - Linear/Conv1d: Xavier uniform (weights), zero (biases)
    - Embedding: Normal(0, 0.02)
    - LayerNorm: Ones (weights), zero (biases)

    Modules of any other type are left untouched. Parameters that are absent (`None`), such as the bias of a
    bias-free `Linear` or the weight and bias of a `LayerNorm` built with `elementwise_affine=False`, are skipped.

    Args:
        model: PyTorch model (modified in-place)
    """
    for module in model.modules():
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            nn.init.xavier_uniform_(module.weight)

        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

        elif isinstance(module, nn.LayerNorm):
            # With `elementwise_affine=False` the module has no learnable weight, so guard it like the bias.
            if module.weight is not None:
                nn.init.ones_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
|
|
|
|
|
|
|
# Decoder-only causal language models. Each entry is
# (source checkpoint for tokenizer and generation config, config class, model class, dtype, repo suffix).
for model_id, config_class, model_class, dtype, suffix in [
    ("CohereLabs/aya-expanse-8b", CohereConfig, CohereForCausalLM, torch.float16, None),
    ("CohereLabs/tiny-aya-earth", Cohere2Config, Cohere2ForCausalLM, torch.bfloat16, None),
    ("deepseek-ai/DeepSeek-R1", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, None),
    ("deepseek-ai/DeepSeek-R1-0528", DeepseekV3Config, DeepseekV3ForCausalLM, torch.bfloat16, "0528"),
    ("tiiuae/falcon-7b-instruct", FalconMambaConfig, FalconMambaForCausalLM, torch.bfloat16, None),
    ("google/gemma-2-2b-it", Gemma2Config, Gemma2ForCausalLM, torch.bfloat16, None),
    ("google/gemma-7b-it", GemmaConfig, GemmaForCausalLM, torch.bfloat16, None),
    ("openai-community/gpt2", GPT2Config, GPT2LMHeadModel, torch.float32, None),
    ("EleutherAI/pythia-14m", GPTNeoXConfig, GPTNeoXForCausalLM, torch.float16, None),
    ("meta-llama/Meta-Llama-3-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3"),
    ("meta-llama/Llama-3.1-8B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.1"),
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaConfig, LlamaForCausalLM, torch.bfloat16, "3.2"),
    ("mistralai/Mistral-7B-Instruct-v0.1", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.1"),
    ("mistralai/Mistral-7B-Instruct-v0.2", MistralConfig, MistralForCausalLM, torch.bfloat16, "0.2"),
    ("facebook/opt-1.3b", OPTConfig, OPTForCausalLM, torch.float16, None),
    ("microsoft/Phi-3-mini-4k-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3"),
    ("microsoft/Phi-3.5-mini-instruct", Phi3Config, Phi3ForCausalLM, torch.bfloat16, "3.5"),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5"),
    ("Qwen/Qwen2.5-Coder-0.5B", Qwen2Config, Qwen2ForCausalLM, torch.bfloat16, "2.5-Coder"),
    ("Qwen/Qwen3-8B", Qwen3Config, Qwen3ForCausalLM, torch.bfloat16, None),
]:
    # Qwen/Qwen3-8B is read from a pull-request ref; every other checkpoint is read from "main".
    revision = {"Qwen/Qwen3-8B": "refs/pr/14"}.get(model_id, "main")
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    generation_config = GenerationConfig.from_pretrained(model_id, revision=revision)

    # Only the vocabulary size comes from the real checkpoint; every other dimension is shrunk.
    tiny_dims = {
        "hidden_size": 8,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "num_hidden_layers": 2,
        "intermediate_size": 32,
    }
    config = config_class(vocab_size=len(tokenizer.vocab), **tiny_dims)

    model = model_class(config)
    model = model.to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, prefix="tiny", suffix=suffix)
|
|
|
|
|
# Mixture-of-experts causal language models.
for model_id, config_class, model_class, dtype, suffix in [
    ("Qwen/Qwen3-30B-A3B", Qwen3MoeConfig, Qwen3MoeForCausalLM, torch.bfloat16, None),
    ("openai/gpt-oss-20b", GptOssConfig, GptOssForCausalLM, torch.bfloat16, None),
    ("zai-org/GLM-4.5", Glm4MoeConfig, Glm4MoeForCausalLM, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)

    # The config argument that receives the expert count (always 4) differs per checkpoint.
    expert_count_field = {
        "zai-org/GLM-4.5": "n_routed_experts",
        "Qwen/Qwen3-30B-A3B": "num_experts",
        "openai/gpt-oss-20b": "num_local_experts",
    }.get(model_id)
    kwargs = {} if expert_count_field is None else {expert_count_field: 4}

    config = config_class(
        vocab_size=len(tokenizer.vocab),
        hidden_size=8,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
        num_experts_per_tok=2,
        **kwargs,
    )
    model = model_class(config)
    model = model.to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, prefix="tiny", suffix=suffix)
|
|
|
|
|
# "small" models: same recipe as the tiny causal LMs but with hidden_size=128, always in bfloat16.
# `init_weights_tiny_model` is not applied here; the weights are pushed as the model constructor leaves them.
for model_id, config_class, model_class, suffix in [
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2Config, Qwen2ForCausalLM, "2.5"),
    ("Qwen/Qwen3-4B", Qwen3Config, Qwen3ForCausalLM, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)
    config = config_class(
        vocab_size=len(tokenizer.vocab),
        hidden_size=128,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
    )
    model = model_class(config).to(dtype=torch.bfloat16)
    push_to_hub(model, tokenizer, generation_config, "small", suffix)
|
|
|
|
|
# Sequence-classification models with a single label.
for model_id, model_class, dtype, suffix in [
    ("EleutherAI/pythia-14m", GPTNeoXForSequenceClassification, torch.bfloat16, None),
    ("meta-llama/Llama-3.2-1B-Instruct", LlamaForSequenceClassification, torch.bfloat16, "3.2"),
    ("Qwen/Qwen2.5-32B-Instruct", Qwen2ForSequenceClassification, torch.bfloat16, "2.5"),
    ("Qwen/Qwen3-4B", Qwen3ForSequenceClassification, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Start from the checkpoint's own config and override only the size-related fields.
    kwargs = dict(
        num_labels=1,
        hidden_size=16,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
    )
    config = AutoConfig.from_pretrained(model_id, **kwargs)

    # For the two Qwen checkpoints, keep only the first two `layer_types` entries
    # (presumably so the list matches num_hidden_layers=2 -- confirm against transformers).
    if model_id in {"Qwen/Qwen2.5-32B-Instruct", "Qwen/Qwen3-4B"}:
        config.layer_types = config.layer_types[:2]

    model = model_class(config)
    model = model.to(dtype=dtype)
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, generation_config, prefix="tiny", suffix=suffix)
|
|
|
|
|
# Mixture-of-experts sequence-classification models with a single label.
# `init_weights_tiny_model` is not applied here; the weights are pushed as the model constructor leaves them.
for model_id, model_class, dtype, suffix in [
    ("Qwen/Qwen3-30B-A3B", Qwen3MoeForSequenceClassification, torch.bfloat16, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Start from the checkpoint's own config and override the size- and expert-related fields.
    kwargs = dict(
        num_labels=1,
        hidden_size=16,
        num_attention_heads=4,
        num_key_value_heads=2,
        num_hidden_layers=2,
        intermediate_size=32,
        num_experts=4,
        num_experts_per_tok=2,
    )
    config = AutoConfig.from_pretrained(model_id, **kwargs)

    model = model_class(config)
    model = model.to(dtype=dtype)
    push_to_hub(model, tokenizer, generation_config, prefix="tiny", suffix=suffix)
|
|
|
|
|
|
|
# Encoder-decoder models: reuse the checkpoint's config and shrink only `d_model`.
for model_id, model_class, dtype, suffix in [
    ("facebook/bart-base", BartModel, torch.float32, None),
    ("google/flan-t5-small", T5ForConditionalGeneration, torch.float32, None),
]:
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # No generation config is fetched (or pushed) for facebook/bart-base.
    if model_id == "facebook/bart-base":
        generation_config = None
    else:
        generation_config = GenerationConfig.from_pretrained(model_id)

    config = AutoConfig.from_pretrained(model_id)
    config.d_model = 24

    model = model_class(config)
    model = model.to(dtype=dtype)
    push_to_hub(model, tokenizer, generation_config, prefix="tiny", suffix=suffix)
|
|
|
|
|
|
|
# Vision-language models. A processor is pushed in place of a tokenizer, and the repository has no suffix.
for model_id, model_class, dtype in [
    ("google/gemma-3-4b-it", Gemma3ForConditionalGeneration, torch.bfloat16),
    ("google/gemma-4-E2B-it", Gemma4ForConditionalGeneration, torch.bfloat16),
    ("google/paligemma-3b-pt-224", PaliGemmaForConditionalGeneration, torch.float32),
    ("HuggingFaceM4/idefics2-8b", Idefics2ForConditionalGeneration, torch.float32),
    ("HuggingFaceM4/Idefics3-8B-Llama3", Idefics3ForConditionalGeneration, torch.bfloat16),
    ("HuggingFaceTB/SmolVLM2-2.2B-Instruct", SmolVLMForConditionalGeneration, torch.float32),
    ("llava-hf/llava-1.5-7b-hf", LlavaForConditionalGeneration, torch.float16),
    ("llava-hf/llava-v1.6-mistral-7b-hf", LlavaNextForConditionalGeneration, torch.bfloat16),
    ("OpenGVLab/InternVL3-8B-hf", InternVLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen2-VL-2B-Instruct", Qwen2VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen2.5-VL-3B-Instruct", Qwen2_5_VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen3-VL-2B-Instruct", Qwen3VLForConditionalGeneration, torch.bfloat16),
    ("Qwen/Qwen3.5-0.8B", Qwen3_5ForConditionalGeneration, torch.bfloat16),
]:
    processor = AutoProcessor.from_pretrained(model_id)

    # Qwen/Qwen3.5-0.8B is the one checkpoint for which no generation config is fetched (or pushed).
    if model_id == "Qwen/Qwen3.5-0.8B":
        generation_config = None
    else:
        generation_config = GenerationConfig.from_pretrained(model_id)

    # Baseline overrides for the text and vision sub-configs; the per-architecture branches below adjust them.
    text_config = {
        "num_hidden_layers": 2,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "layer_types": None,
    }
    vision_config = {
        "num_hidden_layers": 2,
        "hidden_size": 16,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "embed_dim": 64,
    }
    # Extra overrides applied at the root of the config.
    kwargs = {}

    # The branches are independent `if`s (not `elif`), so more than one may apply to a given config class.
    vlm_config_class = model_class.config_class

    if issubclass(vlm_config_class, (Qwen2VLConfig, Qwen2_5_VLConfig)):
        text_config["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}
        vision_config["depth"] = 2
        # The root-level value is a second, separate dict, not the object stored in `text_config`.
        kwargs["rope_scaling"] = {"type": "default", "mrope_section": [1, 1], "rope_type": "default"}

    if issubclass(vlm_config_class, Qwen2_5_VLConfig):
        vision_config["out_hidden_size"] = 16
        # The text dimensions are repeated at the root of the config for this architecture.
        kwargs.update(num_hidden_layers=2, hidden_size=16, num_attention_heads=4)

    if issubclass(vlm_config_class, Idefics2Config):
        kwargs["perceiver_config"] = {"hidden_size": 16}

    if issubclass(vlm_config_class, Qwen3VLConfig):
        # Drop the key entirely instead of passing `layer_types=None`.
        del text_config["layer_types"]
        text_config["rope_scaling"] = {"mrope_interleaved": True, "mrope_section": [2, 2, 2], "rope_type": "default"}
        vision_config["depth"] = 2
        vision_config["out_hidden_size"] = 16

    if issubclass(vlm_config_class, Qwen3_5Config):
        # One layer of each type for the two-layer text model.
        text_config["layer_types"] = ["linear_attention", "full_attention"]
        text_config["full_attention_interval"] = 2

        # Replace the generic vision keys with the names used in this branch.
        for generic_key in ("num_hidden_layers", "num_attention_heads", "num_key_value_heads", "embed_dim"):
            vision_config.pop(generic_key, None)
        vision_config.update(depth=2, num_heads=4, intermediate_size=32, out_hidden_size=16)

    if model_id == "llava-hf/llava-v1.6-mistral-7b-hf":
        # Explicitly reset the text config's dtype for this checkpoint.
        text_config["dtype"] = None

    if model_class is Gemma4ForConditionalGeneration:
        # Gemma 4: load the full config first, then patch the sub-configs attribute by attribute.
        config = AutoConfig.from_pretrained(model_id)
        for field_name, field_value in text_config.items():
            setattr(config.text_config, field_name, field_value)
        for field_name, field_value in vision_config.items():
            setattr(config.vision_config, field_name, field_value)
        # Assigned after the loop above, so this replaces the `layer_types=None` written by it.
        config.text_config.layer_types = ["sliding_attention", "full_attention"]
        config.text_config.num_kv_shared_layers = 0
        config.text_config.global_head_dim = 8
        config.text_config.hidden_size_per_layer_input = 16
        config.audio_config = None
    else:
        config = AutoConfig.from_pretrained(model_id, text_config=text_config, vision_config=vision_config, **kwargs)

    model = model_class(config)
    model = model.to(dtype=dtype)

    if issubclass(vlm_config_class, Qwen3_5Config):
        # After the cast to `dtype`, convert these two linear-attention tensors to float32.
        for layer in model.model.language_model.layers:
            if hasattr(layer, "linear_attn"):
                linear_attn = layer.linear_attn
                linear_attn.A_log.data = linear_attn.A_log.data.float()
                linear_attn.norm.weight.data = linear_attn.norm.weight.data.float()

    push_to_hub(model, processor, generation_config, prefix="tiny")
|
|
|
|
|
# PEFT models: wrap the tiny Qwen3 causal LM in a default LoRA adapter and publish it twice,
# once without a suffix and once with the "2" suffix.
# Only the wrapped model is pushed (tokenizer and generation config are passed as None), so no
# generation config is fetched from the Hub here.
for suffix in (None, "2"):
    # Reload the base model on each pass so every pushed model wraps its own fresh instance.
    model = Qwen3ForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM", dtype="auto")
    model = get_peft_model(model, LoraConfig())
    push_to_hub(model, None, None, "tiny", suffix)
|
|
|