---
language:
- ru
- en
license: mit
tags:
- deepseek_v3
- gigachat3
- testing
- tiny
library_name: transformers
---

# Tiny model creation script:

```python
"""
Create a tiny GigaChat3 model for testing.

GigaChat3 uses DeepseekV3Config (no text_config/vision_config sub-objects).
Key constraint: qk_head_dim == qk_nope_head_dim + qk_rope_head_dim
"""
import json
import os

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "ai-sage/GigaChat3-10B-A1.8B-bf16"
output_dir = "./tiny-gigachat3"

# Shrink every capacity knob of the MoE transformer to near-minimal values.
config = AutoConfig.from_pretrained(model_id)
config.num_hidden_layers = 2
config.num_attention_heads = 2
config.num_key_value_heads = 2
config.hidden_size = 32
config.intermediate_size = 64
config.moe_intermediate_size = 32
config.n_routed_experts = 4
config.n_shared_experts = 1
config.num_experts_per_tok = 2
config.kv_lora_rank = 8
config.q_lora_rank = None

# Attention head dims — MUST satisfy: qk_head_dim == qk_nope_head_dim + qk_rope_head_dim
config.qk_nope_head_dim = 4
config.qk_rope_head_dim = 2
config.qk_head_dim = 6  # 4 + 2
config.v_head_dim = 4
config.head_dim = config.qk_rope_head_dim  # used by RoPE

TINY_VOCAB = 32000
config.vocab_size = TINY_VOCAB

# Explicit check instead of `assert` so it is not stripped under `python -O`.
if config.qk_head_dim != config.qk_nope_head_dim + config.qk_rope_head_dim:
    raise ValueError(
        f"qk_head_dim ({config.qk_head_dim}) must equal "
        f"qk_nope_head_dim + qk_rope_head_dim "
        f"({config.qk_nope_head_dim} + {config.qk_rope_head_dim})"
    )

os.makedirs(output_dir, exist_ok=True)
model = AutoModelForCausalLM.from_config(config)  # random weights, tiny shapes
model.save_pretrained(output_dir)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_dir)

# Prune tokenizer.json so every token id fits inside the tiny vocab.
tok_path = os.path.join(output_dir, "tokenizer.json")
with open(tok_path, encoding="utf-8") as f:
    tok_data = json.load(f)

if "model" in tok_data and "vocab" in tok_data["model"]:
    tok_data["model"]["vocab"] = {
        k: v for k, v in tok_data["model"]["vocab"].items() if v < TINY_VOCAB
    }
    # Merges may reference pruned tokens; drop them all for the test tokenizer.
    tok_data["model"]["merges"] = []

if "added_tokens" in tok_data:
    tok_data["added_tokens"] = [t for t in tok_data["added_tokens"] if t["id"] < TINY_VOCAB]

with open(tok_path, "w", encoding="utf-8") as f:
    json.dump(tok_data, f, ensure_ascii=False)

# ── Smoke test ───────────────────────────────────────────────────────────────
# BUGFIX: reload the *pruned* tokenizer from disk. The in-memory `tokenizer`
# still holds the full vocab and can emit ids >= TINY_VOCAB, which would index
# past the tiny embedding table and crash (or silently corrupt) the forward.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
tokens = tokenizer("Hello world", return_tensors="pt")
tokens.pop("token_type_ids", None)  # forward() does not accept this key
with torch.no_grad():
    out = model(**tokens)

total_mb = sum(os.path.getsize(os.path.join(output_dir, fn)) for fn in os.listdir(output_dir)) / 1e6
print(f"shape={out.logits.shape} params={sum(p.numel() for p in model.parameters())/1e6:.2f}M size={total_mb:.1f} MB")
```