Update config.json
config.json CHANGED  (+33 -2)

@@ -9,6 +9,36 @@
   "hidden_size": 3584,
   "initializer_range": 0.02,
   "intermediate_size": 18944,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
   "max_position_embeddings": 131072,
   "max_window_layers": 28,
   "model_type": "qwen2",
@@ -16,11 +46,12 @@
   "num_hidden_layers": 28,
   "num_key_value_heads": 4,
   "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
   "rope_theta": 10000,
-  "sliding_window":
+  "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.55.0",
   "use_cache": true,
   "use_mrope": false,
   "use_sliding_window": false,