Training in progress, step 5800

Browse files

Files changed (5) hide show

README.md +3 -3
config.json +85 -54
model-00001-of-00002.safetensors +1 -1
model-00002-of-00002.safetensors +1 -1
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-base_model: google/gemma-3-1b-it
 library_name: transformers
 model_name: nmt_21
 tags:
@@ -11,7 +11,7 @@ licence: license
 # Model Card for nmt_21
-This model is a fine-tuned version of [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/oleg-dats/nmt/runs/1oefvyo4)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ---
+base_model: google/gemma-3-4b-it
 library_name: transformers
 model_name: nmt_21
 tags:
 # Model Card for nmt_21
+This model is a fine-tuned version of [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/oleg-dats/nmt/runs/yn1sslos)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

config.json CHANGED Viewed

@@ -1,64 +1,95 @@
 {
-  "_sliding_window_pattern": 6,
   "architectures": [
-    "Gemma3ForCausalLM"
   ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "attn_logit_softcapping": null,
   "bos_token_id": 2,
-  "cache_implementation": "hybrid",
   "dtype": "bfloat16",
   "eos_token_id": 1,
-  "final_logit_softcapping": null,
-  "head_dim": 256,
-  "hidden_activation": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
   "initializer_range": 0.02,
-  "intermediate_size": 6912,
-  "layer_types": [
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "sliding_attention"
-  ],
-  "max_position_embeddings": 32768,
-  "model_type": "gemma3_text",
-  "num_attention_heads": 4,
-  "num_hidden_layers": 26,
-  "num_key_value_heads": 1,
   "pad_token_id": 0,
-  "query_pre_attn_scalar": 256,
-  "rms_norm_eps": 1e-06,
-  "rope_local_base_freq": 10000,
-  "rope_scaling": null,
-  "rope_theta": 1000000,
-  "sliding_window": 512,
-  "sliding_window_pattern": 6,
   "transformers_version": "4.57.0",
-  "use_bidirectional_attention": false,
-  "use_cache": true,
-  "vocab_size": 262144
 }

 {
   "architectures": [
+    "Gemma3ForConditionalGeneration"
   ],
+  "boi_token_index": 255999,
   "bos_token_id": 2,
   "dtype": "bfloat16",
+  "eoi_token_index": 256000,
   "eos_token_id": 1,
+  "image_token_index": 262144,
   "initializer_range": 0.02,
+  "mm_tokens_per_image": 256,
+  "model_type": "gemma3",
   "pad_token_id": 0,
+  "text_config": {
+    "_sliding_window_pattern": 6,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 34,
+    "num_key_value_heads": 4,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": {
+      "factor": 8.0,
+      "rope_type": "linear"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": 1024,
+    "use_bidirectional_attention": false,
+    "use_cache": true,
+    "vocab_size": 262208
+  },
   "transformers_version": "4.57.0",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "vision_use_head": false
+  }
 }

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e904c326c0afb55cce8a33df889a4ceab24ea9c9a586a0c3feb7febf834b2a4f
 size 4961251752

 version https://git-lfs.github.com/spec/v1
+oid sha256:6a653ce67ccca35b508e079bb5328530c791b3e71f267b172865041b6b04e88e
 size 4961251752

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce36aaeda348a869c066fbeb88dd662af65a0cef6445c6a10e6fca78bc27f7b5
 size 3639026128

 version https://git-lfs.github.com/spec/v1
+oid sha256:24d395bd995ee0eb269aab4db2c4197c2515af165dc9f1c94bbc3314f2dace51
 size 3639026128

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5dcc949cc10c8ffc858cb199cdb4ab6d028edbb9888ede489a27d7d06d3733dd
 size 6840

 version https://git-lfs.github.com/spec/v1
+oid sha256:54eac288f57ef58be29ddb0e8adea9bc712650afeaf8d7d4f42255c44a942cbb
 size 6840