shumi2011 committed on 20 days ago

Commit

8c408f0

verified ·

1 Parent(s): f50e1cd

Upload LoRA adapter và processor sau khi train

Browse files

Files changed (20) hide show

.gitattributes +1 -0
README.md +1 -1
adapter_config.json +4 -4
adapter_model.safetensors +1 -1
checkpoint-563/README.md +209 -0
checkpoint-563/adapter_config.json +46 -0
checkpoint-563/adapter_model.safetensors +3 -0
checkpoint-563/chat_template.jinja +47 -0
checkpoint-563/optimizer.pt +3 -0
checkpoint-563/processor_config.json +28 -0
checkpoint-563/rng_state.pth +3 -0
checkpoint-563/scheduler.pt +3 -0
checkpoint-563/tokenizer.json +3 -0
checkpoint-563/tokenizer_config.json +24 -0
checkpoint-563/trainer_state.json +605 -0
checkpoint-563/training_args.bin +3 -0
runs/Mar31_04-20-31_242ca503c882/events.out.tfevents.1774930831.242ca503c882.3260.0 +3 -0
runs/Mar31_06-15-17_242ca503c882/events.out.tfevents.1774937717.242ca503c882.32504.0 +3 -0
tokenizer_config.json +24 -0
training_args.bin +2 -2

.gitattributes CHANGED Viewed

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-402/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-402/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-563/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -39,7 +39,7 @@ This model was trained with SFT.
 ### Framework versions
 - PEFT 0.18.1
-- TRL: 0.29.1
 - Transformers: 5.4.0
 - Pytorch: 2.10.0+cu128
 - Datasets: 4.8.4

 ### Framework versions
 - PEFT 0.18.1
+- TRL: 1.0.0
 - Transformers: 5.4.0
 - Pytorch: 2.10.0+cu128
 - Datasets: 4.8.4

adapter_config.json CHANGED Viewed

@@ -30,12 +30,12 @@
   "revision": null,
   "target_modules": [
     "k_proj",
-    "o_proj",
-    "down_proj",
-    "q_proj",
     "gate_proj",
     "up_proj",
-    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "revision": null,
   "target_modules": [
     "k_proj",
     "gate_proj",
+    "v_proj",
     "up_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:43a8ea11e2eceebf2c89454ce0ca1845dcfed115a1cfc69b7d8be2fa14e5c8cc
 size 65675408

 version https://git-lfs.github.com/spec/v1
+oid sha256:4cb55cb6b8228cc478ad98d9207b252096003053d7d2de477994351d55ef57a0
 size 65675408

checkpoint-563/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoint-563/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-563/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cb55cb6b8228cc478ad98d9207b252096003053d7d2de477994351d55ef57a0
+size 65675408

checkpoint-563/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

checkpoint-563/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:147ecdea7929fde43e244c73dd544abb2ed359ded440d922e8dee6f5dc3009fe
+size 119618359

checkpoint-563/processor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "image_processor": {
+    "do_convert_rgb": null,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "image_processor_type": "Gemma3ImageProcessor",
+    "image_seq_length": 256,
+    "image_std": [
+      0.5,
+      0.5,
+      0.5
+    ],
+    "resample": 2,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "height": 896,
+      "width": 896
+    }
+  },
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

checkpoint-563/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3cd3621b8b24d40d23ff41931c6ba8079e1bd59ff5085e42505384d72c06a13
+size 14709

checkpoint-563/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fae9377d103c0880629be5550a0dfd948a5c6f4777668a5c22004bc9daed5d9f
+size 1465

checkpoint-563/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:daab2354f8a74e70d70b4d1f804939b68a8c9624dd06cb7858e52dd8970e9726
+size 33384567

checkpoint-563/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "backend": "tokenizers",
+  "boi_token": "<start_of_image>",
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eoi_token": "<end_of_image>",
+  "eos_token": "<eos>",
+  "image_token": "<image_soft_token>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {
+    "boi_token": "<start_of_image>",
+    "eoi_token": "<end_of_image>",
+    "image_token": "<image_soft_token>"
+  },
+  "pad_token": "<pad>",
+  "processor_class": "Gemma3Processor",
+  "sp_model_kwargs": null,
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-563/trainer_state.json ADDED Viewed

	@@ -0,0 +1,605 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 563,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2997896295040845,
+      "epoch": 0.017777777777777778,
+      "grad_norm": 1.3046875,
+      "learning_rate": 1.6071428571428572e-05,
+      "loss": 5.099749374389648,
+      "mean_token_accuracy": 0.4248364338651299,
+      "num_tokens": 15791.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.6972164057195187,
+      "epoch": 0.035555555555555556,
+      "grad_norm": 0.333984375,
+      "learning_rate": 3.392857142857143e-05,
+      "loss": 4.261620330810547,
+      "mean_token_accuracy": 0.42575850784778596,
+      "num_tokens": 30788.0,
+      "step": 20
+    },
+    {
+      "entropy": 2.371405544877052,
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 5.1785714285714296e-05,
+      "loss": 3.410958480834961,
+      "mean_token_accuracy": 0.454820466786623,
+      "num_tokens": 45784.0,
+      "step": 30
+    },
+    {
+      "entropy": 2.792352019995451,
+      "epoch": 0.07111111111111111,
+      "grad_norm": 0.388671875,
+      "learning_rate": 6.964285714285715e-05,
+      "loss": 2.760978126525879,
+      "mean_token_accuracy": 0.5088610924780369,
+      "num_tokens": 62673.0,
+      "step": 40
+    },
+    {
+      "entropy": 2.2824966341257094,
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.35546875,
+      "learning_rate": 8.75e-05,
+      "loss": 2.191845512390137,
+      "mean_token_accuracy": 0.5701939344406128,
+      "num_tokens": 76957.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.8100679598748683,
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.31640625,
+      "learning_rate": 9.999136119166803e-05,
+      "loss": 1.7198040008544921,
+      "mean_token_accuracy": 0.650282097607851,
+      "num_tokens": 90947.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.237974463403225,
+      "epoch": 0.12444444444444444,
+      "grad_norm": 0.337890625,
+      "learning_rate": 9.983786540671051e-05,
+      "loss": 1.1844627380371093,
+      "mean_token_accuracy": 0.7430308632552624,
+      "num_tokens": 105311.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.8024648647755385,
+      "epoch": 0.14222222222222222,
+      "grad_norm": 0.466796875,
+      "learning_rate": 9.949307432339625e-05,
+      "loss": 0.8033592224121093,
+      "mean_token_accuracy": 0.8211114652454853,
+      "num_tokens": 120739.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.6481792986392975,
+      "epoch": 0.16,
+      "grad_norm": 0.3515625,
+      "learning_rate": 9.895831137146318e-05,
+      "loss": 0.6663334846496582,
+      "mean_token_accuracy": 0.8503929443657399,
+      "num_tokens": 135550.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.5209066523239017,
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.220703125,
+      "learning_rate": 9.82356291596578e-05,
+      "loss": 0.5501353740692139,
+      "mean_token_accuracy": 0.8712642557919026,
+      "num_tokens": 150124.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.4373117024078965,
+      "epoch": 0.19555555555555557,
+      "grad_norm": 0.267578125,
+      "learning_rate": 9.732780159709912e-05,
+      "loss": 0.3997534513473511,
+      "mean_token_accuracy": 0.9081158190965652,
+      "num_tokens": 164899.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.36014524921774865,
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.185546875,
+      "learning_rate": 9.623831324603754e-05,
+      "loss": 0.3664620161056519,
+      "mean_token_accuracy": 0.9099370762705803,
+      "num_tokens": 180128.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.29999935105443,
+      "epoch": 0.2311111111111111,
+      "grad_norm": 0.255859375,
+      "learning_rate": 9.497134594687634e-05,
+      "loss": 0.2984978437423706,
+      "mean_token_accuracy": 0.9242603577673435,
+      "num_tokens": 195976.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.31300447108224033,
+      "epoch": 0.24888888888888888,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 9.353176276679396e-05,
+      "loss": 0.32675857543945314,
+      "mean_token_accuracy": 0.9216666355729103,
+      "num_tokens": 211055.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.2710464348085225,
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.2578125,
+      "learning_rate": 9.192508933357753e-05,
+      "loss": 0.2727458715438843,
+      "mean_token_accuracy": 0.9310060679912567,
+      "num_tokens": 227341.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.30849818857386707,
+      "epoch": 0.28444444444444444,
+      "grad_norm": 0.271484375,
+      "learning_rate": 9.015749262631536e-05,
+      "loss": 0.30527050495147706,
+      "mean_token_accuracy": 0.9228560633957386,
+      "num_tokens": 242924.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.282581620849669,
+      "epoch": 0.3022222222222222,
+      "grad_norm": 0.3828125,
+      "learning_rate": 8.823575730435693e-05,
+      "loss": 0.2990081548690796,
+      "mean_token_accuracy": 0.9226435236632824,
+      "num_tokens": 256777.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.2864753663539886,
+      "epoch": 0.32,
+      "grad_norm": 0.35546875,
+      "learning_rate": 8.616725966539832e-05,
+      "loss": 0.3073178768157959,
+      "mean_token_accuracy": 0.9258665904402733,
+      "num_tokens": 272913.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.26701974822208285,
+      "epoch": 0.3377777777777778,
+      "grad_norm": 0.2578125,
+      "learning_rate": 8.395993933265101e-05,
+      "loss": 0.261165452003479,
+      "mean_token_accuracy": 0.9338324561715126,
+      "num_tokens": 287769.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.23425031434744598,
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 8.162226877976887e-05,
+      "loss": 0.21241703033447265,
+      "mean_token_accuracy": 0.9439801961183548,
+      "num_tokens": 303589.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.26449646847322583,
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.271484375,
+      "learning_rate": 7.916322081050709e-05,
+      "loss": 0.27304508686065676,
+      "mean_token_accuracy": 0.9296576961874962,
+      "num_tokens": 317660.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.15648896424099804,
+      "epoch": 0.39111111111111113,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 7.659223411793798e-05,
+      "loss": 0.15494911670684813,
+      "mean_token_accuracy": 0.9541799262166023,
+      "num_tokens": 333271.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.21079173348844052,
+      "epoch": 0.4088888888888889,
+      "grad_norm": 0.283203125,
+      "learning_rate": 7.391917705541927e-05,
+      "loss": 0.22306714057922364,
+      "mean_token_accuracy": 0.9384914793074131,
+      "num_tokens": 347985.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.2048337606713176,
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.126953125,
+      "learning_rate": 7.115430975837457e-05,
+      "loss": 0.19801312685012817,
+      "mean_token_accuracy": 0.9438701763749122,
+      "num_tokens": 363032.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.22145606707781554,
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.115234375,
+      "learning_rate": 6.830824476227646e-05,
+      "loss": 0.21796202659606934,
+      "mean_token_accuracy": 0.9396323539316654,
+      "num_tokens": 377070.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.22976100631058216,
+      "epoch": 0.4622222222222222,
+      "grad_norm": 0.21484375,
+      "learning_rate": 6.539190626799366e-05,
+      "loss": 0.21554169654846192,
+      "mean_token_accuracy": 0.938993276655674,
+      "num_tokens": 392460.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.16580691728740932,
+      "epoch": 0.48,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 6.241648821085666e-05,
+      "loss": 0.16170507669448853,
+      "mean_token_accuracy": 0.9502397567033768,
+      "num_tokens": 408721.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.17671420807018876,
+      "epoch": 0.49777777777777776,
+      "grad_norm": 0.2890625,
+      "learning_rate": 5.939341129438739e-05,
+      "loss": 0.19148651361465455,
+      "mean_token_accuracy": 0.9441343322396278,
+      "num_tokens": 423511.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.19226484168320895,
+      "epoch": 0.5155555555555555,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 5.633427915361261e-05,
+      "loss": 0.1809281587600708,
+      "mean_token_accuracy": 0.9457803666591644,
+      "num_tokens": 438350.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.1846911649219692,
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 5.325083381622165e-05,
+      "loss": 0.19349638223648072,
+      "mean_token_accuracy": 0.9445783741772175,
+      "num_tokens": 454504.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.19011163590475916,
+      "epoch": 0.5511111111111111,
+      "grad_norm": 0.146484375,
+      "learning_rate": 5.01549106325243e-05,
+      "loss": 0.1816372752189636,
+      "mean_token_accuracy": 0.9470942720770836,
+      "num_tokens": 469888.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.21089051999151706,
+      "epoch": 0.5688888888888889,
+      "grad_norm": 0.142578125,
+      "learning_rate": 4.705839284720376e-05,
+      "loss": 0.20321893692016602,
+      "mean_token_accuracy": 0.9423739515244961,
+      "num_tokens": 484515.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.15668439203873277,
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.099609375,
+      "learning_rate": 4.397316598723385e-05,
+      "loss": 0.152730131149292,
+      "mean_token_accuracy": 0.954561373591423,
+      "num_tokens": 500366.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.19265495147556067,
+      "epoch": 0.6044444444444445,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 4.0911072241036194e-05,
+      "loss": 0.18428765535354613,
+      "mean_token_accuracy": 0.9454522147774697,
+      "num_tokens": 514445.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.14791145911440254,
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.126953125,
+      "learning_rate": 3.788386500398583e-05,
+      "loss": 0.15320039987564088,
+      "mean_token_accuracy": 0.9516465291380882,
+      "num_tokens": 529805.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.14857742255553602,
+      "epoch": 0.64,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 3.49031637647361e-05,
+      "loss": 0.1594814896583557,
+      "mean_token_accuracy": 0.9535981945693492,
+      "num_tokens": 545883.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.2208730909973383,
+      "epoch": 0.6577777777777778,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 3.1980409505524544e-05,
+      "loss": 0.19757508039474486,
+      "mean_token_accuracy": 0.9436426095664501,
+      "num_tokens": 559836.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.1493451208807528,
+      "epoch": 0.6755555555555556,
+      "grad_norm": 0.12109375,
+      "learning_rate": 2.91268207876494e-05,
+      "loss": 0.15581512451171875,
+      "mean_token_accuracy": 0.9540682502090931,
+      "num_tokens": 575627.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.13959282217547297,
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 2.635335069067617e-05,
+      "loss": 0.12903414964675902,
+      "mean_token_accuracy": 0.9592652179300785,
+      "num_tokens": 590314.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.16119700381532312,
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 2.367064477065652e-05,
+      "loss": 0.14327847957611084,
+      "mean_token_accuracy": 0.9542674884200096,
+      "num_tokens": 605721.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.151368910074234,
+      "epoch": 0.7288888888888889,
+      "grad_norm": 0.1484375,
+      "learning_rate": 2.108900019873103e-05,
+      "loss": 0.14610207080841064,
+      "mean_token_accuracy": 0.9552296213805676,
+      "num_tokens": 621020.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.14151866482570769,
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.11328125,
+      "learning_rate": 1.8618326236955907e-05,
+      "loss": 0.13566198348999023,
+      "mean_token_accuracy": 0.9568316303193569,
+      "num_tokens": 636597.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.1339370148256421,
+      "epoch": 0.7644444444444445,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 1.626810620306163e-05,
+      "loss": 0.1301966667175293,
+      "mean_token_accuracy": 0.957643074542284,
+      "num_tokens": 652071.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.15190135017037393,
+      "epoch": 0.7822222222222223,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 1.4047361070135995e-05,
+      "loss": 0.1389269709587097,
+      "mean_token_accuracy": 0.9543100669980049,
+      "num_tokens": 668341.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.18330826219171287,
+      "epoch": 0.8,
+      "grad_norm": 0.24609375,
+      "learning_rate": 1.1964614840949002e-05,
+      "loss": 0.18672417402267455,
+      "mean_token_accuracy": 0.9420441940426827,
+      "num_tokens": 683688.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.17232957119122147,
+      "epoch": 0.8177777777777778,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 1.0027861829824952e-05,
+      "loss": 0.16492395401000975,
+      "mean_token_accuracy": 0.9488118067383766,
+      "num_tokens": 699667.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.15511126685887575,
+      "epoch": 0.8355555555555556,
+      "grad_norm": 0.11328125,
+      "learning_rate": 8.244535977645585e-06,
+      "loss": 0.15686993598937987,
+      "mean_token_accuracy": 0.9512658596038819,
+      "num_tokens": 713516.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.1436025496572256,
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 6.621482317764105e-06,
+      "loss": 0.13364784717559813,
+      "mean_token_accuracy": 0.9571717426180839,
+      "num_tokens": 729126.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.1506779951043427,
+      "epoch": 0.8711111111111111,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 5.164930702353782e-06,
+      "loss": 0.14193692207336425,
+      "mean_token_accuracy": 0.9549389965832233,
+      "num_tokens": 744962.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.1640948430635035,
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.3046875,
+      "learning_rate": 3.880471890038967e-06,
+      "loss": 0.16054811477661132,
+      "mean_token_accuracy": 0.9514626495540142,
+      "num_tokens": 759678.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.16496095675975084,
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.234375,
+      "learning_rate": 2.7730360865923956e-06,
+      "loss": 0.16119725704193116,
+      "mean_token_accuracy": 0.9486441940069199,
+      "num_tokens": 774383.0,
+      "step": 510
+    },
+    {
+      "entropy": 0.17073705019429325,
+      "epoch": 0.9244444444444444,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 1.8468740210672076e-06,
+      "loss": 0.1567433476448059,
+      "mean_token_accuracy": 0.950737326592207,
+      "num_tokens": 788196.0,
+      "step": 520
+    },
+    {
+      "entropy": 0.14311794554814697,
+      "epoch": 0.9422222222222222,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 1.1055406300002347e-06,
+      "loss": 0.13767447471618652,
+      "mean_token_accuracy": 0.9572608590126037,
+      "num_tokens": 803510.0,
+      "step": 530
+    },
+    {
+      "entropy": 0.1503830960020423,
+      "epoch": 0.96,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 5.518814123121885e-07,
+      "loss": 0.13956043720245362,
+      "mean_token_accuracy": 0.95562659278512,
+      "num_tokens": 819096.0,
+      "step": 540
+    },
+    {
+      "entropy": 0.14491902142763138,
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.134765625,
+      "learning_rate": 1.8802150727962876e-07,
+      "loss": 0.13078807592391967,
+      "mean_token_accuracy": 0.9562703162431717,
+      "num_tokens": 834121.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.1624174129217863,
+      "epoch": 0.9955555555555555,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 1.5357537501159423e-08,
+      "loss": 0.15792603492736818,
+      "mean_token_accuracy": 0.9509337961673736,
+      "num_tokens": 850350.0,
+      "step": 560
+    },
+    {
+      "epoch": 1.0,
+      "eval_entropy": 0.17035142924636604,
+      "eval_loss": 0.16859714686870575,
+      "eval_mean_token_accuracy": 0.9493516847491265,
+      "eval_num_tokens": 854418.0,
+      "eval_runtime": 193.5414,
+      "eval_samples_per_second": 1.033,
+      "eval_steps_per_second": 1.033,
+      "step": 563
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 563,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.874711111049504e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-563/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc262f4d0e8c93b76e95a23291278880e2bee6bb924cab03eabbb9c30033301b
+size 5713

runs/Mar31_04-20-31_242ca503c882/events.out.tfevents.1774930831.242ca503c882.3260.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd800807729c96289422c048ba7b5103d581de1873ce792424d2d297f50aa3b
+size 10153

runs/Mar31_06-15-17_242ca503c882/events.out.tfevents.1774937717.242ca503c882.32504.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8912bd32da89f1d341f20edb5d40cfe81095db33a607e350bed88e7080fbb2de
+size 29629

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "backend": "tokenizers",
+  "boi_token": "<start_of_image>",
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eoi_token": "<end_of_image>",
+  "eos_token": "<eos>",
+  "image_token": "<image_soft_token>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {
+    "boi_token": "<start_of_image>",
+    "eoi_token": "<end_of_image>",
+    "image_token": "<image_soft_token>"
+  },
+  "pad_token": "<pad>",
+  "processor_class": "Gemma3Processor",
+  "sp_model_kwargs": null,
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ddb27f7a670f95fa93dcb5c98c5d06a951526c8df56cf5945c5a6066148a8206
-size 5649

 version https://git-lfs.github.com/spec/v1
+oid sha256:bc262f4d0e8c93b76e95a23291278880e2bee6bb924cab03eabbb9c30033301b
+size 5713