init

Browse files

Files changed (13) hide show

.gitattributes +1 -0
README.md +206 -0
added_tokens.json +28 -0
chat_template.jinja +5 -0
config.json +29 -0
generation_config.json +7 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +31 -0
tokenizer.json +3 -0
tokenizer_config.json +239 -0
training_args.bin +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,206 @@

+---
+library_name: transformers
+license: apache-2.0
+base_model: cyberbabooshka/base_noreasoning
+tags:
+- axolotl
+- generated_from_trainer
+datasets:
+- cyberbabooshka/MNLP_M2_mcqa_dataset
+model-index:
+- name: MNLP_M2_mcqa_model
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+
+axolotl version: `0.10.0.dev0`
+
+```yaml
+base_model: cyberbabooshka/base_noreasoning
+hub_model_id: cyberbabooshka/MNLP_M2_mcqa_model
+wandb_name: base
+tokenizer_type: AutoTokenizer
+load_in_8bit: false
+load_in_4bit: false
+num_processes: 64
+dataset_processes: 64
+dataset_prepared_path: last_run_prepared
+chat_template: jinja
+chat_template_jinja: >-
+  {%- for message in messages %}
+    {{- message.content.strip('\n') + '\n' }}
+  {%- endfor %}
+  {%- if not add_generation_prompt %}
+    {{- '<|im_end|>' }}
+  {%- endif %}
+datasets:
+  - path: cyberbabooshka/MNLP_M2_mcqa_dataset
+    name: cooldown
+    split: train
+    type: chat_template
+    chat_template: tokenizer_default
+    field_messages: messages
+    train_on_eos: all
+    train_on_eot: all
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      user:
+        - user
+      assistant:
+        - assistant
+test_datasets:
+  - path: cyberbabooshka/MNLP_M2_mcqa_dataset
+    name: mcqa
+    split: test
+    type: chat_template
+    chat_template: tokenizer_default
+    field_messages: messages
+    train_on_eos: all
+    train_on_eot: all
+    message_property_mappings:
+      role: role
+      content: content
+    roles:
+      user:
+        - user
+      assistant:
+        - assistant
+output_dir: ./outputs_mcqa
+sequence_len: 2048
+batch_flattening: true
+sample_packing: false
+wandb_project: mnlp
+wandb_entity: aleksandr-dremov-epfl
+wandb_watch:
+wandb_log_model:
+gradient_accumulation_steps: 1
+eval_batch_size: 16
+micro_batch_size: 12
+optimizer: ademamix_8bit
+weight_decay: 0.01
+learning_rate: 0.00001
+warmup_steps: 100
+wsd_final_lr_factor: 0.0
+wsd_init_div_factor: 100
+wsd_fract_decay: 0.2
+wsd_decay_type: "sqrt"
+wsd_sqrt_power: 0.5
+wsd_cooldown_start_lr_factor: 1.0
+bf16: auto
+tf32: false
+torch_compile: true
+flash_attention: true
+gradient_checkpointing: false
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+logging_steps: 16
+eval_steps: 500
+save_steps: 500
+max_steps: 1000000
+num_epochs: 1
+save_total_limit: 2
+special_tokens:
+  eos_token: "<|im_end|>"
+  pad_token: "<|endoftext|>"
+eot_tokens:
+  - <|im_end|>
+plugins:
+  - axolotl_wsd.WSDSchedulerPlugin
+```
+</details><br>
+
+# MNLP_M2_mcqa_model
+This model is a fine-tuned version of [cyberbabooshka/base_noreasoning](https://huggingface.co/cyberbabooshka/base_noreasoning) on the cyberbabooshka/MNLP_M2_mcqa_dataset dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.6772
+## Model description
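+A Qwen3 causal language model (`Qwen3ForCausalLM`: 28 hidden layers, hidden size 1024, 16 attention heads with 8 key/value heads, tied word embeddings, `bfloat16` weights) fine-tuned from [cyberbabooshka/base_noreasoning](https://huggingface.co/cyberbabooshka/base_noreasoning). It uses a plain-text chat template: each message's content is stripped of surrounding newlines and followed by a single `\n`, and the sequence is terminated with `<|im_end|>` when no generation prompt is requested.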
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
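+Training used the `train` split of the `cooldown` configuration of cyberbabooshka/MNLP_M2_mcqa_dataset; evaluation used the `test` split of the `mcqa` configuration of the same dataset. Both were run with `sequence_len: 2048` (see the axolotl config above).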
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 1e-05
+- train_batch_size: 12
+- eval_batch_size: 16
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- total_train_batch_size: 24
+- total_eval_batch_size: 32
+- optimizer: `OptimizerNames.ADEMAMIX_8BIT` (no additional optimizer arguments)
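+- lr_scheduler_type: cosine (value recorded by the Trainer; the axolotl config above loads `axolotl_wsd.WSDSchedulerPlugin` with `wsd_*` settings, so the effective schedule was presumably warmup-stable-decay rather than cosine; confirm against the training logs)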
+- lr_scheduler_warmup_steps: 100
+- training_steps: 8438
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| No log        | 0.0001 | 1    | 2.2371          |
+| 0.8956        | 0.0593 | 500  | 0.7674          |
+| 0.9093        | 0.1185 | 1000 | 0.7335          |
+| 0.8544        | 0.1778 | 1500 | 0.7159          |
+| 0.8503        | 0.2370 | 2000 | 0.7074          |
+| 0.8781        | 0.2963 | 2500 | 0.7016          |
+| 0.8171        | 0.3555 | 3000 | 0.6968          |
+| 0.9179        | 0.4148 | 3500 | 0.6930          |
+| 0.8450        | 0.4740 | 4000 | 0.6895          |
+| 0.8885        | 0.5333 | 4500 | 0.6865          |
+| 0.9432        | 0.5926 | 5000 | 0.6844          |
+| 0.7451        | 0.6518 | 5500 | 0.6825          |
+| 0.8675        | 0.7111 | 6000 | 0.6811          |
+| 0.8606        | 0.7703 | 6500 | 0.6793          |
+| 0.8602        | 0.8000 | 6750 | 0.6793          |
+| 0.8458        | 0.8296 | 7000 | 0.6778          |
+| 0.9051        | 0.8888 | 7500 | 0.6772          |
+| 0.8589        | 0.9481 | 8000 | 0.6772          |
+### Framework versions
+- Transformers 4.52.1
+- Pytorch 2.7.0+cu126
+- Datasets 3.5.0
+- Tokenizers 0.21.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,5 @@

+{%- for message in messages %}
+  {{- message.content.strip('\n') + '\n' }}
+{%- endfor %} {%- if not add_generation_prompt %}
+  {{- '<|im_end|>' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.1",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [151645, 151643],
+  "max_new_tokens": 2048,
+  "transformers_version": "4.52.1"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2523da081dfc0472229adc37faf3dc17332dee1af725c955bc2d7eee869f764
+size 1192135096

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+size 11422654

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4ca5e476a8b74b7c8d92f81e7730dc002dff80064f8c3f75a9c838671deb775
+size 7185

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff