Convert model to bfloat16 and fix total_parameters metadata
Files changed:
- .gitattributes +1 -0
- README.md +214 -0
- added_tokens.json +24 -0
- chat_template.jinja +54 -0
- config.json +66 -0
- merges.txt +0 -0
- model-00001-of-00006.safetensors +3 -0
- model-00002-of-00006.safetensors +3 -0
- model-00003-of-00006.safetensors +3 -0
- model-00004-of-00006.safetensors +3 -0
- model-00005-of-00006.safetensors +3 -0
- model-00006-of-00006.safetensors +3 -0
- model.safetensors.index.json +348 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +207 -0
- trainer_state.json +403 -0
- training_args.bin +3 -0
- vocab.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,214 @@
---
base_model: Qwen/Qwen2.5-Math-7B-Instruct
library_name: transformers
model_name: Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM
tags:
- generated_from_trainer
- prm
- trl
- math
- process-reward-model
- qwen2.5
- sharp
---

# Model Card for Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM

## Introduction

**Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM** is a Process Reward Model (PRM) fine-tuned from [Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct). It is designed to evaluate the correctness of intermediate reasoning steps in mathematical problem solving, enabling more reliable and interpretable mathematical reasoning.

The model is part of the SHARP-PRM series and was trained on the **SHARP-Math** dataset using the Process Reward Model methodology, which provides step-by-step feedback on mathematical reasoning chains.

## Model Information

### Base Model
- **Base Model**: [Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)
- **Architecture**: Qwen2ForTokenClassification
- **Parameters**: 7B

### Training Details
- **Training Dataset**: SHARP-Math (Process Reward Model dataset)
- **Training Method**: Process Reward Model (PRM) as introduced in [Uesato et al., 2022](https://huggingface.co/papers/2211.14275)
- **Training Framework**: [TRL (Transformer Reinforcement Learning)](https://github.com/huggingface/trl) v0.24.0
- **Task Type**: Token classification (binary: "error" vs. "correct" for each reasoning step)

## PRM Evaluation

This model evaluates mathematical reasoning processes by:
1. **Step-level Evaluation**: Classifying each step in a reasoning chain as either "correct" or "error"
2. **Process Feedback**: Providing feedback on the reasoning process, not just the final answer
3. **Error Detection**: Identifying where mistakes occur in multi-step mathematical solutions

### Evaluation Metrics
The model is evaluated on the [ProcessBench](https://huggingface.co/datasets/Qwen/ProcessBench) benchmark.

Key metrics include:
- **Error Accuracy**: Ability to correctly identify incorrect steps
- **Correct Accuracy**: Ability to correctly identify correct steps
- **F1 Score**: Balanced measure of error and correct step classification
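
A minimal sketch of how these metrics can be computed from step-level results. It follows the ProcessBench convention of taking the harmonic mean of accuracy on erroneous and error-free samples; the `results` data and field names are illustrative, not part of the benchmark API:

```python
# Each record pairs the predicted index of the first erroneous step with
# the labeled one; -1 means "no error in this solution". Data is illustrative.
results = [
    {"predicted_error_step": 2, "label_error_step": 2},    # erroneous, found
    {"predicted_error_step": -1, "label_error_step": -1},  # correct, confirmed
    {"predicted_error_step": 1, "label_error_step": 3},    # erroneous, missed
]

error_samples = [r for r in results if r["label_error_step"] != -1]
correct_samples = [r for r in results if r["label_error_step"] == -1]

# Accuracy on the erroneous subset (error accuracy) and the error-free
# subset (correct accuracy).
error_acc = sum(
    r["predicted_error_step"] == r["label_error_step"] for r in error_samples
) / max(len(error_samples), 1)
correct_acc = sum(
    r["predicted_error_step"] == -1 for r in correct_samples
) / max(len(correct_samples), 1)

# Harmonic mean of the two accuracies.
f1 = (
    2 * error_acc * correct_acc / (error_acc + correct_acc)
    if (error_acc + correct_acc) > 0
    else 0.0
)
print(f"Error acc: {error_acc:.2%}, Correct acc: {correct_acc:.2%}, F1: {f1:.2%}")
```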

## Quick Start

### Installation

```bash
pip install transformers torch
```

### Basic Usage

#### Using the Model for Step Classification

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import torch.nn.functional as F

model_name = "path/to/Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.eval()

# Example: evaluate a mathematical reasoning chain with one correct
# and one incorrect step.
problem = "Solve: 2x + 5 = 13"
steps = [
    "Subtract 5 from both sides: 2x = 8",  # correct step
    "Divide by 2: x = 5",                  # incorrect step (should be x = 4)
]

# Format the input with a step separator; max_length matches the model's
# max_position_embeddings (4096, see config.json).
input_text = problem + "\n\n" + "\n\n".join(steps)
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=4096)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits                     # [batch_size, sequence_length, num_labels]
probabilities = F.softmax(logits, dim=-1)   # per-token class probabilities
predictions = torch.argmax(logits, dim=-1)  # per-token class indices

# Aggregate per-token predictions into one label per step. Here step
# boundaries are approximated by re-tokenizing each segment and advancing
# a running offset; in production you would align on an explicit
# step-separator token instead.
labels = ["error", "correct"]
sep_len = len(tokenizer("\n\n", add_special_tokens=False)["input_ids"])
offset = len(tokenizer(problem, add_special_tokens=False)["input_ids"]) + sep_len
for i, step in enumerate(steps):
    step_len = len(tokenizer(step, add_special_tokens=False)["input_ids"])
    step_preds = predictions[0, offset:offset + step_len]
    step_probs = probabilities[0, offset:offset + step_len, 1]  # P("correct")
    step_label = labels[step_preds.mode().values.item()] if step_len > 0 else "unknown"
    print(f"\nStep {i+1}: {step}")
    print(f"  Prediction: {step_label}")
    print(f"  Mean P(correct): {step_probs.mean().item():.2%}")
    offset += step_len + sep_len  # advance past this step and its separator

# Illustrative output:
# Step 1: Subtract 5 from both sides: 2x = 8
#   Prediction: correct
#   Mean P(correct): 95.00%
#
# Step 2: Divide by 2: x = 5
#   Prediction: error
#   Mean P(correct): 13.00%
```

**Output Interpretation:**

- **Logits**: Raw scores from the model (before softmax); higher values indicate stronger confidence.
- **Probabilities**: Softmax-normalized scores between 0 and 1 that sum to 1 for each token.
- **Predictions**: Class indices (0 = "error", 1 = "correct") for each token.

#### Using with Pipeline

```python
import torch
from transformers import pipeline

classifier = pipeline(
    "token-classification",
    model="path/to/Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM",
    tokenizer="path/to/Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM",
    device=0 if torch.cuda.is_available() else -1,
)

# Classify reasoning steps (reuses `problem` and `steps` from the example above)
result = classifier(problem + "\n\n" + "\n\n".join(steps))
```

### Integration with Mathematical Reasoning

This PRM model can be used to:
1. **Filter incorrect reasoning paths** in tree-of-thought or chain-of-thought generation (see the sketch after this list)
2. **Provide feedback** during step-by-step problem solving
3. **Evaluate solution quality** before final answer generation
4. **Improve training** by identifying problematic reasoning patterns
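
A minimal sketch of use case 1: scoring several candidate solutions with the PRM and keeping the best one. It assumes `model` and `tokenizer` are loaded as in the Quick Start; `score_solution` and the candidate texts are illustrative:

```python
import torch
import torch.nn.functional as F

def score_solution(problem: str, steps: list[str]) -> float:
    """Score a candidate solution as the minimum per-token P("correct").

    Min-aggregation is one common PRM convention: a chain is only as good
    as its weakest step. Mean-aggregation is a reasonable alternative.
    For simplicity this scores every token, including the problem text.
    """
    text = problem + "\n\n" + "\n\n".join(steps)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        logits = model(**inputs).logits
    p_correct = F.softmax(logits, dim=-1)[0, :, 1]  # P("correct") per token
    return p_correct.min().item()

# Best-of-N selection over sampled candidate solutions (illustrative data)
candidates = [
    ["Subtract 5 from both sides: 2x = 8", "Divide by 2: x = 4"],
    ["Subtract 5 from both sides: 2x = 8", "Divide by 2: x = 5"],
]
best = max(candidates, key=lambda s: score_solution("Solve: 2x + 5 = 13", s))
print("Selected solution:", best)
```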

## Training Procedure

### Training Configuration

- **Learning Rate**: 2e-5
- **Batch Size**: Per-device batches with gradient accumulation (exact values are stored in `training_args.bin`)
- **Epochs**: Multiple epochs with early stopping
- **Optimizer**: AdamW with a cosine learning-rate schedule
- **Warmup Ratio**: 3%
- **Gradient Clipping**: 5.0
- **Precision**: bfloat16
- **Gradient Checkpointing**: Enabled for memory efficiency
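
A minimal sketch of how such a configuration maps onto TRL's `PRMTrainer`. The dataset path is a placeholder, and the exact `PRMConfig` fields may vary across TRL versions:

```python
from datasets import load_dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer
from trl import PRMConfig, PRMTrainer

model_id = "Qwen/Qwen2.5-Math-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Two labels: 0 = "error", 1 = "correct" (matches config.json)
model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=2)

# "path/to/sharp-math" is a placeholder for the actual training dataset.
train_dataset = load_dataset("path/to/sharp-math", split="train")

# PRMConfig extends TrainingArguments, so the values from the list above
# map directly onto its fields.
args = PRMConfig(
    output_dir="Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM",
    learning_rate=2e-5,
    warmup_ratio=0.03,
    max_grad_norm=5.0,
    lr_scheduler_type="cosine",
    bf16=True,
    gradient_checkpointing=True,
)

trainer = PRMTrainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```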

### Training Framework Versions

- **TRL**: 0.24.0
- **Transformers**: 4.56.2
- **PyTorch**: 2.9.1
- **Datasets**: 4.4.1
- **Tokenizers**: 0.22.1

### Training Data

The model was trained on the **SHARP-Math** dataset, which contains:
- Mathematical problems with step-by-step solutions
- Labeled reasoning steps (correct/error)
- Diverse mathematical domains and difficulty levels
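
For reference, TRL's PRM trainer consumes stepwise supervision in roughly the shape below; this record is illustrative, not taken from SHARP-Math:

```python
# Illustrative record in TRL's "stepwise supervision" format:
# one boolean label per completion step (True = correct, False = error).
example = {
    "prompt": "Solve: 2x + 5 = 13",
    "completions": [
        "Subtract 5 from both sides: 2x = 8",
        "Divide by 2: x = 5",
    ],
    "labels": [True, False],
}
```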

## Use Cases

### 1. Mathematical Reasoning Evaluation
- Evaluate intermediate steps in mathematical problem-solving
- Identify errors in multi-step calculations
- Provide feedback on reasoning quality

### 2. Educational Applications
- Automated grading of mathematical solutions
- Step-by-step feedback for students
- Identification of common error patterns

### 3. Research Applications
- Training better mathematical reasoning models
- Analyzing reasoning patterns
- Improving chain-of-thought generation

## Limitations and Considerations

1. **Domain Specificity**: This model is trained specifically for mathematical reasoning and may not generalize well to other domains
2. **Step Length**: The model is optimized for step-level evaluation with a 256-token context per step
3. **Language**: The model is primarily trained on English mathematical content
4. **False Positives/Negatives**: Like all classification models, it may misclassify some steps

## Citation

If you use this model in your research, please cite:

```bibtex
@misc{qwen2.5-math-7b-instruct-sharp-math-prm,
  title={Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM: A Process Reward Model for Mathematical Reasoning},
  author={Your Name/Organization},
  year={2025},
  howpublished={\url{https://huggingface.co/path/to/Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM}}
}
```

**Model Card Version**: 1.0
**Last Updated**: 2025-12-30
added_tokens.json
ADDED
@@ -0,0 +1,24 @@
{
  "</tool_call>": 151658,
  "<tool_call>": 151657,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
chat_template.jinja
ADDED
@@ -0,0 +1,54 @@
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\n<tool_call>\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}
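
A minimal sketch of rendering this template via `apply_chat_template` (the local path is a placeholder); the expected text follows from the template's default system message and generation prompt:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM")
messages = [{"role": "user", "content": "Solve: 2x + 5 = 13"}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <|im_start|>system
# Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
# <|im_start|>user
# Solve: 2x + 5 = 13<|im_end|>
# <|im_start|>assistant
```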
config.json
ADDED
@@ -0,0 +1,66 @@
{
  "architectures": [
    "Qwen2ForTokenClassification"
  ],
  "attention_dropout": 0.0,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "id2label": {
    "0": "error",
    "1": "correct"
  },
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "label2id": {
    "correct": 1,
    "error": 0
  },
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 4096,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "pad_token_id": 151643,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 152064
}
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
model-00001-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7a594d0bb52ad438379e44774ac6ef717a66c6c2ea805749e80d9cacb6d0cbb
size 2488345720

model-00002-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:da0e8b8b7a9ccab7ff66746f0f6d930c911fb7a64f040064e67110b679d44801
size 2389314968

model-00003-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1f31f613d4851b61ec56169ba24e1615bb32f0c9e6bebfbb6a4ae1f18abb866d
size 2466375456

model-00004-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fca1299e6719cc3e511fb50626fae14f06fca5fcf3b99cad8ccc83ba09723292
size 2466375480

model-00005-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c1457afae0268ced9c2c942b20355616f457c4bb90333d9cf7fe811ebacbcb2
size 2499430096

model-00006-of-00006.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98b8ec06e0ee8f202e8e4b21b66a91e180d42c3c4db33d53f82bc428ff667e29
size 1831449540
model.safetensors.index.json
ADDED
@@ -0,0 +1,348 @@
{
  "metadata": {
    "total_parameters": 7070645630,
    "total_size": 14141291260
  },
  "weight_map": {
    "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.13.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.18.self_attn.k_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.q_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.v_proj.bias": "model-00004-of-00006.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00006.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.self_attn.k_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.24.self_attn.q_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.24.self_attn.v_proj.bias": "model-00005-of-00006.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.k_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.q_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.v_proj.bias": "model-00006-of-00006.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00006.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00006.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
    "model.norm.weight": "model-00006-of-00006.safetensors",
    "score.bias": "model-00006-of-00006.safetensors",
    "score.weight": "model-00006-of-00006.safetensors"
  }
}
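
The metadata above reflects the commit's fix: in bfloat16 each parameter occupies 2 bytes, so `total_size` should be exactly twice `total_parameters`. A quick illustrative check:

```python
# Consistency check for the bfloat16 metadata fix: 2 bytes per parameter.
total_parameters = 7_070_645_630
total_size = 14_141_291_260
assert total_size == 2 * total_parameters  # holds: 14,141,291,260 bytes
```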
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
size 11421896
tokenizer_config.json
ADDED
@@ -0,0 +1,207 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
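Together with special_tokens_map.json above, this configuration makes `<|im_end|>` the EOS token and `<|endoftext|>` the padding token, with the ChatML delimiters registered as single special tokens. A minimal sanity check, assuming the snapshot has been downloaded to a local directory (the path below is a placeholder):

```python
from transformers import AutoTokenizer

# Placeholder path -- point this at the downloaded model snapshot.
tok = AutoTokenizer.from_pretrained("./Qwen2.5-Math-7B-Instruct-SHARP-Math-PRM")

assert tok.eos_token == "<|im_end|>"       # "eos_token" above
assert tok.pad_token == "<|endoftext|>"    # "pad_token" above
assert tok.model_max_length == 131072      # "model_max_length" above
# The ChatML delimiters resolve to single ids from added_tokens_decoder:
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```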
trainer_state.json
ADDED
@@ -0,0 +1,403 @@
+{
+  "best_global_step": null,
+  "best_metric": 0.9240172831045173,
+  "best_model_checkpoint": null,
+  "epoch": 1.263157894736842,
+  "eval_steps": 16,
+  "global_step": 96,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05263157894736842,
+      "grad_norm": 82.16344451904297,
+      "learning_rate": 5e-06,
+      "loss": 3.8674,
+      "step": 4
+    },
+    {
+      "epoch": 0.10526315789473684,
+      "grad_norm": 242.86280822753906,
+      "learning_rate": 1.1666666666666668e-05,
+      "loss": 3.6908,
+      "step": 8
+    },
+    {
+      "epoch": 0.10526315789473684,
+      "eval_F1_err_corr": 0.5920247495103442,
+      "eval_accuracy": 0.6310377809660449,
+      "eval_correct_accuracy": 0.5115720882394246,
+      "eval_error_accuracy": 0.7025044997996679,
+      "eval_f1": 0.5395404356908385,
+      "eval_loss": 0.8285790681838989,
+      "eval_pr_auc": 0.5723630396883582,
+      "eval_precision": 0.3904967602591793,
+      "eval_recall": 0.8725868725868726,
+      "eval_runtime": 23.3512,
+      "eval_samples_per_second": 49.976,
+      "eval_steps_per_second": 0.428,
+      "step": 8
+    },
+    {
+      "epoch": 0.15789473684210525,
+      "grad_norm": 74.22534942626953,
+      "learning_rate": 1.8333333333333333e-05,
+      "loss": 2.5223,
+      "step": 12
+    },
+    {
+      "epoch": 0.21052631578947367,
+      "grad_norm": 64.61672973632812,
+      "learning_rate": 1.99967206113942e-05,
+      "loss": 1.9821,
+      "step": 16
+    },
+    {
+      "epoch": 0.21052631578947367,
+      "eval_F1_err_corr": 0.8456369286942332,
+      "eval_accuracy": 0.8366810138689622,
+      "eval_correct_accuracy": 0.898511262557064,
+      "eval_error_accuracy": 0.7986396970115458,
+      "eval_f1": 0.6812879141390574,
+      "eval_loss": 0.3859586715698242,
+      "eval_pr_auc": 0.7321071153896515,
+      "eval_precision": 0.6594399277326106,
+      "eval_recall": 0.7046332046332047,
+      "eval_runtime": 23.1639,
+      "eval_samples_per_second": 50.38,
+      "eval_steps_per_second": 0.432,
+      "step": 16
+    },
+    {
+      "epoch": 0.2631578947368421,
+      "grad_norm": 56.03799057006836,
+      "learning_rate": 1.9982149887948264e-05,
+      "loss": 1.787,
+      "step": 20
+    },
+    {
+      "epoch": 0.3157894736842105,
+      "grad_norm": 43.37980651855469,
+      "learning_rate": 1.995594042425798e-05,
+      "loss": 1.4845,
+      "step": 24
+    },
+    {
+      "epoch": 0.3157894736842105,
+      "eval_F1_err_corr": 0.8872989542610511,
+      "eval_accuracy": 0.8730272596843616,
+      "eval_correct_accuracy": 0.9604554888142675,
+      "eval_error_accuracy": 0.8244981013888578,
+      "eval_f1": 0.730593607305936,
+      "eval_loss": 0.30173778533935547,
+      "eval_pr_auc": 0.7963278967873059,
+      "eval_precision": 0.7700534759358288,
+      "eval_recall": 0.694980694980695,
+      "eval_runtime": 23.2199,
+      "eval_samples_per_second": 50.259,
+      "eval_steps_per_second": 0.431,
+      "step": 24
+    },
+    {
+      "epoch": 0.3684210526315789,
+      "grad_norm": 10.989681243896484,
+      "learning_rate": 1.99181227793856e-05,
+      "loss": 1.2391,
+      "step": 28
+    },
+    {
+      "epoch": 0.42105263157894735,
+      "grad_norm": 19.56873893737793,
+      "learning_rate": 1.9868741047013382e-05,
+      "loss": 1.4107,
+      "step": 32
+    },
+    {
+      "epoch": 0.42105263157894735,
+      "eval_F1_err_corr": 0.8722315874998673,
+      "eval_accuracy": 0.8766140602582496,
+      "eval_correct_accuracy": 0.9661095393156461,
+      "eval_error_accuracy": 0.7949822655705009,
+      "eval_f1": 0.7024221453287197,
+      "eval_loss": 0.29268908500671387,
+      "eval_pr_auc": 0.8449069976179278,
+      "eval_precision": 0.8724928366762178,
+      "eval_recall": 0.5878378378378378,
+      "eval_runtime": 23.2018,
+      "eval_samples_per_second": 50.298,
+      "eval_steps_per_second": 0.431,
+      "step": 32
+    },
+    {
+      "epoch": 0.47368421052631576,
+      "grad_norm": 26.890634536743164,
+      "learning_rate": 1.9807852804032306e-05,
+      "loss": 1.4247,
+      "step": 36
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "grad_norm": 22.178682327270508,
+      "learning_rate": 1.9735529043410012e-05,
+      "loss": 1.1194,
+      "step": 40
+    },
+    {
+      "epoch": 0.5263157894736842,
+      "eval_F1_err_corr": 0.9029221553098332,
+      "eval_accuracy": 0.9050693448110951,
+      "eval_correct_accuracy": 0.9454399276155002,
+      "eval_error_accuracy": 0.8640639748307817,
+      "eval_f1": 0.7975522692503825,
+      "eval_loss": 0.239300936460495,
+      "eval_pr_auc": 0.872941982874099,
+      "eval_precision": 0.8454054054054054,
+      "eval_recall": 0.7548262548262549,
+      "eval_runtime": 23.1821,
+      "eval_samples_per_second": 50.341,
+      "eval_steps_per_second": 0.431,
+      "step": 40
+    },
+    {
+      "epoch": 0.5789473684210527,
+      "grad_norm": 8.334005355834961,
+      "learning_rate": 1.9651854091416175e-05,
+      "loss": 1.0329,
+      "step": 44
+    },
+    {
+      "epoch": 0.631578947368421,
+      "grad_norm": 20.910919189453125,
+      "learning_rate": 1.9556925509301844e-05,
+      "loss": 0.9956,
+      "step": 48
+    },
+    {
+      "epoch": 0.631578947368421,
+      "eval_F1_err_corr": 0.8796785116369793,
+      "eval_accuracy": 0.8935915829746532,
+      "eval_correct_accuracy": 0.9475924160656984,
+      "eval_error_accuracy": 0.8208483486319621,
+      "eval_f1": 0.7593293672255274,
+      "eval_loss": 0.25272682309150696,
+      "eval_pr_auc": 0.8738089681286181,
+      "eval_precision": 0.8634686346863468,
+      "eval_recall": 0.6776061776061776,
+      "eval_runtime": 23.1794,
+      "eval_samples_per_second": 50.346,
+      "eval_steps_per_second": 0.431,
+      "step": 48
+    },
+    {
+      "epoch": 0.6842105263157895,
+      "grad_norm": 22.64931869506836,
+      "learning_rate": 1.9450853979547384e-05,
+      "loss": 0.9445,
+      "step": 52
+    },
+    {
+      "epoch": 0.7368421052631579,
+      "grad_norm": 9.232457160949707,
+      "learning_rate": 1.9333763176811663e-05,
+      "loss": 0.9956,
+      "step": 56
+    },
+    {
+      "epoch": 0.7368421052631579,
+      "eval_F1_err_corr": 0.9132652179840324,
+      "eval_accuracy": 0.9122429459588713,
+      "eval_correct_accuracy": 0.930750326857197,
+      "eval_error_accuracy": 0.8964249469238965,
+      "eval_f1": 0.8259838786154575,
+      "eval_loss": 0.21841835975646973,
+      "eval_pr_auc": 0.8865448688581825,
+      "eval_precision": 0.8117427772600186,
+      "eval_recall": 0.8407335907335908,
+      "eval_runtime": 23.1656,
+      "eval_samples_per_second": 50.376,
+      "eval_steps_per_second": 0.432,
+      "step": 56
+    },
+    {
+      "epoch": 0.7894736842105263,
+      "grad_norm": 11.506294250488281,
+      "learning_rate": 1.9205789623732923e-05,
+      "loss": 0.9999,
+      "step": 60
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 14.85898494720459,
+      "learning_rate": 1.9067082531749496e-05,
+      "loss": 0.9873,
+      "step": 64
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "eval_F1_err_corr": 0.9074790316918309,
+      "eval_accuracy": 0.9057867049258728,
+      "eval_correct_accuracy": 0.9265510387266113,
+      "eval_error_accuracy": 0.8891763406206684,
+      "eval_f1": 0.8187672493100276,
+      "eval_loss": 0.23048239946365356,
+      "eval_pr_auc": 0.8888016792056976,
+      "eval_precision": 0.7820738137082601,
+      "eval_recall": 0.859073359073359,
+      "eval_runtime": 23.1547,
+      "eval_samples_per_second": 50.4,
+      "eval_steps_per_second": 0.432,
+      "step": 64
+    },
+    {
+      "epoch": 0.8947368421052632,
+      "grad_norm": 12.407254219055176,
+      "learning_rate": 1.891780362712594e-05,
+      "loss": 0.9402,
+      "step": 68
+    },
+    {
+      "epoch": 0.9473684210526315,
+      "grad_norm": 5.819011211395264,
+      "learning_rate": 1.875812696238745e-05,
+      "loss": 0.9055,
+      "step": 72
+    },
+    {
+      "epoch": 0.9473684210526315,
+      "eval_F1_err_corr": 0.9134620670871267,
+      "eval_accuracy": 0.9120038259206121,
+      "eval_correct_accuracy": 0.9335912631332478,
+      "eval_error_accuracy": 0.894182563484034,
+      "eval_f1": 0.8270676691729323,
+      "eval_loss": 0.2183985859155655,
+      "eval_pr_auc": 0.8924188619420418,
+      "eval_precision": 0.8058608058608059,
+      "eval_recall": 0.8494208494208494,
+      "eval_runtime": 23.3258,
+      "eval_samples_per_second": 50.03,
+      "eval_steps_per_second": 0.429,
+      "step": 72
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 12.771453857421875,
+      "learning_rate": 1.85882387133824e-05,
+      "loss": 0.9761,
+      "step": 76
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "grad_norm": 7.193549156188965,
+      "learning_rate": 1.840833696220963e-05,
+      "loss": 0.499,
+      "step": 80
+    },
+    {
+      "epoch": 1.0526315789473684,
+      "eval_F1_err_corr": 0.9079950528722031,
+      "eval_accuracy": 0.9098517455762793,
+      "eval_correct_accuracy": 0.9236024182207389,
+      "eval_error_accuracy": 0.8929063998654336,
+      "eval_f1": 0.824406148113647,
+      "eval_loss": 0.2152077704668045,
+      "eval_pr_auc": 0.8973152787211761,
+      "eval_precision": 0.7965796579657966,
+      "eval_recall": 0.8542471042471043,
+      "eval_runtime": 23.1673,
+      "eval_samples_per_second": 50.373,
+      "eval_steps_per_second": 0.432,
+      "step": 80
+    },
+    {
+      "epoch": 1.1052631578947367,
+      "grad_norm": 5.080722332000732,
+      "learning_rate": 1.8218631466263584e-05,
+      "loss": 0.4177,
+      "step": 84
+    },
+    {
+      "epoch": 1.1578947368421053,
+      "grad_norm": 5.394437313079834,
+      "learning_rate": 1.801934341366655e-05,
+      "loss": 0.3945,
+      "step": 88
+    },
+    {
+      "epoch": 1.1578947368421053,
+      "eval_F1_err_corr": 0.9097585355242964,
+      "eval_accuracy": 0.9158297465327594,
+      "eval_correct_accuracy": 0.9179299538459843,
+      "eval_error_accuracy": 0.9017313175664017,
+      "eval_f1": 0.8367346938775511,
+      "eval_loss": 0.23682163655757904,
+      "eval_pr_auc": 0.8999781537818953,
+      "eval_precision": 0.8053571428571429,
+      "eval_recall": 0.8706563706563707,
+      "eval_runtime": 23.1454,
+      "eval_samples_per_second": 50.42,
+      "eval_steps_per_second": 0.432,
+      "step": 88
+    },
+    {
+      "epoch": 1.2105263157894737,
+      "grad_norm": 5.82737398147583,
+      "learning_rate": 1.7810705165373245e-05,
+      "loss": 0.3592,
+      "step": 92
+    },
+    {
+      "epoch": 1.263157894736842,
+      "grad_norm": 7.944681167602539,
+      "learning_rate": 1.75929599842483e-05,
+      "loss": 0.395,
+      "step": 96
+    },
+    {
+      "epoch": 1.263157894736842,
+      "eval_F1_err_corr": 0.9240172831045173,
+      "eval_accuracy": 0.9179818268770923,
+      "eval_correct_accuracy": 0.9473624612073788,
+      "eval_error_accuracy": 0.9017949916846976,
+      "eval_f1": 0.8381311939594148,
+      "eval_loss": 0.2285495400428772,
+      "eval_pr_auc": 0.8914048907254241,
+      "eval_precision": 0.8199445983379502,
+      "eval_recall": 0.8571428571428571,
+      "eval_runtime": 23.1568,
+      "eval_samples_per_second": 50.396,
+      "eval_steps_per_second": 0.432,
+      "step": 96
+    }
+  ],
+  "logging_steps": 4,
+  "max_steps": 380,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 16,
+  "stateful_callbacks": {
+    "MinEpochEarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.001
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.380768046815642e+16,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
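The trainer state interleaves train-loss and eval events every 16 steps; the run stops at global step 96 (epoch ~1.26) of a 380-step schedule, and `best_metric` (0.9240) is the `eval_F1_err_corr` logged at step 96. A short sketch to pull the eval curve out of the log, assuming the file sits in the working directory:

```python
import json

# Extract the evaluation curve recorded in trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

evals = [e for e in state["log_history"] if "eval_F1_err_corr" in e]
for e in evals:
    print(f'step {e["step"]:>3}  F1_err_corr={e["eval_F1_err_corr"]:.4f}  '
          f'f1={e["eval_f1"]:.4f}  loss={e["eval_loss"]:.4f}')

# best_metric tracks the best eval_F1_err_corr seen so far (step 96 here).
best = max(evals, key=lambda e: e["eval_F1_err_corr"])
assert best["eval_F1_err_corr"] == state["best_metric"]
```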
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abf8904fc64133fc5bd10f58e42b2b3012e65d67af99094a3a6fbb24562fb6f8
+size 6097
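`training_args.bin` is the `TrainingArguments` object that the transformers `Trainer` serializes with `torch.save`. A minimal sketch for inspecting it; note that it is a pickle, so load it only from a trusted source, and on recent PyTorch versions (>= 2.6) pass `weights_only=False` explicitly:

```python
import torch

# training_args.bin is a pickled TrainingArguments object; unpickling can run
# arbitrary code, so only load it from a source you trust.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```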
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.