Dat1710 committed on Jan 30

Commit

00db46c

verified ·

1 Parent(s): a5b1ef7

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
.cursorignore +16 -0
.gitattributes +16 -0
.gitignore +15 -0
.python-version +1 -0
README.md +273 -0
data/grpo/test.csv +0 -0
data/grpo/train.csv +0 -0
data/sft/train.csv +0 -0
main.py +16 -0
models/grpo/README.md +72 -0
models/grpo/adapter_config.json +46 -0
models/grpo/adapter_model.safetensors +3 -0
models/grpo/added_tokens.json +24 -0
models/grpo/chat_template.jinja +54 -0
models/grpo/checkpoint-10/README.md +209 -0
models/grpo/checkpoint-10/adapter_config.json +46 -0
models/grpo/checkpoint-10/adapter_model.safetensors +3 -0
models/grpo/checkpoint-10/added_tokens.json +24 -0
models/grpo/checkpoint-10/chat_template.jinja +54 -0
models/grpo/checkpoint-10/merges.txt +0 -0
models/grpo/checkpoint-10/optimizer.pt +3 -0
models/grpo/checkpoint-10/rng_state.pth +3 -0
models/grpo/checkpoint-10/scheduler.pt +3 -0
models/grpo/checkpoint-10/special_tokens_map.json +31 -0
models/grpo/checkpoint-10/tokenizer.json +3 -0
models/grpo/checkpoint-10/tokenizer_config.json +209 -0
models/grpo/checkpoint-10/trainer_state.json +304 -0
models/grpo/checkpoint-10/training_args.bin +3 -0
models/grpo/checkpoint-10/vocab.json +0 -0
models/grpo/checkpoint-20/README.md +209 -0
models/grpo/checkpoint-20/adapter_config.json +46 -0
models/grpo/checkpoint-20/adapter_model.safetensors +3 -0
models/grpo/checkpoint-20/added_tokens.json +24 -0
models/grpo/checkpoint-20/chat_template.jinja +54 -0
models/grpo/checkpoint-20/merges.txt +0 -0
models/grpo/checkpoint-20/optimizer.pt +3 -0
models/grpo/checkpoint-20/rng_state.pth +3 -0
models/grpo/checkpoint-20/scheduler.pt +3 -0
models/grpo/checkpoint-20/special_tokens_map.json +31 -0
models/grpo/checkpoint-20/tokenizer.json +3 -0
models/grpo/checkpoint-20/tokenizer_config.json +209 -0
models/grpo/checkpoint-20/trainer_state.json +574 -0
models/grpo/checkpoint-20/training_args.bin +3 -0
models/grpo/checkpoint-20/vocab.json +0 -0
models/grpo/checkpoint-30/README.md +209 -0
models/grpo/checkpoint-30/adapter_config.json +46 -0
models/grpo/checkpoint-30/adapter_model.safetensors +3 -0
models/grpo/checkpoint-30/added_tokens.json +24 -0
models/grpo/checkpoint-30/chat_template.jinja +54 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.cursorignore ADDED Viewed

	@@ -0,0 +1,16 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# Dataset
+data/
+# Environment variables
+.env

.gitattributes CHANGED Viewed

@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/grpo/checkpoint-10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/grpo/checkpoint-20/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/grpo/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/grpo/checkpoint-39/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/grpo/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-20/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-40/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-70/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-80/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/checkpoint-90/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+models/sft/tokenizer.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# Dataset
+# Environment variables
+.env

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

README.md ADDED Viewed

	@@ -0,0 +1,273 @@

+# GRPO Countdown Problem
+A project for training language models to solve arithmetic countdown problems using Supervised Fine-Tuning (SFT) followed by Group Relative Policy Optimization (GRPO).
+## Overview
+This project implements a two-stage training pipeline:
+1. **SFT (Supervised Fine-Tuning)**: Train the model on arithmetic problems with correct solutions
+2. **GRPO (Group Relative Policy Optimization)**: Further optimize the model using reward-based learning
+The goal is to train a language model to solve arithmetic countdown problems where you must use exactly four given numbers with basic arithmetic operations (+, -, *, /) to reach a target value.
+## Project Structure
+```
+grpo-countdown-problem/
+├── data/                           # Training and test datasets
+├── models/                         # Saved model checkpoints
+│   ├── sft/                       # SFT model outputs
+│   └── grpo/                      # GRPO model outputs
+├── src/
+│   ├── config/                    # Configuration files
+│   │   ├── grpo/                  # GRPO training configs
+│   │   └── sft/                   # SFT training configs
+│   ├── dataset/                   # Dataset loading and processing
+│   ├── examples/                  # Example scripts for inference
+│   ├── scripts/                   # Data generation and processing
+│   ├── training/                  # Training scripts
+│   │   ├── grpo/                  # GRPO training
+│   │   └── sft/                   # SFT training
+│   └── utils/                     # Utility functions
+├── main.py                        # Main entry point
+├── pyproject.toml                 # Project dependencies
+└── README.md                      # This file
+```
+## Requirements
+- Python 3.12+
+- CUDA-capable GPU (recommended)
+- At least 8GB GPU memory for the Qwen2.5-Math-1.5B model
+## Installation
+1. **Clone the repository:**
+   ```bash
+   git clone <repository-url>
+   cd grpo-countdown-problem
+   ```
+2. **Install dependencies using uv (recommended):**
+   ```bash
+   # Install uv if you haven't already
+   curl -LsSf https://astral.sh/uv/install.sh | sh
+   # Install project dependencies
+   uv sync
+   ```
+   **Or using pip:**
+   ```bash
+   pip install -e .
+   ```
+3. **Set up environment variables (if using OpenAI for data generation):**
+   ```bash
+   cp .env.example .env
+   # Edit .env and add your OpenAI API key
+   ```
+## Data Preparation
+### Generate Training Data
+1. **Generate SFT training data:**
+   ```bash
+   python src/scripts/generate_training_dataset_sft.py \
+     --output_path data/sft/train.csv \
+     --num_problems 10000 \
+     --num_workers 4
+   ```
+2. **Generate GRPO training data:**
+   ```bash
+   python src/scripts/generate_training_dataset_grpo.py \
+     --output_path data/grpo/train.csv \
+     --num_problems 10000 \
+     --num_workers 4
+   ```
+3. **Generate test data:**
+   ```bash
+   python src/scripts/generate_training_dataset_grpo.py \
+     --output_path data/grpo/test.csv \
+     --num_problems 1000 \
+     --num_workers 4
+   ```
+### Data Format
+The CSV files contain the following columns:
+- `id`: Unique problem identifier
+- `problem_description`: Natural language description of the problem
+- `correct_answer`: The target arithmetic expression
+- `num1`, `num2`, `num3`, `num4`: The four numbers to use
+- `reasoning` (SFT only): Step-by-step solution explanation
+## Training
+### Stage 1: Supervised Fine-Tuning (SFT)
+Train the base model on arithmetic problems with supervised learning:
+```bash
+python src/training/sft/train_sft_hydra.py
+```
+**Configuration:** The training uses Hydra configuration files in `src/config/sft/`:
+- `config.yaml`: Main configuration
+- `dataset/default.yaml`: Dataset settings
+- `model/qwen2.5-3b.yaml`: Model and LoRA settings
+- `training/default.yaml`: Training hyperparameters
+**Key parameters:**
+- Base model: `Qwen/Qwen2.5-Math-1.5B`
+- LoRA rank: 64
+- Learning rate: 2e-5
+- Batch size: 4 (per device)
+- Epochs: 2
+**Output:** Trained SFT model saved to `models/sft/`
+### Stage 2: Group Relative Policy Optimization (GRPO)
+Further optimize the SFT model using reward-based learning:
+```bash
+python src/training/grpo/train_grpo_hydra.py
+```
+**Configuration:** Uses Hydra configuration files in `src/config/grpo/`:
+- `config.yaml`: Main configuration (includes SFT model path)
+- `dataset/default.yaml`: Dataset settings
+- `model/qwen2.5-3b.yaml`: Model and LoRA settings
+- `training/default.yaml`: Training hyperparameters
+**Key parameters:**
+- Builds on SFT model from `models/sft/`
+- Learning rate: 1e-5
+- Batch size: 2 (per device)
+- Epochs: 1
+- Generations per prompt: 8
+- Reward function: Mathematical correctness
+**Output:** Trained GRPO model saved to `models/grpo/`
+### Custom Configuration
+You can override configuration parameters:
+```bash
+# Override dataset size
+python src/training/sft/train_sft_hydra.py dataset.max_rows=5000
+# Override learning rate and batch size
+python src/training/grpo/train_grpo_hydra.py \
+  training.learning_rate=5e-6 \
+  training.per_device_train_batch_size=1
+# Use different output directory
+python src/training/sft/train_sft_hydra.py output_dir=models/sft_experiment
+```
+## Inference
+### Interactive Problem Solving
+Use the trained model to solve individual problems:
+```bash
+python src/examples/run_model.py
+```
+This will load both the SFT and GRPO models and solve a sample problem.
+### Batch Evaluation
+Evaluate model accuracy on a test dataset:
+```bash
+python src/examples/calculate_accuracy.py \
+  --csv_path data/grpo/test.csv \
+  --sft_model_path models/sft/ \
+  --grpo_model_path models/grpo/ \
+  --max_samples 100 \
+  --output_path results/evaluation_results.csv
+```
+**Parameters:**
+- `--csv_path`: Path to test CSV file
+- `--sft_model_path`: Path to SFT model directory
+- `--grpo_model_path`: Path to GRPO model directory
+- `--max_samples`: Limit number of test samples (optional)
+- `--output_path`: Save detailed results to CSV (optional)
+- `--temperature`: Sampling temperature (default: 1.0)
+- `--max_new_tokens`: Maximum tokens to generate (default: 4096)
+**Evaluation Metrics:**
+- **Accuracy**: Percentage of problems solved correctly
+- **Valid Format Rate**: Percentage of responses in valid arithmetic format
+- **Uses All Numbers Rate**: Percentage of responses using all four numbers
+### Model-only Evaluation
+Evaluate specific model stages:
+```bash
+# Evaluate only SFT model (no GRPO)
+python src/examples/calculate_accuracy.py \
+  --csv_path data/grpo/test.csv \
+  --sft_model_path models/sft/ \
+  --no_grpo
+# Evaluate only base model (no SFT or GRPO)
+python src/examples/calculate_accuracy.py \
+  --csv_path data/grpo/test.csv \
+  --no_sft --no_grpo
+```
+## Configuration Details
+### Model Configuration
+The project uses **Qwen2.5-Math-1.5B** as the base model with LoRA (Low-Rank Adaptation) for efficient fine-tuning:
+- **LoRA rank**: 64
+- **LoRA alpha**: 128
+- **Target modules**: All attention and MLP layers
+- **LoRA dropout**: 0.1
+### Training Configuration
+**SFT Training:**
+- **Optimizer**: AdamW 8-bit
+- **Learning rate**: 2e-5 with linear scheduler
+- **Warmup ratio**: 0.1
+- **Weight decay**: 0.01
+- **Max sequence length**: 4096
+**GRPO Training:**
+- **Optimizer**: AdamW 8-bit
+- **Learning rate**: 1e-5 with cosine scheduler
+- **Warmup ratio**: 0.1
+- **Weight decay**: 0.0
+- **Temperature**: 1.0
+- **Generations per prompt**: 8
+## Monitoring Training
+Both training scripts log to TensorBoard:
+```bash
+# View training logs
+tensorboard --logdir models/sft/runs    # For SFT training
+tensorboard --logdir models/grpo/runs   # For GRPO training
+```
+## Example Problem
+**Input:** "Use 53, 3, 47, and 36 exactly once each with only +, -, *, and / operators to create an expression equal to 133."
+**Expected Output:** A valid arithmetic expression like `53 + 47 + 36 - 3`

data/grpo/test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/grpo/train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/sft/train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

main.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from src.dataset import load_csv_dataset_sft
+from src.dataset.sft import map_problem_description_to_conversation_sft
+def main() -> None:
+    """
+    Main function
+    """
+    dataset = load_csv_dataset_sft(
+        "data/sft/train.csv", "train", map_problem_description_to_conversation_sft
+    )
+    print(dataset)
+if __name__ == "__main__":
+    main()

models/grpo/README.md ADDED Viewed

	@@ -0,0 +1,72 @@

+---
+base_model: Qwen/Qwen2.5-Math-1.5B
+library_name: peft
+model_name: grpo
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Math-1.5B
+- grpo
+- lora
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for grpo
+This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="models/grpo", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- PEFT 0.18.1
+- TRL: 0.27.1
+- Transformers: 4.57.6
+- Pytorch: 2.9.0+cu126
+- Datasets: 4.0.0
+- Tokenizers: 0.22.2
+## Citations
+Cite GRPO as:
+```bibtex
+@article{shao2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

models/grpo/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

models/grpo/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc40358071350f519ded257c36279a6fa9853841e6c46d6195183368d251a3b8
+size 295488936

models/grpo/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

models/grpo/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

models/grpo/checkpoint-10/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Math-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Math-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

models/grpo/checkpoint-10/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

models/grpo/checkpoint-10/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8805ac69de429cf7ae58ea71129a15643338f1c2bcd6d7b082681be8c2ad8ab
+size 295488936

models/grpo/checkpoint-10/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

models/grpo/checkpoint-10/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

models/grpo/checkpoint-10/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models/grpo/checkpoint-10/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec347c3c649c09adad74a6491db474c4ecaf79f261b61274de3beec0ecb71324
+size 591203579

models/grpo/checkpoint-10/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6fee731f291b6810a59e3fcab5611a95ab2d788056c44e38f49e2ee1c723c47
+size 14645

models/grpo/checkpoint-10/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0f0dea0e393dc4c725bd0fc8c33f5112af6fe030518d1b34bdafde69682b890
+size 1465

models/grpo/checkpoint-10/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

models/grpo/checkpoint-10/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
+size 11422063

models/grpo/checkpoint-10/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

models/grpo/checkpoint-10/trainer_state.json ADDED Viewed

	@@ -0,0 +1,304 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2564102564102564,
+  "eval_steps": 500,
+  "global_step": 10,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.80078125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 237.14453125,
+      "completions/mean_terminated_length": 161.3529510498047,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 1.3392305374145508,
+      "epoch": 0.02564102564102564,
+      "frac_reward_zero_std": 0.2890625,
+      "grad_norm": 0.07342787832021713,
+      "learning_rate": 0.0,
+      "loss": 0.0068,
+      "num_tokens": 153643.0,
+      "reward": 0.859375,
+      "reward_std": 1.100690484046936,
+      "rewards/mathematical_correctness_reward_function/mean": 0.859375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1006906032562256,
+      "step": 1,
+      "step_time": 86.24474567199923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.83984375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 239.8671875,
+      "completions/mean_terminated_length": 155.26828002929688,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "entropy": 1.1771463677287102,
+      "epoch": 0.05128205128205128,
+      "frac_reward_zero_std": 0.3515625,
+      "grad_norm": 0.06119547039270401,
+      "learning_rate": 2.5e-06,
+      "loss": 0.0204,
+      "num_tokens": 308163.0,
+      "reward": 0.88671875,
+      "reward_std": 1.3041635751724243,
+      "rewards/mathematical_correctness_reward_function/mean": 0.88671875,
+      "rewards/mathematical_correctness_reward_function/std": 1.3041635751724243,
+      "step": 2,
+      "step_time": 86.508452252001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.85546875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 244.81640625,
+      "completions/mean_terminated_length": 178.6216278076172,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.1922413893043995,
+      "epoch": 0.07692307692307693,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.056310124695301056,
+      "learning_rate": 5e-06,
+      "loss": 0.0119,
+      "num_tokens": 463868.0,
+      "reward": 0.90234375,
+      "reward_std": 1.124693512916565,
+      "rewards/mathematical_correctness_reward_function/mean": 0.90234375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1246936321258545,
+      "step": 3,
+      "step_time": 85.8330284170006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84765625,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 255.0,
+      "completions/mean_length": 243.02734375,
+      "completions/mean_terminated_length": 170.84616088867188,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "entropy": 1.2269665226340294,
+      "epoch": 0.10256410256410256,
+      "frac_reward_zero_std": 0.3984375,
+      "grad_norm": 0.05933857336640358,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.0091,
+      "num_tokens": 619011.0,
+      "reward": 0.91796875,
+      "reward_std": 1.1119270324707031,
+      "rewards/mathematical_correctness_reward_function/mean": 0.91796875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1119270324707031,
+      "step": 4,
+      "step_time": 85.85383133499909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.80859375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 235.5390625,
+      "completions/mean_terminated_length": 149.10203552246094,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.1289030984044075,
+      "epoch": 0.1282051282051282,
+      "frac_reward_zero_std": 0.3203125,
+      "grad_norm": 0.06574889272451401,
+      "learning_rate": 1e-05,
+      "loss": 0.0168,
+      "num_tokens": 772231.0,
+      "reward": 1.10546875,
+      "reward_std": 1.3549507856369019,
+      "rewards/mathematical_correctness_reward_function/mean": 1.10546875,
+      "rewards/mathematical_correctness_reward_function/std": 1.3549507856369019,
+      "step": 5,
+      "step_time": 85.96325700299985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.73828125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 233.44140625,
+      "completions/mean_terminated_length": 169.80596923828125,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "entropy": 1.2140139639377594,
+      "epoch": 0.15384615384615385,
+      "frac_reward_zero_std": 0.2734375,
+      "grad_norm": 0.06448275595903397,
+      "learning_rate": 9.979871469976197e-06,
+      "loss": 0.0187,
+      "num_tokens": 924882.0,
+      "reward": 1.21875,
+      "reward_std": 1.1127601861953735,
+      "rewards/mathematical_correctness_reward_function/mean": 1.21875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1127601861953735,
+      "step": 6,
+      "step_time": 86.28028441999868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 254.0,
+      "completions/mean_length": 241.78515625,
+      "completions/mean_terminated_length": 173.2954559326172,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "entropy": 1.241826605051756,
+      "epoch": 0.1794871794871795,
+      "frac_reward_zero_std": 0.2421875,
+      "grad_norm": 0.06542028486728668,
+      "learning_rate": 9.91964794299315e-06,
+      "loss": 0.0025,
+      "num_tokens": 1079615.0,
+      "reward": 1.29296875,
+      "reward_std": 1.1359151601791382,
+      "rewards/mathematical_correctness_reward_function/mean": 1.29296875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1359151601791382,
+      "step": 7,
+      "step_time": 85.96726177399978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7578125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 231.3046875,
+      "completions/mean_terminated_length": 154.03225708007812,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "entropy": 1.2537630051374435,
+      "epoch": 0.20512820512820512,
+      "frac_reward_zero_std": 0.3046875,
+      "grad_norm": 0.06004541367292404,
+      "learning_rate": 9.819814303479268e-06,
+      "loss": 0.0227,
+      "num_tokens": 1231799.0,
+      "reward": 1.33984375,
+      "reward_std": 1.1537383794784546,
+      "rewards/mathematical_correctness_reward_function/mean": 1.33984375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1537383794784546,
+      "step": 8,
+      "step_time": 85.93508993700198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7265625,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 250.0,
+      "completions/mean_length": 229.26953125,
+      "completions/mean_terminated_length": 158.24285888671875,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.1602798961102962,
+      "epoch": 0.23076923076923078,
+      "frac_reward_zero_std": 0.2421875,
+      "grad_norm": 0.07147325575351715,
+      "learning_rate": 9.681174353198687e-06,
+      "loss": 0.0138,
+      "num_tokens": 1383510.0,
+      "reward": 1.62890625,
+      "reward_std": 1.3191591501235962,
+      "rewards/mathematical_correctness_reward_function/mean": 1.62890625,
+      "rewards/mathematical_correctness_reward_function/std": 1.3191591501235962,
+      "step": 9,
+      "step_time": 86.19385715199951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 255.0,
+      "completions/mean_length": 222.59375,
+      "completions/mean_terminated_length": 154.1904754638672,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 1.2778590954840183,
+      "epoch": 0.2564102564102564,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.06555230915546417,
+      "learning_rate": 9.504844339512096e-06,
+      "loss": -0.0007,
+      "num_tokens": 1533384.0,
+      "reward": 1.66015625,
+      "reward_std": 1.3186944723129272,
+      "rewards/mathematical_correctness_reward_function/mean": 1.66015625,
+      "rewards/mathematical_correctness_reward_function/std": 1.3186945915222168,
+      "step": 10,
+      "step_time": 85.93042380499901
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 39,
+  "num_input_tokens_seen": 1533384,
+  "num_train_epochs": 1,
+  "save_steps": 10,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

models/grpo/checkpoint-10/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94f496ed302bcefe707e417d1679ee1a4274555a4b156df7e28e043c3e0d2bc9
+size 7697

models/grpo/checkpoint-10/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/grpo/checkpoint-20/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Math-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Math-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

models/grpo/checkpoint-20/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

models/grpo/checkpoint-20/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0263d2404cfa7d65bfc653ed760fec21598c9b2a8d457d505f9ede4d96afc60b
+size 295488936

models/grpo/checkpoint-20/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

models/grpo/checkpoint-20/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

models/grpo/checkpoint-20/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models/grpo/checkpoint-20/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d1b405bf22e8d35e00bdea8755f8b483ed9461986e0da1f75439b4d8c3c000da
+size 591203579

models/grpo/checkpoint-20/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be3e0004889015c9c538d90e1b8ae6db4b3654cd8388cf501ef9ceb46cc5217f
+size 14645

models/grpo/checkpoint-20/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a65770a20cd6925dd627ba1022bf44d16624250db38e4590c6dc7768c69e2cb0
+size 1465

models/grpo/checkpoint-20/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

models/grpo/checkpoint-20/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
+size 11422063

models/grpo/checkpoint-20/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "left",
+  "unk_token": null
+}

models/grpo/checkpoint-20/trainer_state.json ADDED Viewed

	@@ -0,0 +1,574 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5128205128205128,
+  "eval_steps": 500,
+  "global_step": 20,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.80078125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 237.14453125,
+      "completions/mean_terminated_length": 161.3529510498047,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 1.3392305374145508,
+      "epoch": 0.02564102564102564,
+      "frac_reward_zero_std": 0.2890625,
+      "grad_norm": 0.07342787832021713,
+      "learning_rate": 0.0,
+      "loss": 0.0068,
+      "num_tokens": 153643.0,
+      "reward": 0.859375,
+      "reward_std": 1.100690484046936,
+      "rewards/mathematical_correctness_reward_function/mean": 0.859375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1006906032562256,
+      "step": 1,
+      "step_time": 86.24474567199923
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.83984375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 239.8671875,
+      "completions/mean_terminated_length": 155.26828002929688,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "entropy": 1.1771463677287102,
+      "epoch": 0.05128205128205128,
+      "frac_reward_zero_std": 0.3515625,
+      "grad_norm": 0.06119547039270401,
+      "learning_rate": 2.5e-06,
+      "loss": 0.0204,
+      "num_tokens": 308163.0,
+      "reward": 0.88671875,
+      "reward_std": 1.3041635751724243,
+      "rewards/mathematical_correctness_reward_function/mean": 0.88671875,
+      "rewards/mathematical_correctness_reward_function/std": 1.3041635751724243,
+      "step": 2,
+      "step_time": 86.508452252001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.85546875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 253.0,
+      "completions/mean_length": 244.81640625,
+      "completions/mean_terminated_length": 178.6216278076172,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "entropy": 1.1922413893043995,
+      "epoch": 0.07692307692307693,
+      "frac_reward_zero_std": 0.375,
+      "grad_norm": 0.056310124695301056,
+      "learning_rate": 5e-06,
+      "loss": 0.0119,
+      "num_tokens": 463868.0,
+      "reward": 0.90234375,
+      "reward_std": 1.124693512916565,
+      "rewards/mathematical_correctness_reward_function/mean": 0.90234375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1246936321258545,
+      "step": 3,
+      "step_time": 85.8330284170006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84765625,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 255.0,
+      "completions/mean_length": 243.02734375,
+      "completions/mean_terminated_length": 170.84616088867188,
+      "completions/min_length": 82.0,
+      "completions/min_terminated_length": 82.0,
+      "entropy": 1.2269665226340294,
+      "epoch": 0.10256410256410256,
+      "frac_reward_zero_std": 0.3984375,
+      "grad_norm": 0.05933857336640358,
+      "learning_rate": 7.500000000000001e-06,
+      "loss": 0.0091,
+      "num_tokens": 619011.0,
+      "reward": 0.91796875,
+      "reward_std": 1.1119270324707031,
+      "rewards/mathematical_correctness_reward_function/mean": 0.91796875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1119270324707031,
+      "step": 4,
+      "step_time": 85.85383133499909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.80859375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 235.5390625,
+      "completions/mean_terminated_length": 149.10203552246094,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 1.1289030984044075,
+      "epoch": 0.1282051282051282,
+      "frac_reward_zero_std": 0.3203125,
+      "grad_norm": 0.06574889272451401,
+      "learning_rate": 1e-05,
+      "loss": 0.0168,
+      "num_tokens": 772231.0,
+      "reward": 1.10546875,
+      "reward_std": 1.3549507856369019,
+      "rewards/mathematical_correctness_reward_function/mean": 1.10546875,
+      "rewards/mathematical_correctness_reward_function/std": 1.3549507856369019,
+      "step": 5,
+      "step_time": 85.96325700299985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.73828125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 233.44140625,
+      "completions/mean_terminated_length": 169.80596923828125,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "entropy": 1.2140139639377594,
+      "epoch": 0.15384615384615385,
+      "frac_reward_zero_std": 0.2734375,
+      "grad_norm": 0.06448275595903397,
+      "learning_rate": 9.979871469976197e-06,
+      "loss": 0.0187,
+      "num_tokens": 924882.0,
+      "reward": 1.21875,
+      "reward_std": 1.1127601861953735,
+      "rewards/mathematical_correctness_reward_function/mean": 1.21875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1127601861953735,
+      "step": 6,
+      "step_time": 86.28028441999868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 254.0,
+      "completions/mean_length": 241.78515625,
+      "completions/mean_terminated_length": 173.2954559326172,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "entropy": 1.241826605051756,
+      "epoch": 0.1794871794871795,
+      "frac_reward_zero_std": 0.2421875,
+      "grad_norm": 0.06542028486728668,
+      "learning_rate": 9.91964794299315e-06,
+      "loss": 0.0025,
+      "num_tokens": 1079615.0,
+      "reward": 1.29296875,
+      "reward_std": 1.1359151601791382,
+      "rewards/mathematical_correctness_reward_function/mean": 1.29296875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1359151601791382,
+      "step": 7,
+      "step_time": 85.96726177399978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7578125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 231.3046875,
+      "completions/mean_terminated_length": 154.03225708007812,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "entropy": 1.2537630051374435,
+      "epoch": 0.20512820512820512,
+      "frac_reward_zero_std": 0.3046875,
+      "grad_norm": 0.06004541367292404,
+      "learning_rate": 9.819814303479268e-06,
+      "loss": 0.0227,
+      "num_tokens": 1231799.0,
+      "reward": 1.33984375,
+      "reward_std": 1.1537383794784546,
+      "rewards/mathematical_correctness_reward_function/mean": 1.33984375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1537383794784546,
+      "step": 8,
+      "step_time": 85.93508993700198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.7265625,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 250.0,
+      "completions/mean_length": 229.26953125,
+      "completions/mean_terminated_length": 158.24285888671875,
+      "completions/min_length": 51.0,
+      "completions/min_terminated_length": 51.0,
+      "entropy": 1.1602798961102962,
+      "epoch": 0.23076923076923078,
+      "frac_reward_zero_std": 0.2421875,
+      "grad_norm": 0.07147325575351715,
+      "learning_rate": 9.681174353198687e-06,
+      "loss": 0.0138,
+      "num_tokens": 1383510.0,
+      "reward": 1.62890625,
+      "reward_std": 1.3191591501235962,
+      "rewards/mathematical_correctness_reward_function/mean": 1.62890625,
+      "rewards/mathematical_correctness_reward_function/std": 1.3191591501235962,
+      "step": 9,
+      "step_time": 86.19385715199951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 255.0,
+      "completions/mean_length": 222.59375,
+      "completions/mean_terminated_length": 154.1904754638672,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 1.2778590954840183,
+      "epoch": 0.2564102564102564,
+      "frac_reward_zero_std": 0.25,
+      "grad_norm": 0.06555230915546417,
+      "learning_rate": 9.504844339512096e-06,
+      "loss": -0.0007,
+      "num_tokens": 1533384.0,
+      "reward": 1.66015625,
+      "reward_std": 1.3186944723129272,
+      "rewards/mathematical_correctness_reward_function/mean": 1.66015625,
+      "rewards/mathematical_correctness_reward_function/std": 1.3186945915222168,
+      "step": 10,
+      "step_time": 85.93042380499901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.66796875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 254.0,
+      "completions/mean_length": 219.453125,
+      "completions/mean_terminated_length": 145.92941284179688,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "entropy": 1.2968238294124603,
+      "epoch": 0.28205128205128205,
+      "frac_reward_zero_std": 0.3046875,
+      "grad_norm": 0.0640135407447815,
+      "learning_rate": 9.292243968009332e-06,
+      "loss": 0.0186,
+      "num_tokens": 1682622.0,
+      "reward": 1.64453125,
+      "reward_std": 1.355674147605896,
+      "rewards/mathematical_correctness_reward_function/mean": 1.64453125,
+      "rewards/mathematical_correctness_reward_function/std": 1.355674147605896,
+      "step": 11,
+      "step_time": 86.73103137099861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.70703125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 227.4921875,
+      "completions/mean_terminated_length": 158.69334411621094,
+      "completions/min_length": 53.0,
+      "completions/min_terminated_length": 53.0,
+      "entropy": 1.1798847801983356,
+      "epoch": 0.3076923076923077,
+      "frac_reward_zero_std": 0.234375,
+      "grad_norm": 0.06765235960483551,
+      "learning_rate": 9.045084971874738e-06,
+      "loss": 0.0153,
+      "num_tokens": 1833938.0,
+      "reward": 1.74609375,
+      "reward_std": 1.2469509840011597,
+      "rewards/mathematical_correctness_reward_function/mean": 1.74609375,
+      "rewards/mathematical_correctness_reward_function/std": 1.2469509840011597,
+      "step": 12,
+      "step_time": 85.93055903300046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.72265625,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 255.0,
+      "completions/mean_length": 227.6640625,
+      "completions/mean_terminated_length": 153.8309783935547,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 1.0747726932168007,
+      "epoch": 0.3333333333333333,
+      "frac_reward_zero_std": 0.265625,
+      "grad_norm": 0.06723812222480774,
+      "learning_rate": 8.765357330018056e-06,
+      "loss": 0.0142,
+      "num_tokens": 1985174.0,
+      "reward": 1.92578125,
+      "reward_std": 1.4164161682128906,
+      "rewards/mathematical_correctness_reward_function/mean": 1.92578125,
+      "rewards/mathematical_correctness_reward_function/std": 1.4164161682128906,
+      "step": 13,
+      "step_time": 86.18083131499952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 216.9453125,
+      "completions/mean_terminated_length": 136.9761962890625,
+      "completions/min_length": 19.0,
+      "completions/min_terminated_length": 19.0,
+      "entropy": 1.048094429075718,
+      "epoch": 0.358974358974359,
+      "frac_reward_zero_std": 0.2890625,
+      "grad_norm": 0.06610685586929321,
+      "learning_rate": 8.455313244934324e-06,
+      "loss": 0.015,
+      "num_tokens": 2133660.0,
+      "reward": 1.8671875,
+      "reward_std": 1.1740400791168213,
+      "rewards/mathematical_correctness_reward_function/mean": 1.8671875,
+      "rewards/mathematical_correctness_reward_function/std": 1.1740400791168213,
+      "step": 14,
+      "step_time": 86.0845494640007
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6171875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 250.0,
+      "completions/mean_length": 207.58984375,
+      "completions/mean_terminated_length": 129.5408172607422,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "entropy": 1.1446622498333454,
+      "epoch": 0.38461538461538464,
+      "frac_reward_zero_std": 0.2734375,
+      "grad_norm": 0.07915058732032776,
+      "learning_rate": 8.117449009293668e-06,
+      "loss": 0.0438,
+      "num_tokens": 2279705.0,
+      "reward": 2.02734375,
+      "reward_std": 1.1960861682891846,
+      "rewards/mathematical_correctness_reward_function/mean": 2.02734375,
+      "rewards/mathematical_correctness_reward_function/std": 1.1960861682891846,
+      "step": 15,
+      "step_time": 86.29595133900057
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6328125,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 252.0,
+      "completions/mean_length": 209.8671875,
+      "completions/mean_terminated_length": 130.3616943359375,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "entropy": 1.0856149233877659,
+      "epoch": 0.41025641025641024,
+      "frac_reward_zero_std": 0.2421875,
+      "grad_norm": 0.07183024287223816,
+      "learning_rate": 7.754484907260513e-06,
+      "loss": 0.0186,
+      "num_tokens": 2426473.0,
+      "reward": 2.00390625,
+      "reward_std": 1.272632122039795,
+      "rewards/mathematical_correctness_reward_function/mean": 2.00390625,
+      "rewards/mathematical_correctness_reward_function/std": 1.272632122039795,
+      "step": 16,
+      "step_time": 86.06944787800421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 249.0,
+      "completions/mean_length": 203.3203125,
+      "completions/mean_terminated_length": 126.3269271850586,
+      "completions/min_length": 42.0,
+      "completions/min_terminated_length": 42.0,
+      "entropy": 1.1112723909318447,
+      "epoch": 0.4358974358974359,
+      "frac_reward_zero_std": 0.3203125,
+      "grad_norm": 0.07727660983800888,
+      "learning_rate": 7.369343312364994e-06,
+      "loss": 0.0017,
+      "num_tokens": 2571503.0,
+      "reward": 2.3515625,
+      "reward_std": 1.4637573957443237,
+      "rewards/mathematical_correctness_reward_function/mean": 2.3515625,
+      "rewards/mathematical_correctness_reward_function/std": 1.4637575149536133,
+      "step": 17,
+      "step_time": 86.03456121599811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.57421875,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 251.0,
+      "completions/mean_length": 200.82421875,
+      "completions/mean_terminated_length": 126.412841796875,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "entropy": 1.0709168612957,
+      "epoch": 0.46153846153846156,
+      "frac_reward_zero_std": 0.3671875,
+      "grad_norm": 0.07580733299255371,
+      "learning_rate": 6.965125158269619e-06,
+      "loss": 0.023,
+      "num_tokens": 2716018.0,
+      "reward": 2.24609375,
+      "reward_std": 1.2295321226119995,
+      "rewards/mathematical_correctness_reward_function/mean": 2.24609375,
+      "rewards/mathematical_correctness_reward_function/std": 1.229532241821289,
+      "step": 18,
+      "step_time": 86.06337975900169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5859375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 203.3125,
+      "completions/mean_terminated_length": 128.7547149658203,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "entropy": 1.0283005349338055,
+      "epoch": 0.48717948717948717,
+      "frac_reward_zero_std": 0.421875,
+      "grad_norm": 0.06546252965927124,
+      "learning_rate": 6.545084971874738e-06,
+      "loss": 0.0157,
+      "num_tokens": 2861190.0,
+      "reward": 2.3203125,
+      "reward_std": 1.2008728981018066,
+      "rewards/mathematical_correctness_reward_function/mean": 2.3203125,
+      "rewards/mathematical_correctness_reward_function/std": 1.2008728981018066,
+      "step": 19,
+      "step_time": 86.31491269100479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5234375,
+      "completions/max_length": 256.0,
+      "completions/max_terminated_length": 256.0,
+      "completions/mean_length": 190.8125,
+      "completions/mean_terminated_length": 119.2131118774414,
+      "completions/min_length": 43.0,
+      "completions/min_terminated_length": 43.0,
+      "entropy": 1.0640101097524166,
+      "epoch": 0.5128205128205128,
+      "frac_reward_zero_std": 0.3359375,
+      "grad_norm": 0.07372903823852539,
+      "learning_rate": 6.112604669781572e-06,
+      "loss": 0.0409,
+      "num_tokens": 3002934.0,
+      "reward": 2.54296875,
+      "reward_std": 1.3038816452026367,
+      "rewards/mathematical_correctness_reward_function/mean": 2.54296875,
+      "rewards/mathematical_correctness_reward_function/std": 1.3038816452026367,
+      "step": 20,
+      "step_time": 85.70871102700039
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 39,
+  "num_input_tokens_seen": 3002934,
+  "num_train_epochs": 1,
+  "save_steps": 10,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

models/grpo/checkpoint-20/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94f496ed302bcefe707e417d1679ee1a4274555a4b156df7e28e043c3e0d2bc9
+size 7697

models/grpo/checkpoint-20/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/grpo/checkpoint-30/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-Math-1.5B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-Math-1.5B
+- grpo
+- lora
+- transformers
+- trl
+---
+# Model Card for Qwen2.5-Math-1.5B GRPO LoRA Adapter (checkpoint-30)
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for text generation, trained with GRPO using TRL
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen/Qwen2.5-Math-1.5B
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

models/grpo/checkpoint-30/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

models/grpo/checkpoint-30/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8fea088efffa75b8bfd6d8833442fa89adcccd16446f59fc7a2a6db8de203bf
+size 295488936

models/grpo/checkpoint-30/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

models/grpo/checkpoint-30/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'Please reason step by step, and put your final answer within \\boxed{}.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}