{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d1d2de78-7197-4f6a-b690-78a424d6b326", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting torch==2.8.0 (from -r requirements.txt (line 1))\n", " Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)\n", "Collecting transformers==5.0.0 (from -r requirements.txt (line 2))\n", " Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)\n", "Collecting datasets==4.5.0 (from -r requirements.txt (line 3))\n", " Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)\n", "Collecting numpy==2.3.5 (from -r requirements.txt (line 4))\n", " Downloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.1/62.1 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml==6.0.3 in /usr/local/lib/python3.12/dist-packages (from -r requirements.txt (line 5)) (6.0.3)\n", "Collecting triton==3.4.0 (from -r requirements.txt (line 6))\n", " Downloading triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)\n", "Collecting muon-optimizer==0.1.0 (from -r requirements.txt (line 7))\n", " Downloading muon_optimizer-0.1.0-py3-none-any.whl.metadata (5.1 kB)\n", "Collecting filelock (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading filelock-3.21.2-py3-none-any.whl.metadata (2.0 kB)\n", "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.12/dist-packages (from torch==2.8.0->-r requirements.txt (line 1)) (4.15.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch==2.8.0->-r requirements.txt (line 1)) (80.9.0)\n", "Collecting sympy>=1.13.3 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)\n", "Collecting networkx (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch==2.8.0->-r requirements.txt (line 1)) (3.1.6)\n", "Collecting fsspec (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading fsspec-2026.2.0-py3-none-any.whl.metadata (10 kB)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-cublas-cu12==12.8.4.1 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cufft-cu12==11.3.3.83 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-curand-cu12==10.3.9.90 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cusolver-cu12==11.7.3.90 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-cusparse-cu12==12.5.8.93 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-cusparselt-cu12==0.7.1 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl.metadata (7.0 kB)\n", "Collecting nvidia-nccl-cu12==2.27.3 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)\n", "Collecting nvidia-nvtx-cu12==12.8.90 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-nvjitlink-cu12==12.8.93 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-cufile-cu12==1.13.1.3 (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)\n", "Collecting huggingface-hub<2.0,>=1.3.0 (from transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from transformers==5.0.0->-r requirements.txt (line 2)) (25.0)\n", "Collecting regex!=2019.12.17 (from transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting tokenizers<=0.23.0,>=0.22.0 (from transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)\n", "Collecting typer-slim (from transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading typer_slim-0.23.1-py3-none-any.whl.metadata (4.2 kB)\n", "Collecting safetensors>=0.4.3 (from transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.12/dist-packages (from transformers==5.0.0->-r requirements.txt (line 2)) (4.67.1)\n", "Collecting pyarrow>=21.0.0 (from datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n", "Collecting dill<0.4.1,>=0.3.0 (from datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)\n", "Collecting pandas (from datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.12/dist-packages (from datasets==4.5.0->-r requirements.txt (line 3)) (2.32.5)\n", "Requirement already satisfied: httpx<1.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets==4.5.0->-r requirements.txt (line 3)) (0.28.1)\n", "Collecting xxhash (from datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)\n", "Collecting multiprocess<0.70.19 (from datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)\n", "Collecting fsspec (from torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)\n", "Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)\n", "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->datasets==4.5.0->-r requirements.txt (line 3)) (4.12.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->datasets==4.5.0->-r requirements.txt (line 3)) (2025.11.12)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->datasets==4.5.0->-r requirements.txt (line 3)) (1.0.9)\n", "Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1.0.0->datasets==4.5.0->-r requirements.txt (line 3)) (3.11)\n", "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1.0.0->datasets==4.5.0->-r requirements.txt (line 3)) (0.16.0)\n", "Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=1.3.0->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n", "Collecting shellingham (from huggingface-hub<2.0,>=1.3.0->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets==4.5.0->-r requirements.txt (line 3)) (3.4.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests>=2.32.2->datasets==4.5.0->-r requirements.txt (line 3)) (2.6.2)\n", "Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch==2.8.0->-r requirements.txt (line 1))\n", " Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch==2.8.0->-r requirements.txt (line 1)) (3.0.3)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets==4.5.0->-r requirements.txt (line 3)) (2.9.0.post0)\n", "Collecting typer>=0.23.1 (from typer-slim->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading typer-0.23.1-py3-none-any.whl.metadata (16 kB)\n", "Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)\n", "Collecting aiosignal>=1.4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3)) (25.4.0)\n", "Collecting frozenlist>=1.1.1 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (20 kB)\n", "Collecting multidict<7.0,>=4.5 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.3 kB)\n", "Collecting propcache>=0.2.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)\n", "Collecting yarl<2.0,>=1.17.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets==4.5.0->-r requirements.txt (line 3))\n", " Downloading yarl-1.22.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (75 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.1/75.1 kB\u001b[0m \u001b[31m25.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets==4.5.0->-r requirements.txt (line 3)) (1.17.0)\n", "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.12/dist-packages (from typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2)) (8.3.1)\n", "Collecting rich>=10.11.0 (from typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading rich-14.3.2-py3-none-any.whl.metadata (18 kB)\n", "Collecting annotated-doc>=0.0.2 (from typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading annotated_doc-0.0.4-py3-none-any.whl.metadata (6.6 kB)\n", "Collecting markdown-it-py>=2.2.0 (from rich>=10.11.0->typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=10.11.0->typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2)) (2.19.2)\n", "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.23.1->typer-slim->transformers==5.0.0->-r requirements.txt (line 2))\n", " Downloading mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n", "Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl (887.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m887.9/887.9 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading transformers-5.0.0-py3-none-any.whl (10.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.1/10.1 MB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", "\u001b[?25hDownloading datasets-4.5.0-py3-none-any.whl (515 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m515.2/515.2 kB\u001b[0m \u001b[31m33.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.6/16.6 MB\u001b[0m \u001b[31m27.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (155.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m155.6/155.6 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading muon_optimizer-0.1.0-py3-none-any.whl (7.1 kB)\n", "Downloading nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl (594.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m594.3/594.3 MB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (10.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.2/10.2 MB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (88.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.0/88.0 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (954 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m954.8/954.8 kB\u001b[0m \u001b[31m29.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl (706.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m706.8/706.8 MB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (193.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.1/193.1 MB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl (63.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.6/63.6 MB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl (267.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m267.5/267.5 MB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (288.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m288.2/288.2 MB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl (287.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.2/287.2 MB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_nccl_cu12-2.27.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m322.4/322.4 MB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.3/39.3 MB\u001b[0m \u001b[31m22.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.0/90.0 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading dill-0.4.0-py3-none-any.whl (119 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m27.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading fsspec-2025.10.0-py3-none-any.whl (200 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m201.0/201.0 kB\u001b[0m \u001b[31m26.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading huggingface_hub-1.4.1-py3-none-any.whl (553 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m553.3/553.3 kB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multiprocess-0.70.18-py312-none-any.whl (150 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m150.3/150.3 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.6/47.6 MB\u001b[0m \u001b[31m22.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (803 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m803.6/803.6 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (507 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.2/507.2 kB\u001b[0m \u001b[31m26.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading tokenizers-0.22.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.3/3.3 MB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading filelock-3.21.2-py3-none-any.whl (21 kB)\n", "Downloading networkx-3.6.1-py3-none-any.whl (2.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (10.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/10.9 MB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", "\u001b[?25hDownloading typer_slim-0.23.1-py3-none-any.whl (3.4 kB)\n", "Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (193 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.9/193.9 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.3/3.3 MB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m536.2/536.2 kB\u001b[0m \u001b[31m28.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading typer-0.23.1-py3-none-any.whl (56 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.8/56.8 kB\u001b[0m \u001b[31m19.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)\n", "Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl (15 kB)\n", "Downloading aiosignal-1.4.0-py3-none-any.whl (7.5 kB)\n", "Downloading annotated_doc-0.0.4-py3-none-any.whl (5.3 kB)\n", "Downloading frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl (242 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m242.4/242.4 kB\u001b[0m \u001b[31m27.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (256 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m256.3/256.3 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading propcache-0.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (221 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m221.6/221.6 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading rich-14.3.2-py3-none-any.whl (309 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.0/310.0 kB\u001b[0m \u001b[31m25.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading yarl-1.22.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (377 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m377.3/377.3 kB\u001b[0m \u001b[31m28.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading markdown_it_py-4.0.0-py3-none-any.whl (87 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.3/87.3 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", "Installing collected packages: nvidia-cusparselt-cu12, muon-optimizer, mpmath, xxhash, triton, sympy, shellingham, safetensors, regex, pyarrow, propcache, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufile-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, multidict, mdurl, hf-xet, fsspec, frozenlist, filelock, dill, annotated-doc, aiohappyeyeballs, yarl, pandas, nvidia-cusparse-cu12, nvidia-cufft-cu12, nvidia-cudnn-cu12, multiprocess, markdown-it-py, aiosignal, rich, nvidia-cusolver-cu12, aiohttp, typer, torch, typer-slim, huggingface-hub, tokenizers, datasets, transformers\n", " Attempting uninstall: numpy\n", " Found existing installation: numpy 2.4.0\n", " Uninstalling numpy-2.4.0:\n", " Successfully uninstalled numpy-2.4.0\n", "Successfully installed aiohappyeyeballs-2.6.1 aiohttp-3.13.3 aiosignal-1.4.0 annotated-doc-0.0.4 datasets-4.5.0 dill-0.4.0 filelock-3.21.2 frozenlist-1.8.0 fsspec-2025.10.0 hf-xet-1.2.0 huggingface-hub-1.4.1 markdown-it-py-4.0.0 mdurl-0.1.2 mpmath-1.3.0 multidict-6.7.1 multiprocess-0.70.18 muon-optimizer-0.1.0 networkx-3.6.1 numpy-2.3.5 nvidia-cublas-cu12-12.8.4.1 nvidia-cuda-cupti-cu12-12.8.90 nvidia-cuda-nvrtc-cu12-12.8.93 nvidia-cuda-runtime-cu12-12.8.90 nvidia-cudnn-cu12-9.10.2.21 nvidia-cufft-cu12-11.3.3.83 nvidia-cufile-cu12-1.13.1.3 nvidia-curand-cu12-10.3.9.90 nvidia-cusolver-cu12-11.7.3.90 nvidia-cusparse-cu12-12.5.8.93 nvidia-cusparselt-cu12-0.7.1 nvidia-nccl-cu12-2.27.3 nvidia-nvjitlink-cu12-12.8.93 nvidia-nvtx-cu12-12.8.90 pandas-3.0.0 propcache-0.4.1 pyarrow-23.0.0 regex-2026.1.15 rich-14.3.2 safetensors-0.7.0 shellingham-1.5.4 sympy-1.14.0 tokenizers-0.22.2 torch-2.8.0 transformers-5.0.0 triton-3.4.0 typer-0.23.1 typer-slim-0.23.1 xxhash-3.6.0 yarl-1.22.0\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install -r requirements.txt" ] }, { "cell_type": "code", "execution_count": 2, "id": "7ee74299-0bb9-4600-8d0c-7249da0a4613", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading (incomplete total...): 0.00B [00:00, ?B/s]\n", "Fetching 8 files: 0%| | 0/8 [00:00flash-attn) (3.21.2)\n", "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (4.15.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (80.9.0)\n", "Requirement already satisfied: sympy>=1.13.3 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (1.14.0)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (3.6.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (3.1.6)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (2025.10.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.93)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.90)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.90)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.10.2.21 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (9.10.2.21)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.8.4.1 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.4.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.3.3.83 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (11.3.3.83)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.9.90 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (10.3.9.90)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.7.3.90 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (11.7.3.90)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.5.8.93 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.5.8.93)\n", "Requirement already satisfied: nvidia-cusparselt-cu12==0.7.1 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (0.7.1)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.27.3 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (2.27.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.8.90 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.90)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.8.93 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (12.8.93)\n", "Requirement already satisfied: nvidia-cufile-cu12==1.13.1.3 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (1.13.1.3)\n", "Requirement already satisfied: triton==3.4.0 in /usr/local/lib/python3.12/dist-packages (from torch->flash-attn) (3.4.0)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.12/dist-packages (from sympy>=1.13.3->torch->flash-attn) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->torch->flash-attn) (3.0.3)\n", "Downloading einops-0.8.2-py3-none-any.whl (65 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.6/65.6 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: flash-attn\n", " Building wheel for flash-attn (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=256040057 sha256=f25da18657a87fc83dc1bfb8b7751b82246e9db355510226b674fd437c34b5fb\n", " Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27\n", "Successfully built flash-attn\n", "Installing collected packages: einops, flash-attn\n", "Successfully installed einops-0.8.2 flash-attn-2.8.3\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install flash-attn --no-build-isolation" ] }, { "cell_type": "code", "execution_count": 4, "id": "13591c30-caf0-4f54-98bc-2a855912b11a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "W0213 17:07:15.494000 3084 torch/distributed/run.py:774] \n", "W0213 17:07:15.494000 3084 torch/distributed/run.py:774] *****************************************\n", "W0213 17:07:15.494000 3084 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", "W0213 17:07:15.494000 3084 torch/distributed/run.py:774] *****************************************\n", "======================================================================\n", "FreqFormer Training (8 GPUs, DDP)\n", "======================================================================\n", "Model: 33.2M parameters\n", "Wrapped with DDP (8 GPUs)\n", "Train: 12,311,556,581 tokens, 751437 batches of T=16384\n", "Val: 252,032,358 tokens, 15382 batches of T=16384\n", "Epoch-based training: 1 epochs x 7827 steps = 7827 total\n", "Optimizer: muon (multi-gpu)\n", " muon: 16.8M\n", " adamw: 16.4M\n", " total: 33.2M\n", "\n", "Training: steps 0→7827, B=3x8gpu, T=16384, GA=4\n", "Effective batch: 1,572,864 tokens/step\n", "Device: NVIDIA GeForce RTX 5090\n", "======================================================================\n", " step 20 | loss 7.3877 | ppl 1616.0 | 1033.6K tok/s | lr 1.90e-02 | VRAM 31448MB (gpu0)\n", " step 40 | loss 5.6588 | ppl 286.8 | 1619.3K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 60 | loss 4.7175 | ppl 111.9 | 1615.2K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 80 | loss 4.1389 | ppl 62.7 | 1611.2K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 100 | loss 3.4918 | ppl 32.8 | 1610.3K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 120 | loss 3.1136 | ppl 22.5 | 1610.6K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 140 | loss 2.9185 | ppl 18.5 | 1609.9K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 160 | loss 2.8241 | ppl 16.8 | 1609.2K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 180 | loss 2.7651 | ppl 15.9 | 1608.4K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 200 | loss 2.6475 | ppl 14.1 | 1607.7K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 220 | loss 2.5714 | ppl 13.1 | 1606.7K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 240 | loss 2.5392 | ppl 12.7 | 1592.1K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 260 | loss 2.5628 | ppl 13.0 | 1606.6K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 280 | loss 2.3898 | ppl 10.9 | 1607.6K tok/s | lr 2.00e-02 | VRAM 31448MB (gpu0)\n", " step 300 | loss 2.3881 | ppl 10.9 | 1608.6K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 320 | loss 2.4982 | ppl 12.2 | 1609.1K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 340 | loss 2.3649 | ppl 10.6 | 1610.0K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 360 | loss 2.3867 | ppl 10.9 | 1609.8K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 380 | loss 2.2988 | ppl 10.0 | 1610.1K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 400 | loss 2.4299 | ppl 11.4 | 1610.2K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 420 | loss 2.3464 | ppl 10.4 | 1610.2K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 440 | loss 2.3137 | ppl 10.1 | 1610.4K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 460 | loss 2.2432 | ppl 9.4 | 1609.7K tok/s | lr 1.99e-02 | VRAM 31448MB (gpu0)\n", " step 480 | loss 2.2175 | ppl 9.2 | 1609.8K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 500 | loss 2.3113 | ppl 10.1 | 1609.8K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 520 | loss 2.1864 | ppl 8.9 | 1610.2K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 540 | loss 2.1855 | ppl 8.9 | 1610.2K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 560 | loss 2.2751 | ppl 9.7 | 1609.2K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 580 | loss 2.2938 | ppl 9.9 | 1608.4K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 600 | loss 2.3173 | ppl 10.1 | 1607.5K tok/s | lr 1.98e-02 | VRAM 31448MB (gpu0)\n", " step 620 | loss 2.2853 | ppl 9.8 | 1606.5K tok/s | lr 1.97e-02 | VRAM 31448MB (gpu0)\n", " step 640 | loss 2.2063 | ppl 9.1 | 1605.6K tok/s | lr 1.97e-02 | VRAM 31448MB (gpu0)\n", " step 660 | loss 2.2203 | ppl 9.2 | 1605.1K tok/s | lr 1.97e-02 | VRAM 31448MB (gpu0)\n", " step 680 | loss 2.2987 | ppl 10.0 | 1605.4K tok/s | lr 1.97e-02 | VRAM 31448MB (gpu0)\n", " step 700 | loss 2.1448 | ppl 8.5 | 1605.7K tok/s | lr 1.97e-02 | VRAM 31448MB (gpu0)\n", " step 720 | loss 2.3711 | ppl 10.7 | 1606.7K tok/s | lr 1.96e-02 | VRAM 31448MB (gpu0)\n", " step 740 | loss 2.2080 | ppl 9.1 | 1607.7K tok/s | lr 1.96e-02 | VRAM 31448MB (gpu0)\n", " step 760 | loss 2.2420 | ppl 9.4 | 1609.2K tok/s | lr 1.96e-02 | VRAM 31448MB (gpu0)\n", " step 780 | loss 2.1888 | ppl 8.9 | 1610.1K tok/s | lr 1.96e-02 | VRAM 31448MB (gpu0)\n", " step 800 | loss 2.1784 | ppl 8.8 | 1609.9K tok/s | lr 1.96e-02 | VRAM 31448MB (gpu0)\n", " step 820 | loss 2.3054 | ppl 10.0 | 1609.9K tok/s | lr 1.95e-02 | VRAM 31448MB (gpu0)\n", " step 840 | loss 2.2558 | ppl 9.5 | 1610.0K tok/s | lr 1.95e-02 | VRAM 31448MB (gpu0)\n", " step 860 | loss 2.2279 | ppl 9.3 | 1610.0K tok/s | lr 1.95e-02 | VRAM 31448MB (gpu0)\n", " step 880 | loss 2.2719 | ppl 9.7 | 1609.9K tok/s | lr 1.95e-02 | VRAM 31448MB (gpu0)\n", " step 900 | loss 2.1498 | ppl 8.6 | 1610.1K tok/s | lr 1.94e-02 | VRAM 31448MB (gpu0)\n", " step 920 | loss 2.2597 | ppl 9.6 | 1610.4K tok/s | lr 1.94e-02 | VRAM 31448MB (gpu0)\n", " step 940 | loss 2.2762 | ppl 9.7 | 1610.4K tok/s | lr 1.94e-02 | VRAM 31448MB (gpu0)\n", " step 960 | loss 2.1138 | ppl 8.3 | 1610.0K tok/s | lr 1.94e-02 | VRAM 31448MB (gpu0)\n", " step 980 | loss 2.2565 | ppl 9.5 | 1610.5K tok/s | lr 1.93e-02 | VRAM 31448MB (gpu0)\n", " step 1000 | loss 2.1593 | ppl 8.7 | 1610.1K tok/s | lr 1.93e-02 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 35.8 ★ best\n", " step 1020 | loss 2.2227 | ppl 9.2 | 1290.4K tok/s | lr 1.93e-02 | VRAM 31448MB (gpu0)\n", " step 1040 | loss 2.2249 | ppl 9.3 | 1606.6K tok/s | lr 1.93e-02 | VRAM 31448MB (gpu0)\n", " step 1060 | loss 2.2821 | ppl 9.8 | 1605.8K tok/s | lr 1.92e-02 | VRAM 31448MB (gpu0)\n", " step 1080 | loss 2.2336 | ppl 9.3 | 1605.5K tok/s | lr 1.92e-02 | VRAM 31448MB (gpu0)\n", " step 1100 | loss 2.1880 | ppl 8.9 | 1605.7K tok/s | lr 1.92e-02 | VRAM 31448MB (gpu0)\n", " step 1120 | loss 2.1484 | ppl 8.6 | 1605.7K tok/s | lr 1.91e-02 | VRAM 31448MB (gpu0)\n", " step 1140 | loss 2.0647 | ppl 7.9 | 1606.0K tok/s | lr 1.91e-02 | VRAM 31448MB (gpu0)\n", " step 1160 | loss 2.1376 | ppl 8.5 | 1606.3K tok/s | lr 1.91e-02 | VRAM 31448MB (gpu0)\n", " step 1180 | loss 2.1218 | ppl 8.3 | 1607.5K tok/s | lr 1.90e-02 | VRAM 31448MB (gpu0)\n", " step 1200 | loss 2.2513 | ppl 9.5 | 1608.7K tok/s | lr 1.90e-02 | VRAM 31448MB (gpu0)\n", " step 1220 | loss 2.1479 | ppl 8.6 | 1609.4K tok/s | lr 1.90e-02 | VRAM 31448MB (gpu0)\n", " step 1240 | loss 2.2206 | ppl 9.2 | 1610.1K tok/s | lr 1.89e-02 | VRAM 31448MB (gpu0)\n", " step 1260 | loss 2.1238 | ppl 8.4 | 1610.4K tok/s | lr 1.89e-02 | VRAM 31448MB (gpu0)\n", " step 1280 | loss 2.0637 | ppl 7.9 | 1609.9K tok/s | lr 1.89e-02 | VRAM 31448MB (gpu0)\n", " step 1300 | loss 2.1668 | ppl 8.7 | 1610.2K tok/s | lr 1.88e-02 | VRAM 31448MB (gpu0)\n", " step 1320 | loss 2.0113 | ppl 7.5 | 1610.3K tok/s | lr 1.88e-02 | VRAM 31448MB (gpu0)\n", " step 1340 | loss 2.1544 | ppl 8.6 | 1610.6K tok/s | lr 1.88e-02 | VRAM 31448MB (gpu0)\n", " step 1360 | loss 2.0265 | ppl 7.6 | 1610.7K tok/s | lr 1.87e-02 | VRAM 31448MB (gpu0)\n", " step 1380 | loss 2.2100 | ppl 9.1 | 1610.6K tok/s | lr 1.87e-02 | VRAM 31448MB (gpu0)\n", " step 1400 | loss 2.1654 | ppl 8.7 | 1610.4K tok/s | lr 1.86e-02 | VRAM 31448MB (gpu0)\n", " step 1420 | loss 2.1057 | ppl 8.2 | 1610.2K tok/s | lr 1.86e-02 | VRAM 31448MB (gpu0)\n", " step 1440 | loss 2.1347 | ppl 8.5 | 1610.0K tok/s | lr 1.86e-02 | VRAM 31448MB (gpu0)\n", " step 1460 | loss 2.1475 | ppl 8.6 | 1610.1K tok/s | lr 1.85e-02 | VRAM 31448MB (gpu0)\n", " step 1480 | loss 2.0767 | ppl 8.0 | 1609.6K tok/s | lr 1.85e-02 | VRAM 31448MB (gpu0)\n", " step 1500 | loss 2.1484 | ppl 8.6 | 1608.3K tok/s | lr 1.85e-02 | VRAM 31448MB (gpu0)\n", " step 1520 | loss 2.1200 | ppl 8.3 | 1606.7K tok/s | lr 1.84e-02 | VRAM 31448MB (gpu0)\n", " step 1540 | loss 2.1800 | ppl 8.8 | 1605.9K tok/s | lr 1.84e-02 | VRAM 31448MB (gpu0)\n", " step 1560 | loss 2.1378 | ppl 8.5 | 1605.3K tok/s | lr 1.83e-02 | VRAM 31448MB (gpu0)\n", " step 1580 | loss 2.2037 | ppl 9.1 | 1605.4K tok/s | lr 1.83e-02 | VRAM 31448MB (gpu0)\n", " step 1600 | loss 2.0391 | ppl 7.7 | 1605.5K tok/s | lr 1.82e-02 | VRAM 31448MB (gpu0)\n", " step 1620 | loss 1.9915 | ppl 7.3 | 1605.8K tok/s | lr 1.82e-02 | VRAM 31448MB (gpu0)\n", " step 1640 | loss 2.1661 | ppl 8.7 | 1606.3K tok/s | lr 1.82e-02 | VRAM 31448MB (gpu0)\n", " step 1660 | loss 2.0260 | ppl 7.6 | 1607.2K tok/s | lr 1.81e-02 | VRAM 31448MB (gpu0)\n", " step 1680 | loss 2.1435 | ppl 8.5 | 1608.5K tok/s | lr 1.81e-02 | VRAM 31448MB (gpu0)\n", " step 1700 | loss 2.0736 | ppl 8.0 | 1609.1K tok/s | lr 1.80e-02 | VRAM 31448MB (gpu0)\n", " step 1720 | loss 2.0491 | ppl 7.8 | 1610.0K tok/s | lr 1.80e-02 | VRAM 31448MB (gpu0)\n", " step 1740 | loss 2.0629 | ppl 7.9 | 1609.9K tok/s | lr 1.79e-02 | VRAM 31448MB (gpu0)\n", " step 1760 | loss 2.1076 | ppl 8.2 | 1610.3K tok/s | lr 1.79e-02 | VRAM 31448MB (gpu0)\n", " step 1780 | loss 2.1220 | ppl 8.3 | 1609.9K tok/s | lr 1.78e-02 | VRAM 31448MB (gpu0)\n", " step 1800 | loss 1.9703 | ppl 7.2 | 1609.8K tok/s | lr 1.78e-02 | VRAM 31448MB (gpu0)\n", " step 1820 | loss 2.1035 | ppl 8.2 | 1609.8K tok/s | lr 1.77e-02 | VRAM 31448MB (gpu0)\n", " step 1840 | loss 2.0056 | ppl 7.4 | 1610.9K tok/s | lr 1.77e-02 | VRAM 31448MB (gpu0)\n", " step 1860 | loss 2.1211 | ppl 8.3 | 1611.0K tok/s | lr 1.76e-02 | VRAM 31448MB (gpu0)\n", " step 1880 | loss 2.1725 | ppl 8.8 | 1610.7K tok/s | lr 1.76e-02 | VRAM 31448MB (gpu0)\n", " step 1900 | loss 2.0434 | ppl 7.7 | 1610.7K tok/s | lr 1.75e-02 | VRAM 31448MB (gpu0)\n", " step 1920 | loss 2.1326 | ppl 8.4 | 1610.6K tok/s | lr 1.75e-02 | VRAM 31448MB (gpu0)\n", " step 1940 | loss 2.1505 | ppl 8.6 | 1610.5K tok/s | lr 1.74e-02 | VRAM 31448MB (gpu0)\n", " step 1960 | loss 2.1752 | ppl 8.8 | 1610.2K tok/s | lr 1.74e-02 | VRAM 31448MB (gpu0)\n", " step 1980 | loss 2.0641 | ppl 7.9 | 1610.1K tok/s | lr 1.73e-02 | VRAM 31448MB (gpu0)\n", " step 2000 | loss 2.0983 | ppl 8.2 | 1608.4K tok/s | lr 1.73e-02 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 32.2 ★ best\n", " step 2020 | loss 2.1343 | ppl 8.5 | 1297.8K tok/s | lr 1.72e-02 | VRAM 31448MB (gpu0)\n", " step 2040 | loss 2.1439 | ppl 8.5 | 1606.0K tok/s | lr 1.72e-02 | VRAM 31448MB (gpu0)\n", " step 2060 | loss 2.0963 | ppl 8.1 | 1605.6K tok/s | lr 1.71e-02 | VRAM 31448MB (gpu0)\n", " step 2080 | loss 2.1733 | ppl 8.8 | 1605.9K tok/s | lr 1.71e-02 | VRAM 31448MB (gpu0)\n", " step 2100 | loss 1.9717 | ppl 7.2 | 1606.1K tok/s | lr 1.70e-02 | VRAM 31448MB (gpu0)\n", " step 2120 | loss 1.9996 | ppl 7.4 | 1606.3K tok/s | lr 1.70e-02 | VRAM 31448MB (gpu0)\n", " step 2140 | loss 2.0215 | ppl 7.5 | 1606.8K tok/s | lr 1.69e-02 | VRAM 31448MB (gpu0)\n", " step 2160 | loss 2.0760 | ppl 8.0 | 1608.0K tok/s | lr 1.69e-02 | VRAM 31448MB (gpu0)\n", " step 2180 | loss 2.0273 | ppl 7.6 | 1608.5K tok/s | lr 1.68e-02 | VRAM 31448MB (gpu0)\n", " step 2200 | loss 2.1568 | ppl 8.6 | 1609.8K tok/s | lr 1.68e-02 | VRAM 31448MB (gpu0)\n", " step 2220 | loss 2.0358 | ppl 7.7 | 1609.8K tok/s | lr 1.67e-02 | VRAM 31448MB (gpu0)\n", " step 2240 | loss 2.0068 | ppl 7.4 | 1609.9K tok/s | lr 1.66e-02 | VRAM 31448MB (gpu0)\n", " step 2260 | loss 2.0378 | ppl 7.7 | 1610.1K tok/s | lr 1.66e-02 | VRAM 31448MB (gpu0)\n", " step 2280 | loss 2.0892 | ppl 8.1 | 1609.6K tok/s | lr 1.65e-02 | VRAM 31448MB (gpu0)\n", " step 2300 | loss 2.0398 | ppl 7.7 | 1609.8K tok/s | lr 1.65e-02 | VRAM 31448MB (gpu0)\n", " step 2320 | loss 2.0969 | ppl 8.1 | 1610.1K tok/s | lr 1.64e-02 | VRAM 31448MB (gpu0)\n", " step 2340 | loss 2.1048 | ppl 8.2 | 1610.5K tok/s | lr 1.64e-02 | VRAM 31448MB (gpu0)\n", " step 2360 | loss 1.9970 | ppl 7.4 | 1610.3K tok/s | lr 1.63e-02 | VRAM 31448MB (gpu0)\n", " step 2380 | loss 2.0839 | ppl 8.0 | 1610.2K tok/s | lr 1.62e-02 | VRAM 31448MB (gpu0)\n", " step 2400 | loss 2.1341 | ppl 8.4 | 1610.3K tok/s | lr 1.62e-02 | VRAM 31448MB (gpu0)\n", " step 2420 | loss 2.0340 | ppl 7.6 | 1610.0K tok/s | lr 1.61e-02 | VRAM 31448MB (gpu0)\n", " step 2440 | loss 2.0412 | ppl 7.7 | 1609.7K tok/s | lr 1.61e-02 | VRAM 31448MB (gpu0)\n", " step 2460 | loss 2.1370 | ppl 8.5 | 1609.6K tok/s | lr 1.60e-02 | VRAM 31448MB (gpu0)\n", " step 2480 | loss 1.8913 | ppl 6.6 | 1609.5K tok/s | lr 1.59e-02 | VRAM 31448MB (gpu0)\n", " step 2500 | loss 2.2418 | ppl 9.4 | 1608.2K tok/s | lr 1.59e-02 | VRAM 31448MB (gpu0)\n", " step 2520 | loss 2.1068 | ppl 8.2 | 1606.9K tok/s | lr 1.58e-02 | VRAM 31448MB (gpu0)\n", " step 2540 | loss 2.0135 | ppl 7.5 | 1606.2K tok/s | lr 1.58e-02 | VRAM 31448MB (gpu0)\n", " step 2560 | loss 2.0952 | ppl 8.1 | 1605.9K tok/s | lr 1.57e-02 | VRAM 31448MB (gpu0)\n", " step 2580 | loss 2.0622 | ppl 7.9 | 1605.8K tok/s | lr 1.56e-02 | VRAM 31448MB (gpu0)\n", " step 2600 | loss 2.0545 | ppl 7.8 | 1605.6K tok/s | lr 1.56e-02 | VRAM 31448MB (gpu0)\n", " step 2620 | loss 2.0261 | ppl 7.6 | 1605.3K tok/s | lr 1.55e-02 | VRAM 31448MB (gpu0)\n", " step 2640 | loss 2.0583 | ppl 7.8 | 1607.0K tok/s | lr 1.54e-02 | VRAM 31448MB (gpu0)\n", " step 2660 | loss 1.9980 | ppl 7.4 | 1607.6K tok/s | lr 1.54e-02 | VRAM 31448MB (gpu0)\n", " step 2680 | loss 2.1522 | ppl 8.6 | 1608.7K tok/s | lr 1.53e-02 | VRAM 31448MB (gpu0)\n", " step 2700 | loss 2.0973 | ppl 8.1 | 1609.4K tok/s | lr 1.53e-02 | VRAM 31448MB (gpu0)\n", " step 2720 | loss 1.9967 | ppl 7.4 | 1610.1K tok/s | lr 1.52e-02 | VRAM 31448MB (gpu0)\n", " step 2740 | loss 2.0873 | ppl 8.1 | 1610.2K tok/s | lr 1.51e-02 | VRAM 31448MB (gpu0)\n", " step 2760 | loss 2.0063 | ppl 7.4 | 1610.3K tok/s | lr 1.51e-02 | VRAM 31448MB (gpu0)\n", " step 2780 | loss 2.0674 | ppl 7.9 | 1610.1K tok/s | lr 1.50e-02 | VRAM 31448MB (gpu0)\n", " step 2800 | loss 2.1781 | ppl 8.8 | 1610.3K tok/s | lr 1.49e-02 | VRAM 31448MB (gpu0)\n", " step 2820 | loss 2.1187 | ppl 8.3 | 1610.3K tok/s | lr 1.49e-02 | VRAM 31448MB (gpu0)\n", " step 2840 | loss 1.9948 | ppl 7.4 | 1610.1K tok/s | lr 1.48e-02 | VRAM 31448MB (gpu0)\n", " step 2860 | loss 1.9749 | ppl 7.2 | 1610.4K tok/s | lr 1.47e-02 | VRAM 31448MB (gpu0)\n", " step 2880 | loss 2.0186 | ppl 7.5 | 1610.5K tok/s | lr 1.47e-02 | VRAM 31448MB (gpu0)\n", " step 2900 | loss 2.0017 | ppl 7.4 | 1610.5K tok/s | lr 1.46e-02 | VRAM 31448MB (gpu0)\n", " step 2920 | loss 1.9965 | ppl 7.4 | 1610.4K tok/s | lr 1.45e-02 | VRAM 31448MB (gpu0)\n", " step 2940 | loss 2.0631 | ppl 7.9 | 1610.4K tok/s | lr 1.45e-02 | VRAM 31448MB (gpu0)\n", " step 2960 | loss 2.1090 | ppl 8.2 | 1610.1K tok/s | lr 1.44e-02 | VRAM 31448MB (gpu0)\n", " step 2980 | loss 2.0849 | ppl 8.0 | 1610.1K tok/s | lr 1.43e-02 | VRAM 31448MB (gpu0)\n", " step 3000 | loss 2.0742 | ppl 8.0 | 1609.4K tok/s | lr 1.43e-02 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 30.3 ★ best\n", " step 3020 | loss 2.0531 | ppl 7.8 | 1298.1K tok/s | lr 1.42e-02 | VRAM 31448MB (gpu0)\n", " step 3040 | loss 2.0449 | ppl 7.7 | 1607.4K tok/s | lr 1.41e-02 | VRAM 31448MB (gpu0)\n", " step 3060 | loss 2.0201 | ppl 7.5 | 1606.9K tok/s | lr 1.41e-02 | VRAM 31448MB (gpu0)\n", " step 3080 | loss 1.9437 | ppl 7.0 | 1607.1K tok/s | lr 1.40e-02 | VRAM 31448MB (gpu0)\n", " step 3100 | loss 1.9672 | ppl 7.2 | 1607.0K tok/s | lr 1.39e-02 | VRAM 31448MB (gpu0)\n", " step 3120 | loss 1.8966 | ppl 6.7 | 1607.0K tok/s | lr 1.39e-02 | VRAM 31448MB (gpu0)\n", " step 3140 | loss 2.0152 | ppl 7.5 | 1607.4K tok/s | lr 1.38e-02 | VRAM 31448MB (gpu0)\n", " step 3160 | loss 1.8725 | ppl 6.5 | 1607.8K tok/s | lr 1.37e-02 | VRAM 31448MB (gpu0)\n", " step 3180 | loss 1.9711 | ppl 7.2 | 1608.5K tok/s | lr 1.37e-02 | VRAM 31448MB (gpu0)\n", " step 3200 | loss 2.1174 | ppl 8.3 | 1609.6K tok/s | lr 1.36e-02 | VRAM 31448MB (gpu0)\n", " step 3220 | loss 2.0913 | ppl 8.1 | 1610.1K tok/s | lr 1.35e-02 | VRAM 31448MB (gpu0)\n", " step 3240 | loss 2.0472 | ppl 7.7 | 1610.5K tok/s | lr 1.34e-02 | VRAM 31448MB (gpu0)\n", " step 3260 | loss 1.9597 | ppl 7.1 | 1610.3K tok/s | lr 1.34e-02 | VRAM 31448MB (gpu0)\n", " step 3280 | loss 2.0370 | ppl 7.7 | 1610.2K tok/s | lr 1.33e-02 | VRAM 31448MB (gpu0)\n", " step 3300 | loss 1.9662 | ppl 7.1 | 1610.2K tok/s | lr 1.32e-02 | VRAM 31448MB (gpu0)\n", " step 3320 | loss 2.0168 | ppl 7.5 | 1610.4K tok/s | lr 1.32e-02 | VRAM 31448MB (gpu0)\n", " step 3340 | loss 1.9211 | ppl 6.8 | 1610.2K tok/s | lr 1.31e-02 | VRAM 31448MB (gpu0)\n", " step 3360 | loss 2.1277 | ppl 8.4 | 1610.3K tok/s | lr 1.30e-02 | VRAM 31448MB (gpu0)\n", " step 3380 | loss 1.9202 | ppl 6.8 | 1610.2K tok/s | lr 1.30e-02 | VRAM 31448MB (gpu0)\n", " step 3400 | loss 2.0552 | ppl 7.8 | 1610.3K tok/s | lr 1.29e-02 | VRAM 31448MB (gpu0)\n", " step 3420 | loss 2.1132 | ppl 8.3 | 1610.1K tok/s | lr 1.28e-02 | VRAM 31448MB (gpu0)\n", " step 3440 | loss 2.0328 | ppl 7.6 | 1610.1K tok/s | lr 1.27e-02 | VRAM 31448MB (gpu0)\n", " step 3460 | loss 2.0330 | ppl 7.6 | 1609.9K tok/s | lr 1.27e-02 | VRAM 31448MB (gpu0)\n", " step 3480 | loss 2.0962 | ppl 8.1 | 1609.9K tok/s | lr 1.26e-02 | VRAM 31448MB (gpu0)\n", " step 3500 | loss 2.0655 | ppl 7.9 | 1609.3K tok/s | lr 1.25e-02 | VRAM 31448MB (gpu0)\n", " step 3520 | loss 1.9846 | ppl 7.3 | 1608.2K tok/s | lr 1.25e-02 | VRAM 31448MB (gpu0)\n", " step 3540 | loss 2.0917 | ppl 8.1 | 1607.4K tok/s | lr 1.24e-02 | VRAM 31448MB (gpu0)\n", " step 3560 | loss 2.1021 | ppl 8.2 | 1606.4K tok/s | lr 1.23e-02 | VRAM 31448MB (gpu0)\n", " step 3580 | loss 2.1074 | ppl 8.2 | 1606.1K tok/s | lr 1.22e-02 | VRAM 31448MB (gpu0)\n", " step 3600 | loss 1.9580 | ppl 7.1 | 1606.5K tok/s | lr 1.22e-02 | VRAM 31448MB (gpu0)\n", " step 3620 | loss 1.9768 | ppl 7.2 | 1607.2K tok/s | lr 1.21e-02 | VRAM 31448MB (gpu0)\n", " step 3640 | loss 2.0425 | ppl 7.7 | 1607.4K tok/s | lr 1.20e-02 | VRAM 31448MB (gpu0)\n", " step 3660 | loss 2.0235 | ppl 7.6 | 1607.9K tok/s | lr 1.20e-02 | VRAM 31448MB (gpu0)\n", " step 3680 | loss 2.0153 | ppl 7.5 | 1607.3K tok/s | lr 1.19e-02 | VRAM 31448MB (gpu0)\n", " step 3700 | loss 1.8855 | ppl 6.6 | 1609.3K tok/s | lr 1.18e-02 | VRAM 31448MB (gpu0)\n", " step 3720 | loss 1.9656 | ppl 7.1 | 1610.2K tok/s | lr 1.17e-02 | VRAM 31448MB (gpu0)\n", " step 3740 | loss 2.0532 | ppl 7.8 | 1610.8K tok/s | lr 1.17e-02 | VRAM 31448MB (gpu0)\n", " step 3760 | loss 2.0347 | ppl 7.7 | 1610.8K tok/s | lr 1.16e-02 | VRAM 31448MB (gpu0)\n", " step 3780 | loss 2.0256 | ppl 7.6 | 1610.7K tok/s | lr 1.15e-02 | VRAM 31448MB (gpu0)\n", " step 3800 | loss 2.0297 | ppl 7.6 | 1610.5K tok/s | lr 1.15e-02 | VRAM 31448MB (gpu0)\n", " step 3820 | loss 1.9312 | ppl 6.9 | 1610.2K tok/s | lr 1.14e-02 | VRAM 31448MB (gpu0)\n", " step 3840 | loss 2.0593 | ppl 7.8 | 1610.7K tok/s | lr 1.13e-02 | VRAM 31448MB (gpu0)\n", " step 3860 | loss 1.9678 | ppl 7.2 | 1610.9K tok/s | lr 1.12e-02 | VRAM 31448MB (gpu0)\n", " step 3880 | loss 1.9638 | ppl 7.1 | 1610.9K tok/s | lr 1.12e-02 | VRAM 31448MB (gpu0)\n", " step 3900 | loss 1.9963 | ppl 7.4 | 1610.8K tok/s | lr 1.11e-02 | VRAM 31448MB (gpu0)\n", " step 3920 | loss 2.1167 | ppl 8.3 | 1610.7K tok/s | lr 1.10e-02 | VRAM 31448MB (gpu0)\n", " step 3940 | loss 2.0226 | ppl 7.6 | 1610.5K tok/s | lr 1.09e-02 | VRAM 31448MB (gpu0)\n", " step 3960 | loss 2.0221 | ppl 7.6 | 1610.4K tok/s | lr 1.09e-02 | VRAM 31448MB (gpu0)\n", " step 3980 | loss 1.9330 | ppl 6.9 | 1610.6K tok/s | lr 1.08e-02 | VRAM 31448MB (gpu0)\n", " step 4000 | loss 2.0708 | ppl 7.9 | 1610.5K tok/s | lr 1.07e-02 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 28.8 ★ best\n", " >>> saved checkpoints/pretrain/step_0004000.pt\n", " step 4020 | loss 2.0722 | ppl 7.9 | 1294.3K tok/s | lr 1.07e-02 | VRAM 31448MB (gpu0)\n", " step 4040 | loss 2.0550 | ppl 7.8 | 1608.3K tok/s | lr 1.06e-02 | VRAM 31448MB (gpu0)\n", " step 4060 | loss 1.8754 | ppl 6.5 | 1606.0K tok/s | lr 1.05e-02 | VRAM 31448MB (gpu0)\n", " step 4080 | loss 1.9837 | ppl 7.3 | 1606.2K tok/s | lr 1.04e-02 | VRAM 31448MB (gpu0)\n", " step 4100 | loss 2.0320 | ppl 7.6 | 1606.3K tok/s | lr 1.04e-02 | VRAM 31448MB (gpu0)\n", " step 4120 | loss 1.9634 | ppl 7.1 | 1606.3K tok/s | lr 1.03e-02 | VRAM 31448MB (gpu0)\n", " step 4140 | loss 1.9511 | ppl 7.0 | 1606.6K tok/s | lr 1.02e-02 | VRAM 31448MB (gpu0)\n", " step 4160 | loss 2.0010 | ppl 7.4 | 1606.9K tok/s | lr 1.01e-02 | VRAM 31448MB (gpu0)\n", " step 4180 | loss 1.9587 | ppl 7.1 | 1607.4K tok/s | lr 1.01e-02 | VRAM 31448MB (gpu0)\n", " step 4200 | loss 2.0787 | ppl 8.0 | 1608.2K tok/s | lr 1.00e-02 | VRAM 31448MB (gpu0)\n", " step 4220 | loss 1.9624 | ppl 7.1 | 1609.3K tok/s | lr 9.93e-03 | VRAM 31448MB (gpu0)\n", " step 4240 | loss 1.9388 | ppl 7.0 | 1609.7K tok/s | lr 9.86e-03 | VRAM 31448MB (gpu0)\n", " step 4260 | loss 1.9974 | ppl 7.4 | 1609.3K tok/s | lr 9.79e-03 | VRAM 31448MB (gpu0)\n", " step 4280 | loss 2.0030 | ppl 7.4 | 1609.3K tok/s | lr 9.72e-03 | VRAM 31448MB (gpu0)\n", " step 4300 | loss 2.0589 | ppl 7.8 | 1609.6K tok/s | lr 9.65e-03 | VRAM 31448MB (gpu0)\n", " step 4320 | loss 1.9713 | ppl 7.2 | 1609.7K tok/s | lr 9.57e-03 | VRAM 31448MB (gpu0)\n", " step 4340 | loss 1.9705 | ppl 7.2 | 1610.0K tok/s | lr 9.50e-03 | VRAM 31448MB (gpu0)\n", " step 4360 | loss 2.0488 | ppl 7.8 | 1610.2K tok/s | lr 9.43e-03 | VRAM 31448MB (gpu0)\n", " step 4380 | loss 1.8644 | ppl 6.5 | 1610.0K tok/s | lr 9.36e-03 | VRAM 31448MB (gpu0)\n", " step 4400 | loss 2.1170 | ppl 8.3 | 1610.2K tok/s | lr 9.29e-03 | VRAM 31448MB (gpu0)\n", " step 4420 | loss 2.0438 | ppl 7.7 | 1610.0K tok/s | lr 9.22e-03 | VRAM 31448MB (gpu0)\n", " step 4440 | loss 1.9366 | ppl 6.9 | 1609.8K tok/s | lr 9.15e-03 | VRAM 31448MB (gpu0)\n", " step 4460 | loss 2.0579 | ppl 7.8 | 1609.6K tok/s | lr 9.08e-03 | VRAM 31448MB (gpu0)\n", " step 4480 | loss 1.9404 | ppl 7.0 | 1609.7K tok/s | lr 9.00e-03 | VRAM 31448MB (gpu0)\n", " step 4500 | loss 1.9999 | ppl 7.4 | 1609.8K tok/s | lr 8.93e-03 | VRAM 31448MB (gpu0)\n", " step 4520 | loss 2.0660 | ppl 7.9 | 1609.1K tok/s | lr 8.86e-03 | VRAM 31448MB (gpu0)\n", " step 4540 | loss 2.0084 | ppl 7.5 | 1607.9K tok/s | lr 8.79e-03 | VRAM 31448MB (gpu0)\n", " step 4560 | loss 1.9458 | ppl 7.0 | 1606.8K tok/s | lr 8.72e-03 | VRAM 31448MB (gpu0)\n", " step 4580 | loss 2.0112 | ppl 7.5 | 1605.7K tok/s | lr 8.65e-03 | VRAM 31448MB (gpu0)\n", " step 4600 | loss 1.9962 | ppl 7.4 | 1605.3K tok/s | lr 8.58e-03 | VRAM 31448MB (gpu0)\n", " step 4620 | loss 1.9305 | ppl 6.9 | 1606.0K tok/s | lr 8.51e-03 | VRAM 31448MB (gpu0)\n", " step 4640 | loss 1.9810 | ppl 7.3 | 1606.5K tok/s | lr 8.44e-03 | VRAM 31448MB (gpu0)\n", " step 4660 | loss 1.9487 | ppl 7.0 | 1606.9K tok/s | lr 8.37e-03 | VRAM 31448MB (gpu0)\n", " step 4680 | loss 1.9423 | ppl 7.0 | 1607.7K tok/s | lr 8.31e-03 | VRAM 31448MB (gpu0)\n", " step 4700 | loss 1.9552 | ppl 7.1 | 1608.1K tok/s | lr 8.24e-03 | VRAM 31448MB (gpu0)\n", " step 4720 | loss 1.8697 | ppl 6.5 | 1608.9K tok/s | lr 8.17e-03 | VRAM 31448MB (gpu0)\n", " step 4740 | loss 1.9725 | ppl 7.2 | 1609.6K tok/s | lr 8.10e-03 | VRAM 31448MB (gpu0)\n", " step 4760 | loss 1.8956 | ppl 6.7 | 1609.7K tok/s | lr 8.03e-03 | VRAM 31448MB (gpu0)\n", " step 4780 | loss 1.8904 | ppl 6.6 | 1609.8K tok/s | lr 7.96e-03 | VRAM 31448MB (gpu0)\n", " step 4800 | loss 1.9628 | ppl 7.1 | 1609.5K tok/s | lr 7.89e-03 | VRAM 31448MB (gpu0)\n", " step 4820 | loss 2.0439 | ppl 7.7 | 1609.8K tok/s | lr 7.83e-03 | VRAM 31448MB (gpu0)\n", " step 4840 | loss 2.0295 | ppl 7.6 | 1609.8K tok/s | lr 7.76e-03 | VRAM 31448MB (gpu0)\n", " step 4860 | loss 1.8887 | ppl 6.6 | 1610.1K tok/s | lr 7.69e-03 | VRAM 31448MB (gpu0)\n", " step 4880 | loss 2.0331 | ppl 7.6 | 1610.2K tok/s | lr 7.62e-03 | VRAM 31448MB (gpu0)\n", " step 4900 | loss 1.8703 | ppl 6.5 | 1610.3K tok/s | lr 7.56e-03 | VRAM 31448MB (gpu0)\n", " step 4920 | loss 1.9509 | ppl 7.0 | 1610.1K tok/s | lr 7.49e-03 | VRAM 31448MB (gpu0)\n", " step 4940 | loss 1.9827 | ppl 7.3 | 1610.1K tok/s | lr 7.42e-03 | VRAM 31448MB (gpu0)\n", " step 4960 | loss 1.8391 | ppl 6.3 | 1609.9K tok/s | lr 7.36e-03 | VRAM 31448MB (gpu0)\n", " step 4980 | loss 1.8900 | ppl 6.6 | 1609.9K tok/s | lr 7.29e-03 | VRAM 31448MB (gpu0)\n", " step 5000 | loss 1.9576 | ppl 7.1 | 1609.7K tok/s | lr 7.23e-03 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 27.9 ★ best\n", " step 5020 | loss 2.0607 | ppl 7.9 | 1300.0K tok/s | lr 7.16e-03 | VRAM 31448MB (gpu0)\n", " step 5040 | loss 2.0121 | ppl 7.5 | 1608.4K tok/s | lr 7.09e-03 | VRAM 31448MB (gpu0)\n", " step 5060 | loss 1.8506 | ppl 6.4 | 1607.5K tok/s | lr 7.03e-03 | VRAM 31448MB (gpu0)\n", " step 5080 | loss 1.9572 | ppl 7.1 | 1606.5K tok/s | lr 6.96e-03 | VRAM 31448MB (gpu0)\n", " step 5100 | loss 1.9674 | ppl 7.2 | 1606.3K tok/s | lr 6.90e-03 | VRAM 31448MB (gpu0)\n", " step 5120 | loss 1.9142 | ppl 6.8 | 1606.1K tok/s | lr 6.84e-03 | VRAM 31448MB (gpu0)\n", " step 5140 | loss 2.0374 | ppl 7.7 | 1606.5K tok/s | lr 6.77e-03 | VRAM 31448MB (gpu0)\n", " step 5160 | loss 1.9355 | ppl 6.9 | 1606.8K tok/s | lr 6.71e-03 | VRAM 31448MB (gpu0)\n", " step 5180 | loss 1.9416 | ppl 7.0 | 1607.5K tok/s | lr 6.64e-03 | VRAM 31448MB (gpu0)\n", " step 5200 | loss 1.9186 | ppl 6.8 | 1607.9K tok/s | lr 6.58e-03 | VRAM 31448MB (gpu0)\n", " step 5220 | loss 1.9515 | ppl 7.0 | 1609.3K tok/s | lr 6.52e-03 | VRAM 31448MB (gpu0)\n", " step 5240 | loss 1.8576 | ppl 6.4 | 1610.0K tok/s | lr 6.46e-03 | VRAM 31448MB (gpu0)\n", " step 5260 | loss 2.0489 | ppl 7.8 | 1609.9K tok/s | lr 6.39e-03 | VRAM 31448MB (gpu0)\n", " step 5280 | loss 1.9646 | ppl 7.1 | 1609.8K tok/s | lr 6.33e-03 | VRAM 31448MB (gpu0)\n", " step 5300 | loss 1.8471 | ppl 6.3 | 1609.8K tok/s | lr 6.27e-03 | VRAM 31448MB (gpu0)\n", " step 5320 | loss 2.0385 | ppl 7.7 | 1609.8K tok/s | lr 6.21e-03 | VRAM 31448MB (gpu0)\n", " step 5340 | loss 1.9771 | ppl 7.2 | 1610.4K tok/s | lr 6.15e-03 | VRAM 31448MB (gpu0)\n", " step 5360 | loss 1.9362 | ppl 6.9 | 1610.6K tok/s | lr 6.09e-03 | VRAM 31448MB (gpu0)\n", " step 5380 | loss 1.8951 | ppl 6.7 | 1610.5K tok/s | lr 6.02e-03 | VRAM 31448MB (gpu0)\n", " step 5400 | loss 2.0317 | ppl 7.6 | 1610.7K tok/s | lr 5.96e-03 | VRAM 31448MB (gpu0)\n", " step 5420 | loss 2.0362 | ppl 7.7 | 1610.8K tok/s | lr 5.90e-03 | VRAM 31448MB (gpu0)\n", " step 5440 | loss 2.0197 | ppl 7.5 | 1609.6K tok/s | lr 5.85e-03 | VRAM 31448MB (gpu0)\n", " step 5460 | loss 2.0168 | ppl 7.5 | 1609.5K tok/s | lr 5.79e-03 | VRAM 31448MB (gpu0)\n", " step 5480 | loss 2.0258 | ppl 7.6 | 1609.3K tok/s | lr 5.73e-03 | VRAM 31448MB (gpu0)\n", " step 5500 | loss 1.9372 | ppl 6.9 | 1608.9K tok/s | lr 5.67e-03 | VRAM 31448MB (gpu0)\n", " step 5520 | loss 1.8314 | ppl 6.2 | 1609.0K tok/s | lr 5.61e-03 | VRAM 31448MB (gpu0)\n", " step 5540 | loss 1.8612 | ppl 6.4 | 1607.6K tok/s | lr 5.55e-03 | VRAM 31448MB (gpu0)\n", " step 5560 | loss 1.8851 | ppl 6.6 | 1607.1K tok/s | lr 5.50e-03 | VRAM 31448MB (gpu0)\n", " step 5580 | loss 2.0113 | ppl 7.5 | 1605.9K tok/s | lr 5.44e-03 | VRAM 31448MB (gpu0)\n", " step 5600 | loss 1.9030 | ppl 6.7 | 1605.9K tok/s | lr 5.38e-03 | VRAM 31448MB (gpu0)\n", " step 5620 | loss 1.9369 | ppl 6.9 | 1606.0K tok/s | lr 5.32e-03 | VRAM 31448MB (gpu0)\n", " step 5640 | loss 1.8911 | ppl 6.6 | 1606.1K tok/s | lr 5.27e-03 | VRAM 31448MB (gpu0)\n", " step 5660 | loss 1.8896 | ppl 6.6 | 1606.5K tok/s | lr 5.21e-03 | VRAM 31448MB (gpu0)\n", " step 5680 | loss 2.0139 | ppl 7.5 | 1607.3K tok/s | lr 5.16e-03 | VRAM 31448MB (gpu0)\n", " step 5700 | loss 2.0731 | ppl 7.9 | 1607.9K tok/s | lr 5.10e-03 | VRAM 31448MB (gpu0)\n", " step 5720 | loss 2.0450 | ppl 7.7 | 1608.7K tok/s | lr 5.05e-03 | VRAM 31448MB (gpu0)\n", " step 5740 | loss 2.0806 | ppl 8.0 | 1609.7K tok/s | lr 4.99e-03 | VRAM 31448MB (gpu0)\n", " step 5760 | loss 1.9182 | ppl 6.8 | 1610.2K tok/s | lr 4.94e-03 | VRAM 31448MB (gpu0)\n", " step 5780 | loss 1.9837 | ppl 7.3 | 1609.8K tok/s | lr 4.89e-03 | VRAM 31448MB (gpu0)\n", " step 5800 | loss 1.8855 | ppl 6.6 | 1609.9K tok/s | lr 4.83e-03 | VRAM 31448MB (gpu0)\n", " step 5820 | loss 1.9811 | ppl 7.3 | 1609.8K tok/s | lr 4.78e-03 | VRAM 31448MB (gpu0)\n", " step 5840 | loss 1.9759 | ppl 7.2 | 1610.1K tok/s | lr 4.73e-03 | VRAM 31448MB (gpu0)\n", " step 5860 | loss 1.9156 | ppl 6.8 | 1610.3K tok/s | lr 4.68e-03 | VRAM 31448MB (gpu0)\n", " step 5880 | loss 1.9288 | ppl 6.9 | 1610.2K tok/s | lr 4.63e-03 | VRAM 31448MB (gpu0)\n", " step 5900 | loss 1.9313 | ppl 6.9 | 1610.3K tok/s | lr 4.58e-03 | VRAM 31448MB (gpu0)\n", " step 5920 | loss 1.9474 | ppl 7.0 | 1610.0K tok/s | lr 4.52e-03 | VRAM 31448MB (gpu0)\n", " step 5940 | loss 2.0303 | ppl 7.6 | 1610.1K tok/s | lr 4.47e-03 | VRAM 31448MB (gpu0)\n", " step 5960 | loss 2.0016 | ppl 7.4 | 1610.0K tok/s | lr 4.43e-03 | VRAM 31448MB (gpu0)\n", " step 5980 | loss 1.8876 | ppl 6.6 | 1609.9K tok/s | lr 4.38e-03 | VRAM 31448MB (gpu0)\n", " step 6000 | loss 2.0180 | ppl 7.5 | 1610.0K tok/s | lr 4.33e-03 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 27.3 ★ best\n", " step 6020 | loss 2.0032 | ppl 7.4 | 1299.4K tok/s | lr 4.28e-03 | VRAM 31448MB (gpu0)\n", " step 6040 | loss 1.9294 | ppl 6.9 | 1608.1K tok/s | lr 4.23e-03 | VRAM 31448MB (gpu0)\n", " step 6060 | loss 1.8339 | ppl 6.3 | 1607.4K tok/s | lr 4.18e-03 | VRAM 31448MB (gpu0)\n", " step 6080 | loss 1.8727 | ppl 6.5 | 1606.2K tok/s | lr 4.14e-03 | VRAM 31448MB (gpu0)\n", " step 6100 | loss 1.9415 | ppl 7.0 | 1605.9K tok/s | lr 4.09e-03 | VRAM 31448MB (gpu0)\n", " step 6120 | loss 2.0533 | ppl 7.8 | 1605.8K tok/s | lr 4.04e-03 | VRAM 31448MB (gpu0)\n", " step 6140 | loss 1.8949 | ppl 6.7 | 1605.8K tok/s | lr 4.00e-03 | VRAM 31448MB (gpu0)\n", " step 6160 | loss 2.0269 | ppl 7.6 | 1606.5K tok/s | lr 3.95e-03 | VRAM 31448MB (gpu0)\n", " step 6180 | loss 2.0380 | ppl 7.7 | 1607.0K tok/s | lr 3.91e-03 | VRAM 31448MB (gpu0)\n", " step 6200 | loss 1.9103 | ppl 6.8 | 1607.8K tok/s | lr 3.86e-03 | VRAM 31448MB (gpu0)\n", " step 6220 | loss 1.9726 | ppl 7.2 | 1608.8K tok/s | lr 3.82e-03 | VRAM 31448MB (gpu0)\n", " step 6240 | loss 1.8892 | ppl 6.6 | 1609.7K tok/s | lr 3.78e-03 | VRAM 31448MB (gpu0)\n", " step 6260 | loss 1.9306 | ppl 6.9 | 1609.7K tok/s | lr 3.73e-03 | VRAM 31448MB (gpu0)\n", " step 6280 | loss 1.9707 | ppl 7.2 | 1609.6K tok/s | lr 3.69e-03 | VRAM 31448MB (gpu0)\n", " step 6300 | loss 2.0127 | ppl 7.5 | 1609.7K tok/s | lr 3.65e-03 | VRAM 31448MB (gpu0)\n", " step 6320 | loss 2.1017 | ppl 8.2 | 1609.7K tok/s | lr 3.61e-03 | VRAM 31448MB (gpu0)\n", " step 6340 | loss 1.9381 | ppl 6.9 | 1609.6K tok/s | lr 3.57e-03 | VRAM 31448MB (gpu0)\n", " step 6360 | loss 1.8412 | ppl 6.3 | 1610.0K tok/s | lr 3.53e-03 | VRAM 31448MB (gpu0)\n", " step 6380 | loss 1.8524 | ppl 6.4 | 1610.2K tok/s | lr 3.49e-03 | VRAM 31448MB (gpu0)\n", " step 6400 | loss 1.9045 | ppl 6.7 | 1610.0K tok/s | lr 3.45e-03 | VRAM 31448MB (gpu0)\n", " step 6420 | loss 2.0578 | ppl 7.8 | 1610.3K tok/s | lr 3.41e-03 | VRAM 31448MB (gpu0)\n", " step 6440 | loss 1.9623 | ppl 7.1 | 1610.5K tok/s | lr 3.37e-03 | VRAM 31448MB (gpu0)\n", " step 6460 | loss 1.9256 | ppl 6.9 | 1610.4K tok/s | lr 3.33e-03 | VRAM 31448MB (gpu0)\n", " step 6480 | loss 1.9779 | ppl 7.2 | 1610.1K tok/s | lr 3.29e-03 | VRAM 31448MB (gpu0)\n", " step 6500 | loss 1.8777 | ppl 6.5 | 1609.9K tok/s | lr 3.25e-03 | VRAM 31448MB (gpu0)\n", " step 6520 | loss 1.9139 | ppl 6.8 | 1610.0K tok/s | lr 3.22e-03 | VRAM 31448MB (gpu0)\n", " step 6540 | loss 1.7972 | ppl 6.0 | 1609.3K tok/s | lr 3.18e-03 | VRAM 31448MB (gpu0)\n", " step 6560 | loss 1.9504 | ppl 7.0 | 1607.7K tok/s | lr 3.15e-03 | VRAM 31448MB (gpu0)\n", " step 6580 | loss 1.9010 | ppl 6.7 | 1607.1K tok/s | lr 3.11e-03 | VRAM 31448MB (gpu0)\n", " step 6600 | loss 1.9274 | ppl 6.9 | 1606.3K tok/s | lr 3.08e-03 | VRAM 31448MB (gpu0)\n", " step 6620 | loss 1.7478 | ppl 5.7 | 1606.3K tok/s | lr 3.04e-03 | VRAM 31448MB (gpu0)\n", " step 6640 | loss 1.9054 | ppl 6.7 | 1606.3K tok/s | lr 3.01e-03 | VRAM 31448MB (gpu0)\n", " step 6660 | loss 1.9885 | ppl 7.3 | 1606.5K tok/s | lr 2.98e-03 | VRAM 31448MB (gpu0)\n", " step 6680 | loss 1.9917 | ppl 7.3 | 1606.5K tok/s | lr 2.94e-03 | VRAM 31448MB (gpu0)\n", " step 6700 | loss 1.8184 | ppl 6.2 | 1607.6K tok/s | lr 2.91e-03 | VRAM 31448MB (gpu0)\n", " step 6720 | loss 1.9370 | ppl 6.9 | 1608.4K tok/s | lr 2.88e-03 | VRAM 31448MB (gpu0)\n", " step 6740 | loss 1.9577 | ppl 7.1 | 1609.4K tok/s | lr 2.85e-03 | VRAM 31448MB (gpu0)\n", " step 6760 | loss 1.9624 | ppl 7.1 | 1610.2K tok/s | lr 2.82e-03 | VRAM 31448MB (gpu0)\n", " step 6780 | loss 1.9268 | ppl 6.9 | 1610.5K tok/s | lr 2.79e-03 | VRAM 31448MB (gpu0)\n", " step 6800 | loss 1.9208 | ppl 6.8 | 1610.2K tok/s | lr 2.76e-03 | VRAM 31448MB (gpu0)\n", " step 6820 | loss 1.8497 | ppl 6.4 | 1610.5K tok/s | lr 2.73e-03 | VRAM 31448MB (gpu0)\n", " step 6840 | loss 1.9760 | ppl 7.2 | 1610.3K tok/s | lr 2.70e-03 | VRAM 31448MB (gpu0)\n", " step 6860 | loss 1.8908 | ppl 6.6 | 1610.4K tok/s | lr 2.67e-03 | VRAM 31448MB (gpu0)\n", " step 6880 | loss 2.0065 | ppl 7.4 | 1610.5K tok/s | lr 2.65e-03 | VRAM 31448MB (gpu0)\n", " step 6900 | loss 2.0023 | ppl 7.4 | 1610.7K tok/s | lr 2.62e-03 | VRAM 31448MB (gpu0)\n", " step 6920 | loss 2.0563 | ppl 7.8 | 1610.5K tok/s | lr 2.59e-03 | VRAM 31448MB (gpu0)\n", " step 6940 | loss 2.0022 | ppl 7.4 | 1610.2K tok/s | lr 2.57e-03 | VRAM 31448MB (gpu0)\n", " step 6960 | loss 1.9169 | ppl 6.8 | 1610.5K tok/s | lr 2.54e-03 | VRAM 31448MB (gpu0)\n", " step 6980 | loss 1.8696 | ppl 6.5 | 1610.3K tok/s | lr 2.52e-03 | VRAM 31448MB (gpu0)\n", " step 7000 | loss 1.9181 | ppl 6.8 | 1610.5K tok/s | lr 2.49e-03 | VRAM 31448MB (gpu0)\n", " >>> eval ppl 27.0 ★ best\n", " step 7020 | loss 1.8082 | ppl 6.1 | 1299.5K tok/s | lr 2.47e-03 | VRAM 31448MB (gpu0)\n", " step 7040 | loss 1.8858 | ppl 6.6 | 1609.6K tok/s | lr 2.45e-03 | VRAM 31448MB (gpu0)\n", " step 7060 | loss 2.0710 | ppl 7.9 | 1608.3K tok/s | lr 2.43e-03 | VRAM 31448MB (gpu0)\n", " step 7080 | loss 1.9107 | ppl 6.8 | 1607.1K tok/s | lr 2.40e-03 | VRAM 31448MB (gpu0)\n", " step 7100 | loss 1.9342 | ppl 6.9 | 1606.4K tok/s | lr 2.38e-03 | VRAM 31448MB (gpu0)\n", " step 7120 | loss 1.9257 | ppl 6.9 | 1605.8K tok/s | lr 2.36e-03 | VRAM 31448MB (gpu0)\n", " step 7140 | loss 1.9028 | ppl 6.7 | 1606.0K tok/s | lr 2.34e-03 | VRAM 31448MB (gpu0)\n", " step 7160 | loss 1.9314 | ppl 6.9 | 1606.3K tok/s | lr 2.32e-03 | VRAM 31448MB (gpu0)\n", " step 7180 | loss 1.9930 | ppl 7.3 | 1606.6K tok/s | lr 2.30e-03 | VRAM 31448MB (gpu0)\n", " step 7200 | loss 2.0299 | ppl 7.6 | 1607.5K tok/s | lr 2.29e-03 | VRAM 31448MB (gpu0)\n", " step 7220 | loss 2.0436 | ppl 7.7 | 1608.4K tok/s | lr 2.27e-03 | VRAM 31448MB (gpu0)\n", " step 7240 | loss 2.0311 | ppl 7.6 | 1608.7K tok/s | lr 2.25e-03 | VRAM 31448MB (gpu0)\n", " step 7260 | loss 1.9633 | ppl 7.1 | 1609.9K tok/s | lr 2.23e-03 | VRAM 31448MB (gpu0)\n", " step 7280 | loss 1.8779 | ppl 6.5 | 1610.2K tok/s | lr 2.22e-03 | VRAM 31448MB (gpu0)\n", " step 7300 | loss 1.9481 | ppl 7.0 | 1609.8K tok/s | lr 2.20e-03 | VRAM 31448MB (gpu0)\n", " step 7320 | loss 1.9439 | ppl 7.0 | 1610.5K tok/s | lr 2.19e-03 | VRAM 31448MB (gpu0)\n", " step 7340 | loss 1.9916 | ppl 7.3 | 1610.6K tok/s | lr 2.17e-03 | VRAM 31448MB (gpu0)\n", " step 7360 | loss 1.7226 | ppl 5.6 | 1610.7K tok/s | lr 2.16e-03 | VRAM 31448MB (gpu0)\n", " step 7380 | loss 2.0380 | ppl 7.7 | 1610.6K tok/s | lr 2.15e-03 | VRAM 31448MB (gpu0)\n", " step 7400 | loss 1.9261 | ppl 6.9 | 1610.7K tok/s | lr 2.13e-03 | VRAM 31448MB (gpu0)\n", " step 7420 | loss 1.9495 | ppl 7.0 | 1610.4K tok/s | lr 2.12e-03 | VRAM 31448MB (gpu0)\n", " step 7440 | loss 1.9745 | ppl 7.2 | 1610.2K tok/s | lr 2.11e-03 | VRAM 31448MB (gpu0)\n", " step 7460 | loss 1.7383 | ppl 5.7 | 1610.1K tok/s | lr 2.10e-03 | VRAM 31448MB (gpu0)\n", " step 7480 | loss 1.9563 | ppl 7.1 | 1609.9K tok/s | lr 2.09e-03 | VRAM 31448MB (gpu0)\n", " step 7500 | loss 1.9294 | ppl 6.9 | 1610.2K tok/s | lr 2.08e-03 | VRAM 31448MB (gpu0)\n", " step 7520 | loss 1.9862 | ppl 7.3 | 1609.8K tok/s | lr 2.07e-03 | VRAM 31448MB (gpu0)\n", " step 7540 | loss 1.8008 | ppl 6.1 | 1610.0K tok/s | lr 2.06e-03 | VRAM 31448MB (gpu0)\n", " step 7560 | loss 1.8759 | ppl 6.5 | 1609.4K tok/s | lr 2.05e-03 | VRAM 31448MB (gpu0)\n", " step 7580 | loss 1.9313 | ppl 6.9 | 1608.2K tok/s | lr 2.04e-03 | VRAM 31448MB (gpu0)\n", " step 7600 | loss 1.8793 | ppl 6.5 | 1607.0K tok/s | lr 2.04e-03 | VRAM 31448MB (gpu0)\n", " step 7620 | loss 1.7750 | ppl 5.9 | 1606.1K tok/s | lr 2.03e-03 | VRAM 31448MB (gpu0)\n", " step 7640 | loss 1.9354 | ppl 6.9 | 1606.3K tok/s | lr 2.03e-03 | VRAM 31448MB (gpu0)\n", " step 7660 | loss 1.9977 | ppl 7.4 | 1606.5K tok/s | lr 2.02e-03 | VRAM 31448MB (gpu0)\n", " step 7680 | loss 2.0852 | ppl 8.0 | 1606.4K tok/s | lr 2.02e-03 | VRAM 31448MB (gpu0)\n", " step 7700 | loss 1.9413 | ppl 7.0 | 1607.3K tok/s | lr 2.01e-03 | VRAM 31448MB (gpu0)\n", " step 7720 | loss 1.9556 | ppl 7.1 | 1608.2K tok/s | lr 2.01e-03 | VRAM 31448MB (gpu0)\n", " step 7740 | loss 1.9129 | ppl 6.8 | 1608.7K tok/s | lr 2.01e-03 | VRAM 31448MB (gpu0)\n", " step 7760 | loss 1.8491 | ppl 6.4 | 1610.0K tok/s | lr 2.00e-03 | VRAM 31448MB (gpu0)\n", " step 7780 | loss 1.8255 | ppl 6.2 | 1610.1K tok/s | lr 2.00e-03 | VRAM 31448MB (gpu0)\n", " step 7800 | loss 1.8858 | ppl 6.6 | 1610.3K tok/s | lr 2.00e-03 | VRAM 31448MB (gpu0)\n", " step 7820 | loss 2.0572 | ppl 7.8 | 1610.4K tok/s | lr 2.00e-03 | VRAM 31448MB (gpu0)\n", "\n", "Final eval PPL: 27.0\n", "Final checkpoint: checkpoints/pretrain/step_0007827.pt\n", "\n", "======================================================================\n", "Training complete. 12,310,806,528 tokens processed.\n", "Best val PPL: 27.0\n", "======================================================================\n" ] } ], "source": [ "!torchrun --nproc_per_node=8 -m freqformer.train \\\n", " --preset small \\\n", " --distributed ddp \\\n", " --data_dir dolma_mix_10b \\\n", " --batch_size 3 \\\n", " --seq_len 16384 \\\n", " --grad_accum_steps 4 \\\n", " --num_epochs 1 \\\n", " --warmup_steps 20 \\\n", " --optimizer muon \\\n", " --lr 0.03 \\\n", " --lr_schedule cosine \\\n", " --log_every 20 \\\n", " --eval_every 1000 \\\n", " --checkpoint_every 4000 \\\n", " --checkpoint_dir checkpoints/pretrain" ] }, { "cell_type": "code", "execution_count": 6, "id": "34fd6667-f574-4734-b3af-93f06aa4fd74", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "W0213 19:52:56.449000 15338 torch/distributed/run.py:774] \n", "W0213 19:52:56.449000 15338 torch/distributed/run.py:774] *****************************************\n", "W0213 19:52:56.449000 15338 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", "W0213 19:52:56.449000 15338 torch/distributed/run.py:774] *****************************************\n", "======================================================================\n", "FreqFormer SFT (8 GPUs, DDP)\n", "======================================================================\n", "SFT data vocab_size=32003, pad_id=32000\n", "Model: 33.2M parameters (vocab=32003)\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", "Loading pretrained checkpoint: checkpoints/pretrain/step_0007827.pt\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", " Loaded pretrained model: step 7827, best PPL 26.975635832171747\n", "Wrapped with DDP (8 GPUs)\n", "SFT Train: 2,208,814 examples, seq_len=16384\n", "SFT Val: 44,870 examples, seq_len=16384\n", "Epoch-based training: 1 epochs x 23008 steps = 23008 total\n", "Optimizer: muon (multi-gpu)\n", " muon: 16.8M\n", " adamw: 16.4M\n", " total: 33.2M\n", "\n", "SFT Training: steps 0→23008, B=3x8gpu, T=16384, GA=4\n", "Effective batch: 1,572,864 tokens/step\n", "Device: NVIDIA GeForce RTX 5090\n", "======================================================================\n", " step 20 | loss 3.4798 | ppl 32.5 | 1367.1K tok/s | lr 1.90e-04 | asst 6% | VRAM 31440MB (gpu0)\n", " step 40 | loss 3.1088 | ppl 22.4 | 1551.0K tok/s | lr 3.90e-04 | asst 6% | VRAM 31440MB (gpu0)\n", " step 60 | loss 3.0359 | ppl 20.8 | 1547.9K tok/s | lr 5.90e-04 | asst 7% | VRAM 31440MB (gpu0)\n", " step 80 | loss 2.9624 | ppl 19.3 | 1548.1K tok/s | lr 7.90e-04 | asst 7% | VRAM 31440MB (gpu0)\n", " step 100 | loss 2.9744 | ppl 19.6 | 1547.3K tok/s | lr 9.90e-04 | asst 7% | VRAM 31440MB (gpu0)\n", " step 120 | loss 2.9470 | ppl 19.0 | 1546.8K tok/s | lr 1.19e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 140 | loss 2.9958 | ppl 20.0 | 1545.9K tok/s | lr 1.39e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 160 | loss 2.9699 | ppl 19.5 | 1546.5K tok/s | lr 1.59e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 180 | loss 2.9994 | ppl 20.1 | 1547.0K tok/s | lr 1.79e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 200 | loss 2.9754 | ppl 19.6 | 1546.4K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 220 | loss 2.9408 | ppl 18.9 | 1546.3K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 240 | loss 2.9525 | ppl 19.2 | 1547.0K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 260 | loss 2.9009 | ppl 18.2 | 1546.7K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 280 | loss 2.9861 | ppl 19.8 | 1547.4K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 300 | loss 2.9082 | ppl 18.3 | 1547.4K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 320 | loss 2.9524 | ppl 19.2 | 1547.0K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 340 | loss 2.9651 | ppl 19.4 | 1547.4K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 360 | loss 2.9345 | ppl 18.8 | 1546.7K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 380 | loss 2.9555 | ppl 19.2 | 1548.2K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 400 | loss 2.9559 | ppl 19.2 | 1547.4K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 420 | loss 2.9192 | ppl 18.5 | 1548.0K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 440 | loss 2.9178 | ppl 18.5 | 1548.7K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 460 | loss 2.9166 | ppl 18.5 | 1548.4K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 480 | loss 2.9356 | ppl 18.8 | 1548.1K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 500 | loss 2.9703 | ppl 19.5 | 1548.1K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 520 | loss 2.9824 | ppl 19.7 | 1521.2K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 540 | loss 2.9109 | ppl 18.4 | 1546.8K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 560 | loss 2.9292 | ppl 18.7 | 1546.4K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 580 | loss 2.9344 | ppl 18.8 | 1546.3K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 600 | loss 2.9195 | ppl 18.5 | 1546.5K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 620 | loss 2.9030 | ppl 18.2 | 1546.5K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 640 | loss 2.9325 | ppl 18.8 | 1528.4K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 660 | loss 2.9216 | ppl 18.6 | 1546.2K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 680 | loss 2.9043 | ppl 18.3 | 1546.4K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 700 | loss 2.9601 | ppl 19.3 | 1545.9K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 720 | loss 2.8520 | ppl 17.3 | 1546.9K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 740 | loss 2.9080 | ppl 18.3 | 1546.8K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 760 | loss 2.8629 | ppl 17.5 | 1547.2K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 780 | loss 2.8896 | ppl 18.0 | 1546.8K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 800 | loss 2.8918 | ppl 18.0 | 1547.3K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 820 | loss 2.9076 | ppl 18.3 | 1547.3K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 840 | loss 2.8690 | ppl 17.6 | 1546.8K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 860 | loss 2.9262 | ppl 18.7 | 1547.7K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 880 | loss 2.9442 | ppl 19.0 | 1548.3K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 900 | loss 2.9503 | ppl 19.1 | 1548.2K tok/s | lr 2.00e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 920 | loss 2.9170 | ppl 18.5 | 1547.5K tok/s | lr 2.00e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 940 | loss 2.9373 | ppl 18.9 | 1547.5K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 960 | loss 2.8740 | ppl 17.7 | 1548.4K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 980 | loss 2.9121 | ppl 18.4 | 1547.0K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1000 | loss 2.9158 | ppl 18.5 | 1547.4K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " >>> eval loss 2.8509 | ppl 17.3 | assistant_tokens 10,736,711 ★ best\n", " step 1020 | loss 2.8848 | ppl 17.9 | 1245.2K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1040 | loss 2.9075 | ppl 18.3 | 1546.7K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1060 | loss 2.9034 | ppl 18.2 | 1546.1K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1080 | loss 2.8749 | ppl 17.7 | 1546.0K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1100 | loss 2.9499 | ppl 19.1 | 1545.7K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1120 | loss 2.9192 | ppl 18.5 | 1545.9K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1140 | loss 2.9143 | ppl 18.4 | 1545.9K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1160 | loss 2.9702 | ppl 19.5 | 1545.9K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1180 | loss 2.9661 | ppl 19.4 | 1546.1K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1200 | loss 2.9133 | ppl 18.4 | 1544.7K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1220 | loss 2.8544 | ppl 17.4 | 1545.6K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1240 | loss 2.9389 | ppl 18.9 | 1547.0K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1260 | loss 2.9260 | ppl 18.7 | 1546.6K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1280 | loss 2.9159 | ppl 18.5 | 1546.5K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1300 | loss 2.9341 | ppl 18.8 | 1546.7K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1320 | loss 2.9302 | ppl 18.7 | 1546.9K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1340 | loss 2.9476 | ppl 19.1 | 1546.5K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1360 | loss 2.9001 | ppl 18.2 | 1546.2K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1380 | loss 2.8847 | ppl 17.9 | 1546.9K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1400 | loss 2.8922 | ppl 18.0 | 1547.4K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1420 | loss 2.9165 | ppl 18.5 | 1548.4K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1440 | loss 2.8685 | ppl 17.6 | 1548.0K tok/s | lr 1.99e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1460 | loss 2.9269 | ppl 18.7 | 1546.4K tok/s | lr 1.99e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1480 | loss 2.9099 | ppl 18.4 | 1546.0K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1500 | loss 2.8924 | ppl 18.0 | 1546.9K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1520 | loss 2.9010 | ppl 18.2 | 1545.7K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1540 | loss 2.8618 | ppl 17.5 | 1546.2K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1560 | loss 2.9098 | ppl 18.4 | 1545.5K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1580 | loss 2.9025 | ppl 18.2 | 1545.4K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1600 | loss 2.8874 | ppl 17.9 | 1545.3K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1620 | loss 2.8620 | ppl 17.5 | 1546.0K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1640 | loss 2.8806 | ppl 17.8 | 1545.4K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1660 | loss 2.8978 | ppl 18.1 | 1545.7K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1680 | loss 2.9052 | ppl 18.3 | 1546.7K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1700 | loss 2.9594 | ppl 19.3 | 1546.8K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1720 | loss 2.8845 | ppl 17.9 | 1546.5K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1740 | loss 2.8456 | ppl 17.2 | 1546.8K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1760 | loss 2.8790 | ppl 17.8 | 1546.9K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1780 | loss 2.9150 | ppl 18.4 | 1547.6K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1800 | loss 2.8532 | ppl 17.3 | 1547.3K tok/s | lr 1.98e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1820 | loss 2.8900 | ppl 18.0 | 1547.3K tok/s | lr 1.98e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1840 | loss 2.8487 | ppl 17.3 | 1547.6K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1860 | loss 2.8420 | ppl 17.2 | 1548.4K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1880 | loss 2.8916 | ppl 18.0 | 1549.3K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1900 | loss 2.7920 | ppl 16.3 | 1549.2K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1920 | loss 2.9035 | ppl 18.2 | 1548.4K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1940 | loss 2.8379 | ppl 17.1 | 1547.9K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 1960 | loss 2.9005 | ppl 18.2 | 1547.4K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 1980 | loss 2.9205 | ppl 18.5 | 1547.4K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2000 | loss 2.9123 | ppl 18.4 | 1546.7K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " >>> eval loss 2.8143 | ppl 16.7 | assistant_tokens 10,736,711 ★ best\n", " step 2020 | loss 2.8443 | ppl 17.2 | 1252.9K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2040 | loss 2.8846 | ppl 17.9 | 1546.2K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2060 | loss 2.8232 | ppl 16.8 | 1546.4K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2080 | loss 2.8884 | ppl 18.0 | 1546.3K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2100 | loss 2.8865 | ppl 17.9 | 1546.0K tok/s | lr 1.97e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2120 | loss 2.8742 | ppl 17.7 | 1546.0K tok/s | lr 1.97e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2140 | loss 2.8662 | ppl 17.6 | 1546.3K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2160 | loss 2.9007 | ppl 18.2 | 1546.3K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2180 | loss 2.8418 | ppl 17.1 | 1547.0K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2200 | loss 2.8228 | ppl 16.8 | 1547.2K tok/s | lr 1.96e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2220 | loss 2.8732 | ppl 17.7 | 1546.4K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2240 | loss 2.8864 | ppl 17.9 | 1547.0K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2260 | loss 2.8804 | ppl 17.8 | 1546.8K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2280 | loss 2.9062 | ppl 18.3 | 1547.2K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2300 | loss 2.9346 | ppl 18.8 | 1547.5K tok/s | lr 1.96e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2320 | loss 2.8534 | ppl 17.3 | 1547.5K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2340 | loss 2.9032 | ppl 18.2 | 1548.1K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2360 | loss 2.8983 | ppl 18.1 | 1549.6K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2380 | loss 2.8739 | ppl 17.7 | 1548.4K tok/s | lr 1.96e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2400 | loss 2.9194 | ppl 18.5 | 1547.2K tok/s | lr 1.95e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2420 | loss 2.8565 | ppl 17.4 | 1547.4K tok/s | lr 1.95e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2440 | loss 2.8874 | ppl 17.9 | 1546.3K tok/s | lr 1.95e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2460 | loss 2.8475 | ppl 17.2 | 1547.1K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2480 | loss 2.8326 | ppl 17.0 | 1546.4K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2500 | loss 2.8397 | ppl 17.1 | 1546.5K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2520 | loss 2.8262 | ppl 16.9 | 1546.1K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2540 | loss 2.8571 | ppl 17.4 | 1545.9K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2560 | loss 2.8354 | ppl 17.0 | 1546.0K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2580 | loss 2.8364 | ppl 17.1 | 1545.6K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2600 | loss 2.8896 | ppl 18.0 | 1545.7K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2620 | loss 2.8159 | ppl 16.7 | 1545.9K tok/s | lr 1.95e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2640 | loss 2.8804 | ppl 17.8 | 1546.2K tok/s | lr 1.94e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2660 | loss 2.9148 | ppl 18.4 | 1546.0K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2680 | loss 2.8645 | ppl 17.5 | 1547.0K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2700 | loss 2.9188 | ppl 18.5 | 1546.3K tok/s | lr 1.94e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2720 | loss 2.8885 | ppl 18.0 | 1546.9K tok/s | lr 1.94e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2740 | loss 2.8529 | ppl 17.3 | 1546.3K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2760 | loss 2.9155 | ppl 18.5 | 1547.2K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2780 | loss 2.8614 | ppl 17.5 | 1546.9K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2800 | loss 2.8911 | ppl 18.0 | 1547.0K tok/s | lr 1.94e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2820 | loss 2.8436 | ppl 17.2 | 1547.3K tok/s | lr 1.94e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2840 | loss 2.8534 | ppl 17.3 | 1547.1K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2860 | loss 2.8581 | ppl 17.4 | 1547.7K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2880 | loss 2.8799 | ppl 17.8 | 1546.9K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2900 | loss 2.8475 | ppl 17.2 | 1546.2K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2920 | loss 2.8422 | ppl 17.2 | 1546.6K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2940 | loss 2.9126 | ppl 18.4 | 1545.9K tok/s | lr 1.93e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 2960 | loss 2.8188 | ppl 16.8 | 1545.6K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 2980 | loss 2.8696 | ppl 17.6 | 1545.7K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3000 | loss 2.8411 | ppl 17.1 | 1545.4K tok/s | lr 1.93e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " >>> eval loss 2.8034 | ppl 16.5 | assistant_tokens 10,736,711 ★ best\n", " step 3020 | loss 2.8770 | ppl 17.8 | 1251.2K tok/s | lr 1.93e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3040 | loss 2.8291 | ppl 16.9 | 1545.3K tok/s | lr 1.92e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3060 | loss 2.8642 | ppl 17.5 | 1545.7K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3080 | loss 2.8777 | ppl 17.8 | 1545.6K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3100 | loss 2.8494 | ppl 17.3 | 1545.4K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3120 | loss 2.8277 | ppl 16.9 | 1545.5K tok/s | lr 1.92e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3140 | loss 2.8786 | ppl 17.8 | 1546.2K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3160 | loss 2.8978 | ppl 18.1 | 1545.6K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3180 | loss 2.8375 | ppl 17.1 | 1546.1K tok/s | lr 1.92e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3200 | loss 2.9170 | ppl 18.5 | 1546.3K tok/s | lr 1.92e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3220 | loss 2.8874 | ppl 17.9 | 1546.4K tok/s | lr 1.91e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3240 | loss 2.8454 | ppl 17.2 | 1545.9K tok/s | lr 1.91e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3260 | loss 2.8984 | ppl 18.1 | 1546.4K tok/s | lr 1.91e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3280 | loss 2.8640 | ppl 17.5 | 1547.2K tok/s | lr 1.91e-03 | asst 8% | VRAM 31440MB (gpu0)\n", " step 3300 | loss 2.8980 | ppl 18.1 | 1547.0K tok/s | lr 1.91e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3320 | loss 2.8727 | ppl 17.7 | 1547.1K tok/s | lr 1.91e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3340 | loss 2.8780 | ppl 17.8 | 1547.0K tok/s | lr 1.91e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3360 | loss 2.8505 | ppl 17.3 | 1546.8K tok/s | lr 1.91e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3380 | loss 2.8785 | ppl 17.8 | 1546.2K tok/s | lr 1.91e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3400 | loss 2.8691 | ppl 17.6 | 1546.0K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3420 | loss 2.8330 | ppl 17.0 | 1545.8K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3440 | loss 2.8918 | ppl 18.0 | 1547.6K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3460 | loss 2.9128 | ppl 18.4 | 1546.9K tok/s | lr 1.90e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3480 | loss 2.8178 | ppl 16.7 | 1546.3K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3500 | loss 2.8150 | ppl 16.7 | 1546.8K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3520 | loss 2.8665 | ppl 17.6 | 1545.9K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3540 | loss 2.8524 | ppl 17.3 | 1546.0K tok/s | lr 1.90e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3560 | loss 2.8497 | ppl 17.3 | 1546.4K tok/s | lr 1.89e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3580 | loss 2.8622 | ppl 17.5 | 1546.9K tok/s | lr 1.89e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3600 | loss 2.9112 | ppl 18.4 | 1547.1K tok/s | lr 1.89e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3620 | loss 2.8308 | ppl 17.0 | 1547.2K tok/s | lr 1.89e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3640 | loss 2.8344 | ppl 17.0 | 1546.9K tok/s | lr 1.89e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3660 | loss 2.8512 | ppl 17.3 | 1546.8K tok/s | lr 1.89e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3680 | loss 2.9137 | ppl 18.4 | 1545.6K tok/s | lr 1.89e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3700 | loss 2.8492 | ppl 17.3 | 1547.0K tok/s | lr 1.89e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3720 | loss 2.8341 | ppl 17.0 | 1547.3K tok/s | lr 1.88e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3740 | loss 2.8315 | ppl 17.0 | 1547.1K tok/s | lr 1.88e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3760 | loss 2.8341 | ppl 17.0 | 1547.2K tok/s | lr 1.88e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3780 | loss 2.8993 | ppl 18.2 | 1548.1K tok/s | lr 1.88e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3800 | loss 2.8325 | ppl 17.0 | 1547.8K tok/s | lr 1.88e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3820 | loss 2.8720 | ppl 17.7 | 1549.3K tok/s | lr 1.88e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3840 | loss 2.8799 | ppl 17.8 | 1547.3K tok/s | lr 1.88e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3860 | loss 2.9128 | ppl 18.4 | 1547.1K tok/s | lr 1.88e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3880 | loss 2.8882 | ppl 18.0 | 1546.7K tok/s | lr 1.87e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3900 | loss 2.8132 | ppl 16.7 | 1546.5K tok/s | lr 1.87e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3920 | loss 2.8073 | ppl 16.6 | 1546.5K tok/s | lr 1.87e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3940 | loss 2.8805 | ppl 17.8 | 1546.5K tok/s | lr 1.87e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " step 3960 | loss 2.9004 | ppl 18.2 | 1546.6K tok/s | lr 1.87e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 3980 | loss 2.8451 | ppl 17.2 | 1546.3K tok/s | lr 1.87e-03 | asst 6% | VRAM 31440MB (gpu0)\n", " step 4000 | loss 2.8237 | ppl 16.8 | 1545.7K tok/s | lr 1.87e-03 | asst 7% | VRAM 31440MB (gpu0)\n", " >>> eval loss 2.7951 | ppl 16.4 | assistant_tokens 10,736,711 ★ best\n", " >>> saved checkpoints/sft/sft_step_0004000.pt\n", " step 4020 | loss 2.9051 | ppl 18.3 | 1248.2K tok/s | lr 1.86e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4040 | loss 2.8224 | ppl 16.8 | 1546.3K tok/s | lr 1.86e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4060 | loss 2.8434 | ppl 17.2 | 1547.3K tok/s | lr 1.86e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4080 | loss 2.8991 | ppl 18.2 | 1546.4K tok/s | lr 1.86e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4100 | loss 2.8207 | ppl 16.8 | 1546.3K tok/s | lr 1.86e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4120 | loss 2.8726 | ppl 17.7 | 1547.5K tok/s | lr 1.86e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4140 | loss 2.8640 | ppl 17.5 | 1547.0K tok/s | lr 1.86e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4160 | loss 2.8609 | ppl 17.5 | 1547.2K tok/s | lr 1.85e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4180 | loss 2.8585 | ppl 17.4 | 1547.8K tok/s | lr 1.85e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4200 | loss 2.8427 | ppl 17.2 | 1547.3K tok/s | lr 1.85e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4220 | loss 2.8616 | ppl 17.5 | 1547.7K tok/s | lr 1.85e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4240 | loss 2.8293 | ppl 16.9 | 1547.5K tok/s | lr 1.85e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4260 | loss 2.8290 | ppl 16.9 | 1547.9K tok/s | lr 1.85e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4280 | loss 2.8902 | ppl 18.0 | 1547.6K tok/s | lr 1.85e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4300 | loss 2.8813 | ppl 17.8 | 1547.8K tok/s | lr 1.84e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4320 | loss 2.9201 | ppl 18.5 | 1547.6K tok/s | lr 1.84e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4340 | loss 2.8220 | ppl 16.8 | 1546.6K tok/s | lr 1.84e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4360 | loss 2.8176 | ppl 16.7 | 1546.7K tok/s | lr 1.84e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4380 | loss 2.8794 | ppl 17.8 | 1545.6K tok/s | lr 1.84e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4400 | loss 2.8733 | ppl 17.7 | 1546.0K tok/s | lr 1.84e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4420 | loss 2.8864 | ppl 17.9 | 1546.1K tok/s | lr 1.84e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4440 | loss 2.8323 | ppl 17.0 | 1545.9K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4460 | loss 2.8482 | ppl 17.3 | 1545.1K tok/s | lr 1.83e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4480 | loss 2.8754 | ppl 17.7 | 1546.0K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4500 | loss 2.7986 | ppl 16.4 | 1545.8K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4520 | loss 2.8518 | ppl 17.3 | 1545.5K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4540 | loss 2.8297 | ppl 16.9 | 1546.0K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4560 | loss 2.8557 | ppl 17.4 | 1545.3K tok/s | lr 1.83e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4580 | loss 2.8715 | ppl 17.7 | 1546.0K tok/s | lr 1.82e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4600 | loss 2.7968 | ppl 16.4 | 1546.7K tok/s | lr 1.82e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4620 | loss 2.8602 | ppl 17.5 | 1546.7K tok/s | lr 1.82e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4640 | loss 2.8855 | ppl 17.9 | 1546.7K tok/s | lr 1.82e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4660 | loss 2.9117 | ppl 18.4 | 1547.0K tok/s | lr 1.82e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4680 | loss 2.8493 | ppl 17.3 | 1546.5K tok/s | lr 1.82e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4700 | loss 2.8904 | ppl 18.0 | 1547.3K tok/s | lr 1.81e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4720 | loss 2.8884 | ppl 18.0 | 1548.0K tok/s | lr 1.81e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4740 | loss 2.9132 | ppl 18.4 | 1548.5K tok/s | lr 1.81e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4760 | loss 2.8925 | ppl 18.0 | 1547.3K tok/s | lr 1.81e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4780 | loss 2.8386 | ppl 17.1 | 1547.2K tok/s | lr 1.81e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4800 | loss 2.8645 | ppl 17.5 | 1546.2K tok/s | lr 1.81e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4820 | loss 2.8464 | ppl 17.2 | 1546.1K tok/s | lr 1.80e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4840 | loss 2.8801 | ppl 17.8 | 1546.2K tok/s | lr 1.80e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4860 | loss 2.9163 | ppl 18.5 | 1546.7K tok/s | lr 1.80e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4880 | loss 2.8829 | ppl 17.9 | 1545.7K tok/s | lr 1.80e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4900 | loss 2.8789 | ppl 17.8 | 1545.6K tok/s | lr 1.80e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4920 | loss 2.8659 | ppl 17.6 | 1545.5K tok/s | lr 1.80e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4940 | loss 2.8247 | ppl 16.9 | 1545.0K tok/s | lr 1.79e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 4960 | loss 2.8680 | ppl 17.6 | 1545.6K tok/s | lr 1.79e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 4980 | loss 2.8815 | ppl 17.8 | 1545.5K tok/s | lr 1.79e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5000 | loss 2.8750 | ppl 17.7 | 1546.2K tok/s | lr 1.79e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7849 | ppl 16.2 | assistant_tokens 10,736,711 ★ best\n", " step 5020 | loss 2.8912 | ppl 18.0 | 1252.1K tok/s | lr 1.79e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5040 | loss 2.8634 | ppl 17.5 | 1545.6K tok/s | lr 1.79e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5060 | loss 2.8574 | ppl 17.4 | 1546.1K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5080 | loss 2.8707 | ppl 17.7 | 1546.4K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5100 | loss 2.8170 | ppl 16.7 | 1547.0K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5120 | loss 2.7931 | ppl 16.3 | 1546.0K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5140 | loss 2.8156 | ppl 16.7 | 1546.3K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5160 | loss 2.8773 | ppl 17.8 | 1547.0K tok/s | lr 1.78e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5180 | loss 2.8417 | ppl 17.1 | 1547.5K tok/s | lr 1.77e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5200 | loss 2.8157 | ppl 16.7 | 1547.5K tok/s | lr 1.77e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5220 | loss 2.8322 | ppl 17.0 | 1547.9K tok/s | lr 1.77e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5240 | loss 2.8499 | ppl 17.3 | 1548.1K tok/s | lr 1.77e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5260 | loss 2.8545 | ppl 17.4 | 1547.6K tok/s | lr 1.77e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5280 | loss 2.8375 | ppl 17.1 | 1546.7K tok/s | lr 1.77e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5300 | loss 2.8612 | ppl 17.5 | 1546.4K tok/s | lr 1.76e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5320 | loss 2.8739 | ppl 17.7 | 1546.8K tok/s | lr 1.76e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5340 | loss 2.8647 | ppl 17.5 | 1546.4K tok/s | lr 1.76e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5360 | loss 2.8205 | ppl 16.8 | 1546.2K tok/s | lr 1.76e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5380 | loss 2.8362 | ppl 17.1 | 1546.3K tok/s | lr 1.76e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5400 | loss 2.8811 | ppl 17.8 | 1546.8K tok/s | lr 1.75e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5420 | loss 2.8129 | ppl 16.7 | 1546.4K tok/s | lr 1.75e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5440 | loss 2.8324 | ppl 17.0 | 1546.8K tok/s | lr 1.75e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5460 | loss 2.8248 | ppl 16.9 | 1546.9K tok/s | lr 1.75e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5480 | loss 2.8733 | ppl 17.7 | 1546.1K tok/s | lr 1.75e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5500 | loss 2.8315 | ppl 17.0 | 1546.2K tok/s | lr 1.75e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5520 | loss 2.8452 | ppl 17.2 | 1547.3K tok/s | lr 1.74e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5540 | loss 2.8547 | ppl 17.4 | 1546.7K tok/s | lr 1.74e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5560 | loss 2.8606 | ppl 17.5 | 1547.2K tok/s | lr 1.74e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5580 | loss 2.8563 | ppl 17.4 | 1547.4K tok/s | lr 1.74e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5600 | loss 2.8666 | ppl 17.6 | 1547.6K tok/s | lr 1.74e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5620 | loss 2.8917 | ppl 18.0 | 1548.7K tok/s | lr 1.73e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5640 | loss 2.8762 | ppl 17.7 | 1548.5K tok/s | lr 1.73e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5660 | loss 2.8387 | ppl 17.1 | 1548.6K tok/s | lr 1.73e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5680 | loss 2.8195 | ppl 16.8 | 1548.5K tok/s | lr 1.73e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5700 | loss 2.8067 | ppl 16.6 | 1548.6K tok/s | lr 1.73e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5720 | loss 2.8553 | ppl 17.4 | 1547.6K tok/s | lr 1.72e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5740 | loss 2.8561 | ppl 17.4 | 1547.0K tok/s | lr 1.72e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5760 | loss 2.8210 | ppl 16.8 | 1547.2K tok/s | lr 1.72e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5780 | loss 2.8295 | ppl 16.9 | 1546.5K tok/s | lr 1.72e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5800 | loss 2.8763 | ppl 17.7 | 1546.1K tok/s | lr 1.72e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 5820 | loss 2.8376 | ppl 17.1 | 1546.5K tok/s | lr 1.72e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5840 | loss 2.8530 | ppl 17.3 | 1546.6K tok/s | lr 1.71e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5860 | loss 2.8694 | ppl 17.6 | 1546.4K tok/s | lr 1.71e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5880 | loss 2.8781 | ppl 17.8 | 1545.9K tok/s | lr 1.71e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5900 | loss 2.8042 | ppl 16.5 | 1546.7K tok/s | lr 1.71e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5920 | loss 2.8263 | ppl 16.9 | 1545.6K tok/s | lr 1.71e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5940 | loss 2.8825 | ppl 17.9 | 1546.3K tok/s | lr 1.70e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5960 | loss 2.9041 | ppl 18.2 | 1546.8K tok/s | lr 1.70e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 5980 | loss 2.8784 | ppl 17.8 | 1546.0K tok/s | lr 1.70e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6000 | loss 2.8464 | ppl 17.2 | 1546.4K tok/s | lr 1.70e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7869 | ppl 16.2 | assistant_tokens 10,736,711\n", " step 6020 | loss 2.8358 | ppl 17.0 | 1252.9K tok/s | lr 1.70e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6040 | loss 2.8410 | ppl 17.1 | 1546.8K tok/s | lr 1.69e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6060 | loss 2.8729 | ppl 17.7 | 1546.8K tok/s | lr 1.69e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6080 | loss 2.8319 | ppl 17.0 | 1547.3K tok/s | lr 1.69e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6100 | loss 2.8760 | ppl 17.7 | 1547.1K tok/s | lr 1.69e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6120 | loss 2.8387 | ppl 17.1 | 1547.1K tok/s | lr 1.69e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6140 | loss 2.8777 | ppl 17.8 | 1547.8K tok/s | lr 1.68e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6160 | loss 2.8344 | ppl 17.0 | 1547.0K tok/s | lr 1.68e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6180 | loss 2.8772 | ppl 17.8 | 1546.8K tok/s | lr 1.68e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6200 | loss 2.7935 | ppl 16.3 | 1547.0K tok/s | lr 1.68e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6220 | loss 2.8360 | ppl 17.0 | 1546.4K tok/s | lr 1.68e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6240 | loss 2.8458 | ppl 17.2 | 1547.4K tok/s | lr 1.67e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6260 | loss 2.8621 | ppl 17.5 | 1546.0K tok/s | lr 1.67e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6280 | loss 2.8711 | ppl 17.7 | 1546.3K tok/s | lr 1.67e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6300 | loss 2.8632 | ppl 17.5 | 1545.8K tok/s | lr 1.67e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6320 | loss 2.8741 | ppl 17.7 | 1545.9K tok/s | lr 1.67e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6340 | loss 2.8878 | ppl 18.0 | 1546.1K tok/s | lr 1.66e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6360 | loss 2.8115 | ppl 16.6 | 1545.0K tok/s | lr 1.66e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6380 | loss 2.8643 | ppl 17.5 | 1546.1K tok/s | lr 1.66e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6400 | loss 2.8926 | ppl 18.0 | 1545.9K tok/s | lr 1.66e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6420 | loss 2.9168 | ppl 18.5 | 1546.1K tok/s | lr 1.66e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6440 | loss 2.8418 | ppl 17.1 | 1545.9K tok/s | lr 1.65e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6460 | loss 2.8710 | ppl 17.7 | 1545.7K tok/s | lr 1.65e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6480 | loss 2.8305 | ppl 17.0 | 1546.8K tok/s | lr 1.65e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6500 | loss 2.7961 | ppl 16.4 | 1546.7K tok/s | lr 1.65e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6520 | loss 2.8747 | ppl 17.7 | 1546.4K tok/s | lr 1.64e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6540 | loss 2.8124 | ppl 16.6 | 1546.5K tok/s | lr 1.64e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6560 | loss 2.8867 | ppl 17.9 | 1547.5K tok/s | lr 1.64e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6580 | loss 2.8251 | ppl 16.9 | 1547.2K tok/s | lr 1.64e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6600 | loss 2.7988 | ppl 16.4 | 1547.6K tok/s | lr 1.64e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6620 | loss 2.8538 | ppl 17.4 | 1548.1K tok/s | lr 1.63e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6640 | loss 2.8043 | ppl 16.5 | 1546.7K tok/s | lr 1.63e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6660 | loss 2.8437 | ppl 17.2 | 1546.9K tok/s | lr 1.63e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6680 | loss 2.9008 | ppl 18.2 | 1546.5K tok/s | lr 1.63e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6700 | loss 2.7926 | ppl 16.3 | 1546.2K tok/s | lr 1.63e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6720 | loss 2.8189 | ppl 16.8 | 1546.2K tok/s | lr 1.62e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6740 | loss 2.8773 | ppl 17.8 | 1546.4K tok/s | lr 1.62e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6760 | loss 2.8363 | ppl 17.1 | 1546.3K tok/s | lr 1.62e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6780 | loss 2.8250 | ppl 16.9 | 1545.6K tok/s | lr 1.62e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6800 | loss 2.8791 | ppl 17.8 | 1546.0K tok/s | lr 1.61e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6820 | loss 2.8563 | ppl 17.4 | 1545.1K tok/s | lr 1.61e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6840 | loss 2.7979 | ppl 16.4 | 1545.6K tok/s | lr 1.61e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6860 | loss 2.8536 | ppl 17.4 | 1546.0K tok/s | lr 1.61e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6880 | loss 2.7974 | ppl 16.4 | 1545.5K tok/s | lr 1.61e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6900 | loss 2.8971 | ppl 18.1 | 1545.6K tok/s | lr 1.60e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6920 | loss 2.8436 | ppl 17.2 | 1545.7K tok/s | lr 1.60e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6940 | loss 2.7803 | ppl 16.1 | 1545.9K tok/s | lr 1.60e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 6960 | loss 2.8105 | ppl 16.6 | 1546.5K tok/s | lr 1.60e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 6980 | loss 2.8107 | ppl 16.6 | 1547.1K tok/s | lr 1.59e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7000 | loss 2.8352 | ppl 17.0 | 1548.2K tok/s | lr 1.59e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7871 | ppl 16.2 | assistant_tokens 10,736,711\n", " step 7020 | loss 2.8590 | ppl 17.4 | 1254.1K tok/s | lr 1.59e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7040 | loss 2.8557 | ppl 17.4 | 1547.5K tok/s | lr 1.59e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7060 | loss 2.8318 | ppl 17.0 | 1548.7K tok/s | lr 1.59e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7080 | loss 2.8111 | ppl 16.6 | 1548.9K tok/s | lr 1.58e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7100 | loss 2.8078 | ppl 16.6 | 1548.2K tok/s | lr 1.58e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7120 | loss 2.8566 | ppl 17.4 | 1547.5K tok/s | lr 1.58e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7140 | loss 2.8779 | ppl 17.8 | 1547.1K tok/s | lr 1.58e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7160 | loss 2.8396 | ppl 17.1 | 1547.1K tok/s | lr 1.57e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7180 | loss 2.8517 | ppl 17.3 | 1546.7K tok/s | lr 1.57e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7200 | loss 2.8085 | ppl 16.6 | 1547.2K tok/s | lr 1.57e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7220 | loss 2.8405 | ppl 17.1 | 1545.7K tok/s | lr 1.57e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7240 | loss 2.8090 | ppl 16.6 | 1546.1K tok/s | lr 1.57e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7260 | loss 2.8431 | ppl 17.2 | 1546.3K tok/s | lr 1.56e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7280 | loss 2.8397 | ppl 17.1 | 1546.7K tok/s | lr 1.56e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7300 | loss 2.8443 | ppl 17.2 | 1546.4K tok/s | lr 1.56e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7320 | loss 2.8404 | ppl 17.1 | 1546.3K tok/s | lr 1.56e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7340 | loss 2.8522 | ppl 17.3 | 1546.9K tok/s | lr 1.55e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7360 | loss 2.7887 | ppl 16.3 | 1546.8K tok/s | lr 1.55e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7380 | loss 2.8550 | ppl 17.4 | 1546.3K tok/s | lr 1.55e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7400 | loss 2.7841 | ppl 16.2 | 1546.6K tok/s | lr 1.55e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7420 | loss 2.8165 | ppl 16.7 | 1546.9K tok/s | lr 1.55e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7440 | loss 2.8729 | ppl 17.7 | 1546.7K tok/s | lr 1.54e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7460 | loss 2.8275 | ppl 16.9 | 1547.3K tok/s | lr 1.54e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7480 | loss 2.8213 | ppl 16.8 | 1547.8K tok/s | lr 1.54e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7500 | loss 2.8922 | ppl 18.0 | 1547.4K tok/s | lr 1.54e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7520 | loss 2.8585 | ppl 17.4 | 1547.7K tok/s | lr 1.53e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7540 | loss 2.9241 | ppl 18.6 | 1548.2K tok/s | lr 1.53e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7560 | loss 2.8592 | ppl 17.4 | 1548.6K tok/s | lr 1.53e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7580 | loss 2.8541 | ppl 17.4 | 1548.3K tok/s | lr 1.53e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7600 | loss 2.8137 | ppl 16.7 | 1547.4K tok/s | lr 1.52e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7620 | loss 2.8036 | ppl 16.5 | 1546.6K tok/s | lr 1.52e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7640 | loss 2.8602 | ppl 17.5 | 1546.9K tok/s | lr 1.52e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7660 | loss 2.8122 | ppl 16.6 | 1546.2K tok/s | lr 1.52e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7680 | loss 2.8851 | ppl 17.9 | 1546.5K tok/s | lr 1.51e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7700 | loss 2.8453 | ppl 17.2 | 1546.0K tok/s | lr 1.51e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7720 | loss 2.8660 | ppl 17.6 | 1545.5K tok/s | lr 1.51e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7740 | loss 2.8917 | ppl 18.0 | 1546.2K tok/s | lr 1.51e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7760 | loss 2.8709 | ppl 17.7 | 1545.5K tok/s | lr 1.51e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7780 | loss 2.8434 | ppl 17.2 | 1546.2K tok/s | lr 1.50e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7800 | loss 2.8892 | ppl 18.0 | 1546.0K tok/s | lr 1.50e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7820 | loss 2.8069 | ppl 16.6 | 1546.0K tok/s | lr 1.50e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7840 | loss 2.8216 | ppl 16.8 | 1545.7K tok/s | lr 1.50e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7860 | loss 2.8219 | ppl 16.8 | 1546.2K tok/s | lr 1.49e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7880 | loss 2.8575 | ppl 17.4 | 1545.9K tok/s | lr 1.49e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7900 | loss 2.8961 | ppl 18.1 | 1545.9K tok/s | lr 1.49e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7920 | loss 2.8021 | ppl 16.5 | 1546.7K tok/s | lr 1.49e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7940 | loss 2.8414 | ppl 17.1 | 1547.6K tok/s | lr 1.48e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 7960 | loss 2.8140 | ppl 16.7 | 1547.3K tok/s | lr 1.48e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 7980 | loss 2.8809 | ppl 17.8 | 1547.2K tok/s | lr 1.48e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8000 | loss 2.8091 | ppl 16.6 | 1547.9K tok/s | lr 1.48e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7768 | ppl 16.1 | assistant_tokens 10,736,711 ★ best\n", " >>> saved checkpoints/sft/sft_step_0008000.pt\n", " step 8020 | loss 2.8270 | ppl 16.9 | 1250.6K tok/s | lr 1.47e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8040 | loss 2.8468 | ppl 17.2 | 1547.9K tok/s | lr 1.47e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8060 | loss 2.8194 | ppl 16.8 | 1546.8K tok/s | lr 1.47e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8080 | loss 2.7654 | ppl 15.9 | 1547.2K tok/s | lr 1.47e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8100 | loss 2.8049 | ppl 16.5 | 1547.9K tok/s | lr 1.46e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8120 | loss 2.8663 | ppl 17.6 | 1546.8K tok/s | lr 1.46e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8140 | loss 2.8472 | ppl 17.2 | 1546.3K tok/s | lr 1.46e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8160 | loss 2.8735 | ppl 17.7 | 1546.1K tok/s | lr 1.46e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8180 | loss 2.8392 | ppl 17.1 | 1545.6K tok/s | lr 1.45e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8200 | loss 2.8329 | ppl 17.0 | 1546.1K tok/s | lr 1.45e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8220 | loss 2.8536 | ppl 17.4 | 1546.3K tok/s | lr 1.45e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8240 | loss 2.8412 | ppl 17.1 | 1545.9K tok/s | lr 1.45e-03 | asst 8% | VRAM 31442MB (gpu0)\n", " step 8260 | loss 2.8170 | ppl 16.7 | 1546.3K tok/s | lr 1.44e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8280 | loss 2.8583 | ppl 17.4 | 1546.3K tok/s | lr 1.44e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8300 | loss 2.8410 | ppl 17.1 | 1546.5K tok/s | lr 1.44e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8320 | loss 2.8197 | ppl 16.8 | 1545.9K tok/s | lr 1.44e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8340 | loss 2.8311 | ppl 17.0 | 1545.9K tok/s | lr 1.43e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8360 | loss 2.8104 | ppl 16.6 | 1546.3K tok/s | lr 1.43e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8380 | loss 2.8299 | ppl 16.9 | 1547.2K tok/s | lr 1.43e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8400 | loss 2.8222 | ppl 16.8 | 1546.8K tok/s | lr 1.43e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8420 | loss 2.8282 | ppl 16.9 | 1548.2K tok/s | lr 1.42e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8440 | loss 2.8228 | ppl 16.8 | 1546.9K tok/s | lr 1.42e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8460 | loss 2.8030 | ppl 16.5 | 1547.5K tok/s | lr 1.42e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8480 | loss 2.8481 | ppl 17.3 | 1548.2K tok/s | lr 1.42e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8500 | loss 2.8193 | ppl 16.8 | 1548.7K tok/s | lr 1.41e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8520 | loss 2.8440 | ppl 17.2 | 1547.9K tok/s | lr 1.41e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8540 | loss 2.8247 | ppl 16.9 | 1546.9K tok/s | lr 1.41e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8560 | loss 2.8502 | ppl 17.3 | 1546.7K tok/s | lr 1.41e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8580 | loss 2.8919 | ppl 18.0 | 1546.6K tok/s | lr 1.40e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8600 | loss 2.8335 | ppl 17.0 | 1546.9K tok/s | lr 1.40e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8620 | loss 2.8181 | ppl 16.7 | 1546.7K tok/s | lr 1.40e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8640 | loss 2.8384 | ppl 17.1 | 1546.0K tok/s | lr 1.40e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8660 | loss 2.8309 | ppl 17.0 | 1546.9K tok/s | lr 1.39e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8680 | loss 2.8273 | ppl 16.9 | 1545.5K tok/s | lr 1.39e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8700 | loss 2.8270 | ppl 16.9 | 1546.7K tok/s | lr 1.39e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8720 | loss 2.7923 | ppl 16.3 | 1546.4K tok/s | lr 1.39e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8740 | loss 2.8428 | ppl 17.2 | 1546.9K tok/s | lr 1.38e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8760 | loss 2.8188 | ppl 16.8 | 1547.0K tok/s | lr 1.38e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8780 | loss 2.7994 | ppl 16.4 | 1547.0K tok/s | lr 1.38e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8800 | loss 2.8667 | ppl 17.6 | 1547.5K tok/s | lr 1.38e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8820 | loss 2.8221 | ppl 16.8 | 1547.0K tok/s | lr 1.37e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8840 | loss 2.8206 | ppl 16.8 | 1547.6K tok/s | lr 1.37e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8860 | loss 2.8432 | ppl 17.2 | 1547.4K tok/s | lr 1.37e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8880 | loss 2.8222 | ppl 16.8 | 1547.7K tok/s | lr 1.37e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8900 | loss 2.8186 | ppl 16.8 | 1547.7K tok/s | lr 1.36e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8920 | loss 2.8274 | ppl 16.9 | 1548.7K tok/s | lr 1.36e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8940 | loss 2.8423 | ppl 17.2 | 1549.5K tok/s | lr 1.36e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 8960 | loss 2.8127 | ppl 16.7 | 1549.5K tok/s | lr 1.36e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 8980 | loss 2.8800 | ppl 17.8 | 1549.6K tok/s | lr 1.35e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9000 | loss 2.7805 | ppl 16.1 | 1548.8K tok/s | lr 1.35e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7697 | ppl 16.0 | assistant_tokens 10,736,711 ★ best\n", " step 9020 | loss 2.8304 | ppl 17.0 | 1254.2K tok/s | lr 1.35e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9040 | loss 2.8135 | ppl 16.7 | 1547.3K tok/s | lr 1.35e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9060 | loss 2.8419 | ppl 17.1 | 1547.4K tok/s | lr 1.34e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9080 | loss 2.8372 | ppl 17.1 | 1547.3K tok/s | lr 1.34e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9100 | loss 2.8647 | ppl 17.5 | 1546.2K tok/s | lr 1.34e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9120 | loss 2.8093 | ppl 16.6 | 1546.4K tok/s | lr 1.34e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9140 | loss 2.8508 | ppl 17.3 | 1546.3K tok/s | lr 1.33e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9160 | loss 2.8402 | ppl 17.1 | 1546.3K tok/s | lr 1.33e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9180 | loss 2.9029 | ppl 18.2 | 1548.0K tok/s | lr 1.33e-03 | asst 5% | VRAM 31442MB (gpu0)\n", " step 9200 | loss 2.8075 | ppl 16.6 | 1546.2K tok/s | lr 1.33e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9220 | loss 2.8774 | ppl 17.8 | 1546.3K tok/s | lr 1.32e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9240 | loss 2.8092 | ppl 16.6 | 1546.6K tok/s | lr 1.32e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9260 | loss 2.8609 | ppl 17.5 | 1546.8K tok/s | lr 1.32e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9280 | loss 2.8053 | ppl 16.5 | 1546.3K tok/s | lr 1.31e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9300 | loss 2.8141 | ppl 16.7 | 1546.8K tok/s | lr 1.31e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9320 | loss 2.7905 | ppl 16.3 | 1546.8K tok/s | lr 1.31e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9340 | loss 2.8360 | ppl 17.0 | 1546.8K tok/s | lr 1.31e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9360 | loss 2.8983 | ppl 18.1 | 1547.3K tok/s | lr 1.30e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9380 | loss 2.8750 | ppl 17.7 | 1547.5K tok/s | lr 1.30e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9400 | loss 2.8810 | ppl 17.8 | 1547.5K tok/s | lr 1.30e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9420 | loss 2.8530 | ppl 17.3 | 1548.1K tok/s | lr 1.30e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9440 | loss 2.8013 | ppl 16.5 | 1548.6K tok/s | lr 1.29e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9460 | loss 2.7713 | ppl 16.0 | 1548.8K tok/s | lr 1.29e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9480 | loss 2.7776 | ppl 16.1 | 1548.2K tok/s | lr 1.29e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9500 | loss 2.8219 | ppl 16.8 | 1547.8K tok/s | lr 1.29e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9520 | loss 2.8138 | ppl 16.7 | 1547.5K tok/s | lr 1.28e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9540 | loss 2.8576 | ppl 17.4 | 1546.6K tok/s | lr 1.28e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9560 | loss 2.8249 | ppl 16.9 | 1546.4K tok/s | lr 1.28e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9580 | loss 2.7917 | ppl 16.3 | 1545.7K tok/s | lr 1.28e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9600 | loss 2.8003 | ppl 16.4 | 1546.1K tok/s | lr 1.27e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9620 | loss 2.7682 | ppl 15.9 | 1545.6K tok/s | lr 1.27e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9640 | loss 2.8501 | ppl 17.3 | 1545.7K tok/s | lr 1.27e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9660 | loss 2.7932 | ppl 16.3 | 1545.2K tok/s | lr 1.26e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9680 | loss 2.8669 | ppl 17.6 | 1545.7K tok/s | lr 1.26e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9700 | loss 2.8778 | ppl 17.8 | 1545.6K tok/s | lr 1.26e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9720 | loss 2.8406 | ppl 17.1 | 1546.6K tok/s | lr 1.26e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9740 | loss 2.8066 | ppl 16.6 | 1546.4K tok/s | lr 1.25e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9760 | loss 2.8216 | ppl 16.8 | 1546.2K tok/s | lr 1.25e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9780 | loss 2.8169 | ppl 16.7 | 1546.1K tok/s | lr 1.25e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9800 | loss 2.8514 | ppl 17.3 | 1546.3K tok/s | lr 1.25e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9820 | loss 2.7840 | ppl 16.2 | 1546.8K tok/s | lr 1.24e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9840 | loss 2.8086 | ppl 16.6 | 1547.1K tok/s | lr 1.24e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9860 | loss 2.7504 | ppl 15.6 | 1547.6K tok/s | lr 1.24e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9880 | loss 2.9044 | ppl 18.3 | 1547.8K tok/s | lr 1.24e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9900 | loss 2.8283 | ppl 16.9 | 1549.2K tok/s | lr 1.23e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 9920 | loss 2.8144 | ppl 16.7 | 1547.6K tok/s | lr 1.23e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9940 | loss 2.8618 | ppl 17.5 | 1548.4K tok/s | lr 1.23e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9960 | loss 2.8170 | ppl 16.7 | 1547.2K tok/s | lr 1.22e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 9980 | loss 2.8571 | ppl 17.4 | 1547.2K tok/s | lr 1.22e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10000 | loss 2.8098 | ppl 16.6 | 1546.2K tok/s | lr 1.22e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7812 | ppl 16.1 | assistant_tokens 10,736,711\n", " step 10020 | loss 2.8711 | ppl 17.7 | 1252.2K tok/s | lr 1.22e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10040 | loss 2.8578 | ppl 17.4 | 1546.1K tok/s | lr 1.21e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10060 | loss 2.8322 | ppl 17.0 | 1546.1K tok/s | lr 1.21e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10080 | loss 2.8126 | ppl 16.7 | 1545.5K tok/s | lr 1.21e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10100 | loss 2.8262 | ppl 16.9 | 1545.8K tok/s | lr 1.21e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10120 | loss 2.8097 | ppl 16.6 | 1545.6K tok/s | lr 1.20e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10140 | loss 2.8466 | ppl 17.2 | 1546.0K tok/s | lr 1.20e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10160 | loss 2.7866 | ppl 16.2 | 1546.1K tok/s | lr 1.20e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10180 | loss 2.8480 | ppl 17.3 | 1546.2K tok/s | lr 1.20e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10200 | loss 2.8119 | ppl 16.6 | 1546.1K tok/s | lr 1.19e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10220 | loss 2.8402 | ppl 17.1 | 1545.5K tok/s | lr 1.19e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10240 | loss 2.8413 | ppl 17.1 | 1546.0K tok/s | lr 1.19e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10260 | loss 2.8374 | ppl 17.1 | 1546.6K tok/s | lr 1.18e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10280 | loss 2.7981 | ppl 16.4 | 1546.3K tok/s | lr 1.18e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10300 | loss 2.8419 | ppl 17.1 | 1546.8K tok/s | lr 1.18e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10320 | loss 2.7969 | ppl 16.4 | 1546.9K tok/s | lr 1.18e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10340 | loss 2.8138 | ppl 16.7 | 1547.2K tok/s | lr 1.17e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10360 | loss 2.8125 | ppl 16.7 | 1547.0K tok/s | lr 1.17e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10380 | loss 2.8271 | ppl 16.9 | 1548.1K tok/s | lr 1.17e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10400 | loss 2.8862 | ppl 17.9 | 1548.2K tok/s | lr 1.17e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10420 | loss 2.8669 | ppl 17.6 | 1547.4K tok/s | lr 1.16e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10440 | loss 2.7968 | ppl 16.4 | 1546.7K tok/s | lr 1.16e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10460 | loss 2.7785 | ppl 16.1 | 1546.7K tok/s | lr 1.16e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10480 | loss 2.8055 | ppl 16.5 | 1546.5K tok/s | lr 1.15e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10500 | loss 2.7864 | ppl 16.2 | 1546.3K tok/s | lr 1.15e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10520 | loss 2.8410 | ppl 17.1 | 1546.5K tok/s | lr 1.15e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10540 | loss 2.8234 | ppl 16.8 | 1546.5K tok/s | lr 1.15e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10560 | loss 2.8259 | ppl 16.9 | 1546.0K tok/s | lr 1.14e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10580 | loss 2.8134 | ppl 16.7 | 1545.7K tok/s | lr 1.14e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10600 | loss 2.8490 | ppl 17.3 | 1546.5K tok/s | lr 1.14e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10620 | loss 2.8176 | ppl 16.7 | 1546.5K tok/s | lr 1.14e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10640 | loss 2.8126 | ppl 16.7 | 1545.5K tok/s | lr 1.13e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10660 | loss 2.8109 | ppl 16.6 | 1546.0K tok/s | lr 1.13e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10680 | loss 2.8639 | ppl 17.5 | 1545.8K tok/s | lr 1.13e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10700 | loss 2.8786 | ppl 17.8 | 1545.7K tok/s | lr 1.12e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10720 | loss 2.8214 | ppl 16.8 | 1546.8K tok/s | lr 1.12e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10740 | loss 2.8252 | ppl 16.9 | 1547.4K tok/s | lr 1.12e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10760 | loss 2.8435 | ppl 17.2 | 1547.3K tok/s | lr 1.12e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10780 | loss 2.8099 | ppl 16.6 | 1547.2K tok/s | lr 1.11e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10800 | loss 2.7633 | ppl 15.9 | 1546.8K tok/s | lr 1.11e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10820 | loss 2.8305 | ppl 17.0 | 1548.0K tok/s | lr 1.11e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10840 | loss 2.8090 | ppl 16.6 | 1548.1K tok/s | lr 1.11e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10860 | loss 2.8188 | ppl 16.8 | 1548.8K tok/s | lr 1.10e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10880 | loss 2.8505 | ppl 17.3 | 1548.3K tok/s | lr 1.10e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10900 | loss 2.8249 | ppl 16.9 | 1547.8K tok/s | lr 1.10e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10920 | loss 2.8111 | ppl 16.6 | 1547.2K tok/s | lr 1.09e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10940 | loss 2.8160 | ppl 16.7 | 1546.6K tok/s | lr 1.09e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 10960 | loss 2.8178 | ppl 16.7 | 1546.5K tok/s | lr 1.09e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 10980 | loss 2.7914 | ppl 16.3 | 1545.8K tok/s | lr 1.09e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11000 | loss 2.8581 | ppl 17.4 | 1546.3K tok/s | lr 1.08e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7658 | ppl 15.9 | assistant_tokens 10,736,711 ★ best\n", " step 11020 | loss 2.8015 | ppl 16.5 | 1252.1K tok/s | lr 1.08e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11040 | loss 2.8914 | ppl 18.0 | 1546.1K tok/s | lr 1.08e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11060 | loss 2.8093 | ppl 16.6 | 1546.6K tok/s | lr 1.07e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11080 | loss 2.8387 | ppl 17.1 | 1545.4K tok/s | lr 1.07e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11100 | loss 2.8326 | ppl 17.0 | 1546.0K tok/s | lr 1.07e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11120 | loss 2.8667 | ppl 17.6 | 1546.2K tok/s | lr 1.07e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11140 | loss 2.8229 | ppl 16.8 | 1546.2K tok/s | lr 1.06e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11160 | loss 2.8155 | ppl 16.7 | 1546.6K tok/s | lr 1.06e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11180 | loss 2.8324 | ppl 17.0 | 1546.5K tok/s | lr 1.06e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11200 | loss 2.8442 | ppl 17.2 | 1546.7K tok/s | lr 1.06e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11220 | loss 2.8272 | ppl 16.9 | 1546.5K tok/s | lr 1.05e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11240 | loss 2.7872 | ppl 16.2 | 1547.5K tok/s | lr 1.05e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11260 | loss 2.8991 | ppl 18.2 | 1546.2K tok/s | lr 1.05e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11280 | loss 2.8125 | ppl 16.7 | 1547.1K tok/s | lr 1.04e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11300 | loss 2.7788 | ppl 16.1 | 1546.2K tok/s | lr 1.04e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11320 | loss 2.8185 | ppl 16.8 | 1547.3K tok/s | lr 1.04e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11340 | loss 2.8033 | ppl 16.5 | 1547.4K tok/s | lr 1.04e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11360 | loss 2.8168 | ppl 16.7 | 1547.1K tok/s | lr 1.03e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11380 | loss 2.8026 | ppl 16.5 | 1546.5K tok/s | lr 1.03e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11400 | loss 2.8192 | ppl 16.8 | 1546.4K tok/s | lr 1.03e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11420 | loss 2.8070 | ppl 16.6 | 1547.4K tok/s | lr 1.03e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11440 | loss 2.8323 | ppl 17.0 | 1546.7K tok/s | lr 1.02e-03 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11460 | loss 2.7925 | ppl 16.3 | 1545.8K tok/s | lr 1.02e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11480 | loss 2.8093 | ppl 16.6 | 1546.1K tok/s | lr 1.02e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11500 | loss 2.8375 | ppl 17.1 | 1545.6K tok/s | lr 1.01e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11520 | loss 2.8736 | ppl 17.7 | 1546.2K tok/s | lr 1.01e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11540 | loss 2.7647 | ppl 15.9 | 1545.9K tok/s | lr 1.01e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11560 | loss 2.8158 | ppl 16.7 | 1546.0K tok/s | lr 1.01e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11580 | loss 2.8561 | ppl 17.4 | 1545.7K tok/s | lr 1.00e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11600 | loss 2.8382 | ppl 17.1 | 1546.0K tok/s | lr 1.00e-03 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11620 | loss 2.8025 | ppl 16.5 | 1545.7K tok/s | lr 9.98e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11640 | loss 2.7893 | ppl 16.3 | 1546.3K tok/s | lr 9.95e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11660 | loss 2.8364 | ppl 17.1 | 1546.4K tok/s | lr 9.92e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11680 | loss 2.8315 | ppl 17.0 | 1546.6K tok/s | lr 9.90e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11700 | loss 2.8416 | ppl 17.1 | 1546.5K tok/s | lr 9.87e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11720 | loss 2.7913 | ppl 16.3 | 1546.0K tok/s | lr 9.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11740 | loss 2.8111 | ppl 16.6 | 1546.8K tok/s | lr 9.81e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11760 | loss 2.7752 | ppl 16.0 | 1547.1K tok/s | lr 9.79e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11780 | loss 2.8049 | ppl 16.5 | 1547.7K tok/s | lr 9.76e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11800 | loss 2.7894 | ppl 16.3 | 1548.2K tok/s | lr 9.73e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11820 | loss 2.8726 | ppl 17.7 | 1549.1K tok/s | lr 9.70e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11840 | loss 2.8583 | ppl 17.4 | 1548.3K tok/s | lr 9.68e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11860 | loss 2.8319 | ppl 17.0 | 1547.4K tok/s | lr 9.65e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11880 | loss 2.8067 | ppl 16.6 | 1546.5K tok/s | lr 9.62e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11900 | loss 2.8215 | ppl 16.8 | 1546.6K tok/s | lr 9.59e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11920 | loss 2.7782 | ppl 16.1 | 1545.5K tok/s | lr 9.57e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 11940 | loss 2.8043 | ppl 16.5 | 1545.6K tok/s | lr 9.54e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11960 | loss 2.8402 | ppl 17.1 | 1545.7K tok/s | lr 9.51e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 11980 | loss 2.8428 | ppl 17.2 | 1545.3K tok/s | lr 9.48e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12000 | loss 2.7878 | ppl 16.2 | 1545.7K tok/s | lr 9.46e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7581 | ppl 15.8 | assistant_tokens 10,736,711 ★ best\n", " >>> saved checkpoints/sft/sft_step_0012000.pt\n", " step 12020 | loss 2.8162 | ppl 16.7 | 1247.9K tok/s | lr 9.43e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12040 | loss 2.8275 | ppl 16.9 | 1545.2K tok/s | lr 9.40e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12060 | loss 2.7972 | ppl 16.4 | 1545.5K tok/s | lr 9.37e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12080 | loss 2.8459 | ppl 17.2 | 1546.0K tok/s | lr 9.35e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12100 | loss 2.8413 | ppl 17.1 | 1545.9K tok/s | lr 9.32e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12120 | loss 2.7997 | ppl 16.4 | 1545.6K tok/s | lr 9.29e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12140 | loss 2.8585 | ppl 17.4 | 1546.0K tok/s | lr 9.26e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12160 | loss 2.8648 | ppl 17.5 | 1546.8K tok/s | lr 9.24e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12180 | loss 2.8227 | ppl 16.8 | 1546.3K tok/s | lr 9.21e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12200 | loss 2.8401 | ppl 17.1 | 1546.9K tok/s | lr 9.18e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12220 | loss 2.8460 | ppl 17.2 | 1547.9K tok/s | lr 9.15e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12240 | loss 2.8322 | ppl 17.0 | 1548.4K tok/s | lr 9.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12260 | loss 2.8702 | ppl 17.6 | 1548.8K tok/s | lr 9.10e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12280 | loss 2.8185 | ppl 16.8 | 1549.2K tok/s | lr 9.07e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12300 | loss 2.8417 | ppl 17.1 | 1548.8K tok/s | lr 9.04e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12320 | loss 2.7416 | ppl 15.5 | 1548.3K tok/s | lr 9.02e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12340 | loss 2.8065 | ppl 16.6 | 1548.1K tok/s | lr 8.99e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12360 | loss 2.8323 | ppl 17.0 | 1546.9K tok/s | lr 8.96e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12380 | loss 2.7936 | ppl 16.3 | 1547.2K tok/s | lr 8.93e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12400 | loss 2.8704 | ppl 17.6 | 1546.4K tok/s | lr 8.91e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12420 | loss 2.8356 | ppl 17.0 | 1545.9K tok/s | lr 8.88e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12440 | loss 2.8222 | ppl 16.8 | 1545.9K tok/s | lr 8.85e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12460 | loss 2.8186 | ppl 16.8 | 1546.0K tok/s | lr 8.83e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12480 | loss 2.8191 | ppl 16.8 | 1545.9K tok/s | lr 8.80e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12500 | loss 2.8240 | ppl 16.8 | 1546.2K tok/s | lr 8.77e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12520 | loss 2.8054 | ppl 16.5 | 1545.8K tok/s | lr 8.74e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12540 | loss 2.8248 | ppl 16.9 | 1545.9K tok/s | lr 8.72e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12560 | loss 2.8329 | ppl 17.0 | 1546.4K tok/s | lr 8.69e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12580 | loss 2.8250 | ppl 16.9 | 1546.1K tok/s | lr 8.66e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12600 | loss 2.8348 | ppl 17.0 | 1546.6K tok/s | lr 8.63e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12620 | loss 2.8224 | ppl 16.8 | 1546.6K tok/s | lr 8.61e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12640 | loss 2.8493 | ppl 17.3 | 1546.2K tok/s | lr 8.58e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12660 | loss 2.7696 | ppl 16.0 | 1547.1K tok/s | lr 8.55e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12680 | loss 2.8262 | ppl 16.9 | 1547.0K tok/s | lr 8.52e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12700 | loss 2.8318 | ppl 17.0 | 1547.3K tok/s | lr 8.50e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12720 | loss 2.7685 | ppl 15.9 | 1548.0K tok/s | lr 8.47e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12740 | loss 2.8454 | ppl 17.2 | 1548.9K tok/s | lr 8.44e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12760 | loss 2.8221 | ppl 16.8 | 1548.5K tok/s | lr 8.42e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12780 | loss 2.7981 | ppl 16.4 | 1548.0K tok/s | lr 8.39e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12800 | loss 2.7956 | ppl 16.4 | 1547.6K tok/s | lr 8.36e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12820 | loss 2.8552 | ppl 17.4 | 1547.2K tok/s | lr 8.33e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12840 | loss 2.7781 | ppl 16.1 | 1547.1K tok/s | lr 8.31e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12860 | loss 2.8373 | ppl 17.1 | 1546.3K tok/s | lr 8.28e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12880 | loss 2.7982 | ppl 16.4 | 1546.3K tok/s | lr 8.25e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12900 | loss 2.8088 | ppl 16.6 | 1546.4K tok/s | lr 8.23e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12920 | loss 2.8450 | ppl 17.2 | 1546.2K tok/s | lr 8.20e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12940 | loss 2.8098 | ppl 16.6 | 1545.8K tok/s | lr 8.17e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 12960 | loss 2.8441 | ppl 17.2 | 1545.6K tok/s | lr 8.14e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 12980 | loss 2.8240 | ppl 16.8 | 1545.3K tok/s | lr 8.12e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13000 | loss 2.8199 | ppl 16.8 | 1545.9K tok/s | lr 8.09e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7572 | ppl 15.8 | assistant_tokens 10,736,711 ★ best\n", " step 13020 | loss 2.8484 | ppl 17.3 | 1252.4K tok/s | lr 8.06e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13040 | loss 2.7934 | ppl 16.3 | 1546.0K tok/s | lr 8.04e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13060 | loss 2.7926 | ppl 16.3 | 1546.9K tok/s | lr 8.01e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13080 | loss 2.7887 | ppl 16.3 | 1547.1K tok/s | lr 7.98e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13100 | loss 2.8449 | ppl 17.2 | 1546.0K tok/s | lr 7.96e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13120 | loss 2.7965 | ppl 16.4 | 1546.6K tok/s | lr 7.93e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13140 | loss 2.7816 | ppl 16.1 | 1546.5K tok/s | lr 7.90e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13160 | loss 2.8161 | ppl 16.7 | 1547.1K tok/s | lr 7.87e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13180 | loss 2.8380 | ppl 17.1 | 1547.4K tok/s | lr 7.85e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13200 | loss 2.7996 | ppl 16.4 | 1549.0K tok/s | lr 7.82e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13220 | loss 2.7979 | ppl 16.4 | 1548.8K tok/s | lr 7.79e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13240 | loss 2.8045 | ppl 16.5 | 1549.0K tok/s | lr 7.77e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13260 | loss 2.7938 | ppl 16.3 | 1548.6K tok/s | lr 7.74e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13280 | loss 2.8254 | ppl 16.9 | 1547.8K tok/s | lr 7.71e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13300 | loss 2.8146 | ppl 16.7 | 1547.2K tok/s | lr 7.69e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13320 | loss 2.7690 | ppl 15.9 | 1546.5K tok/s | lr 7.66e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13340 | loss 2.7440 | ppl 15.5 | 1546.2K tok/s | lr 7.63e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13360 | loss 2.8507 | ppl 17.3 | 1546.0K tok/s | lr 7.61e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13380 | loss 2.8209 | ppl 16.8 | 1546.5K tok/s | lr 7.58e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13400 | loss 2.8703 | ppl 17.6 | 1545.6K tok/s | lr 7.55e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13420 | loss 2.7991 | ppl 16.4 | 1545.1K tok/s | lr 7.53e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13440 | loss 2.8093 | ppl 16.6 | 1545.0K tok/s | lr 7.50e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13460 | loss 2.8119 | ppl 16.6 | 1545.6K tok/s | lr 7.47e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13480 | loss 2.7949 | ppl 16.4 | 1546.6K tok/s | lr 7.45e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13500 | loss 2.8732 | ppl 17.7 | 1546.7K tok/s | lr 7.42e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13520 | loss 2.7753 | ppl 16.0 | 1546.4K tok/s | lr 7.39e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13540 | loss 2.8328 | ppl 17.0 | 1546.4K tok/s | lr 7.37e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13560 | loss 2.8349 | ppl 17.0 | 1547.0K tok/s | lr 7.34e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13580 | loss 2.8363 | ppl 17.1 | 1547.0K tok/s | lr 7.31e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13600 | loss 2.7580 | ppl 15.8 | 1547.5K tok/s | lr 7.29e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13620 | loss 2.7866 | ppl 16.2 | 1547.2K tok/s | lr 7.26e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13640 | loss 2.8386 | ppl 17.1 | 1547.3K tok/s | lr 7.23e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13660 | loss 2.8181 | ppl 16.7 | 1548.1K tok/s | lr 7.21e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13680 | loss 2.8289 | ppl 16.9 | 1549.1K tok/s | lr 7.18e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13700 | loss 2.8331 | ppl 17.0 | 1548.5K tok/s | lr 7.15e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13720 | loss 2.8756 | ppl 17.7 | 1548.7K tok/s | lr 7.13e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13740 | loss 2.8106 | ppl 16.6 | 1548.5K tok/s | lr 7.10e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13760 | loss 2.7910 | ppl 16.3 | 1547.4K tok/s | lr 7.08e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13780 | loss 2.8272 | ppl 16.9 | 1546.5K tok/s | lr 7.05e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13800 | loss 2.8098 | ppl 16.6 | 1546.8K tok/s | lr 7.02e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13820 | loss 2.8024 | ppl 16.5 | 1546.3K tok/s | lr 7.00e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13840 | loss 2.8291 | ppl 16.9 | 1546.8K tok/s | lr 6.97e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13860 | loss 2.8165 | ppl 16.7 | 1546.8K tok/s | lr 6.94e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13880 | loss 2.7941 | ppl 16.3 | 1545.7K tok/s | lr 6.92e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13900 | loss 2.8519 | ppl 17.3 | 1545.8K tok/s | lr 6.89e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13920 | loss 2.8186 | ppl 16.8 | 1545.9K tok/s | lr 6.87e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13940 | loss 2.7824 | ppl 16.2 | 1546.8K tok/s | lr 6.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 13960 | loss 2.8060 | ppl 16.5 | 1546.6K tok/s | lr 6.81e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 13980 | loss 2.8177 | ppl 16.7 | 1547.0K tok/s | lr 6.79e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14000 | loss 2.8004 | ppl 16.5 | 1547.2K tok/s | lr 6.76e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7545 | ppl 15.7 | assistant_tokens 10,736,711 ★ best\n", " step 14020 | loss 2.7797 | ppl 16.1 | 1252.8K tok/s | lr 6.73e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14040 | loss 2.8445 | ppl 17.2 | 1547.7K tok/s | lr 6.71e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14060 | loss 2.7897 | ppl 16.3 | 1547.9K tok/s | lr 6.68e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14080 | loss 2.7795 | ppl 16.1 | 1547.9K tok/s | lr 6.66e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14100 | loss 2.8097 | ppl 16.6 | 1548.3K tok/s | lr 6.63e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14120 | loss 2.7513 | ppl 15.7 | 1549.4K tok/s | lr 6.60e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14140 | loss 2.7734 | ppl 16.0 | 1549.3K tok/s | lr 6.58e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14160 | loss 2.7914 | ppl 16.3 | 1549.0K tok/s | lr 6.55e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14180 | loss 2.8217 | ppl 16.8 | 1548.6K tok/s | lr 6.53e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14200 | loss 2.8529 | ppl 17.3 | 1549.5K tok/s | lr 6.50e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14220 | loss 2.7775 | ppl 16.1 | 1547.9K tok/s | lr 6.48e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14240 | loss 2.8129 | ppl 16.7 | 1547.7K tok/s | lr 6.45e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14260 | loss 2.8793 | ppl 17.8 | 1546.4K tok/s | lr 6.42e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14280 | loss 2.7950 | ppl 16.4 | 1546.6K tok/s | lr 6.40e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14300 | loss 2.8057 | ppl 16.5 | 1546.5K tok/s | lr 6.37e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14320 | loss 2.8497 | ppl 17.3 | 1547.0K tok/s | lr 6.35e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14340 | loss 2.7764 | ppl 16.1 | 1546.1K tok/s | lr 6.32e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14360 | loss 2.7704 | ppl 16.0 | 1546.8K tok/s | lr 6.30e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14380 | loss 2.7491 | ppl 15.6 | 1546.0K tok/s | lr 6.27e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14400 | loss 2.7639 | ppl 15.9 | 1546.5K tok/s | lr 6.24e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14420 | loss 2.8808 | ppl 17.8 | 1546.5K tok/s | lr 6.22e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14440 | loss 2.7740 | ppl 16.0 | 1546.9K tok/s | lr 6.19e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14460 | loss 2.8030 | ppl 16.5 | 1546.9K tok/s | lr 6.17e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14480 | loss 2.8198 | ppl 16.8 | 1546.2K tok/s | lr 6.14e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14500 | loss 2.8546 | ppl 17.4 | 1546.5K tok/s | lr 6.12e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14520 | loss 2.7826 | ppl 16.2 | 1547.5K tok/s | lr 6.09e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14540 | loss 2.8297 | ppl 16.9 | 1547.3K tok/s | lr 6.07e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14560 | loss 2.8455 | ppl 17.2 | 1548.1K tok/s | lr 6.04e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14580 | loss 2.7809 | ppl 16.1 | 1548.4K tok/s | lr 6.02e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14600 | loss 2.8057 | ppl 16.5 | 1548.5K tok/s | lr 5.99e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14620 | loss 2.8695 | ppl 17.6 | 1549.1K tok/s | lr 5.97e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14640 | loss 2.8145 | ppl 16.7 | 1549.1K tok/s | lr 5.94e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14660 | loss 2.7388 | ppl 15.5 | 1549.0K tok/s | lr 5.92e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14680 | loss 2.8326 | ppl 17.0 | 1548.4K tok/s | lr 5.89e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14700 | loss 2.8038 | ppl 16.5 | 1547.7K tok/s | lr 5.86e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14720 | loss 2.8218 | ppl 16.8 | 1547.0K tok/s | lr 5.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14740 | loss 2.7979 | ppl 16.4 | 1547.2K tok/s | lr 5.81e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14760 | loss 2.7969 | ppl 16.4 | 1546.0K tok/s | lr 5.79e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14780 | loss 2.8321 | ppl 17.0 | 1545.8K tok/s | lr 5.76e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14800 | loss 2.8125 | ppl 16.7 | 1545.4K tok/s | lr 5.74e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 14820 | loss 2.8124 | ppl 16.7 | 1546.4K tok/s | lr 5.71e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14840 | loss 2.8202 | ppl 16.8 | 1545.9K tok/s | lr 5.69e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14860 | loss 2.8500 | ppl 17.3 | 1546.1K tok/s | lr 5.67e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14880 | loss 2.7670 | ppl 15.9 | 1545.7K tok/s | lr 5.64e-04 | asst 8% | VRAM 31442MB (gpu0)\n", " step 14900 | loss 2.7708 | ppl 16.0 | 1546.7K tok/s | lr 5.62e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14920 | loss 2.8352 | ppl 17.0 | 1546.5K tok/s | lr 5.59e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14940 | loss 2.7971 | ppl 16.4 | 1546.4K tok/s | lr 5.57e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14960 | loss 2.7664 | ppl 15.9 | 1546.4K tok/s | lr 5.54e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 14980 | loss 2.8884 | ppl 18.0 | 1546.7K tok/s | lr 5.52e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15000 | loss 2.7986 | ppl 16.4 | 1547.4K tok/s | lr 5.49e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7515 | ppl 15.7 | assistant_tokens 10,736,711 ★ best\n", " step 15020 | loss 2.8519 | ppl 17.3 | 1253.1K tok/s | lr 5.47e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15040 | loss 2.7791 | ppl 16.1 | 1547.0K tok/s | lr 5.44e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15060 | loss 2.8113 | ppl 16.6 | 1547.1K tok/s | lr 5.42e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15080 | loss 2.8179 | ppl 16.7 | 1547.6K tok/s | lr 5.39e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15100 | loss 2.8352 | ppl 17.0 | 1549.0K tok/s | lr 5.37e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15120 | loss 2.8607 | ppl 17.5 | 1548.6K tok/s | lr 5.35e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15140 | loss 2.8280 | ppl 16.9 | 1548.9K tok/s | lr 5.32e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15160 | loss 2.8298 | ppl 16.9 | 1547.7K tok/s | lr 5.30e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15180 | loss 2.7799 | ppl 16.1 | 1546.9K tok/s | lr 5.27e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15200 | loss 2.7907 | ppl 16.3 | 1546.8K tok/s | lr 5.25e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15220 | loss 2.7936 | ppl 16.3 | 1546.4K tok/s | lr 5.22e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15240 | loss 2.8247 | ppl 16.9 | 1546.3K tok/s | lr 5.20e-04 | asst 5% | VRAM 31442MB (gpu0)\n", " step 15260 | loss 2.8042 | ppl 16.5 | 1546.0K tok/s | lr 5.18e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15280 | loss 2.8328 | ppl 17.0 | 1546.0K tok/s | lr 5.15e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15300 | loss 2.8392 | ppl 17.1 | 1546.2K tok/s | lr 5.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15320 | loss 2.7913 | ppl 16.3 | 1545.4K tok/s | lr 5.10e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15340 | loss 2.8007 | ppl 16.5 | 1545.8K tok/s | lr 5.08e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15360 | loss 2.8067 | ppl 16.6 | 1545.5K tok/s | lr 5.06e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15380 | loss 2.8303 | ppl 16.9 | 1545.7K tok/s | lr 5.03e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15400 | loss 2.8110 | ppl 16.6 | 1545.7K tok/s | lr 5.01e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15420 | loss 2.7618 | ppl 15.8 | 1546.1K tok/s | lr 4.98e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15440 | loss 2.8226 | ppl 16.8 | 1545.9K tok/s | lr 4.96e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15460 | loss 2.7968 | ppl 16.4 | 1546.3K tok/s | lr 4.94e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15480 | loss 2.8307 | ppl 17.0 | 1546.4K tok/s | lr 4.91e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15500 | loss 2.8034 | ppl 16.5 | 1546.0K tok/s | lr 4.89e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15520 | loss 2.8051 | ppl 16.5 | 1547.3K tok/s | lr 4.87e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15540 | loss 2.8101 | ppl 16.6 | 1547.3K tok/s | lr 4.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15560 | loss 2.8139 | ppl 16.7 | 1547.4K tok/s | lr 4.82e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15580 | loss 2.8647 | ppl 17.5 | 1547.3K tok/s | lr 4.79e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15600 | loss 2.8389 | ppl 17.1 | 1548.8K tok/s | lr 4.77e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15620 | loss 2.7977 | ppl 16.4 | 1547.9K tok/s | lr 4.75e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15640 | loss 2.8025 | ppl 16.5 | 1546.4K tok/s | lr 4.72e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15660 | loss 2.7926 | ppl 16.3 | 1546.0K tok/s | lr 4.70e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15680 | loss 2.8049 | ppl 16.5 | 1546.0K tok/s | lr 4.68e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15700 | loss 2.7876 | ppl 16.2 | 1545.9K tok/s | lr 4.65e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15720 | loss 2.8390 | ppl 17.1 | 1545.9K tok/s | lr 4.63e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15740 | loss 2.7785 | ppl 16.1 | 1545.9K tok/s | lr 4.61e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15760 | loss 2.8160 | ppl 16.7 | 1546.3K tok/s | lr 4.58e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15780 | loss 2.8321 | ppl 17.0 | 1546.7K tok/s | lr 4.56e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15800 | loss 2.7553 | ppl 15.7 | 1545.8K tok/s | lr 4.54e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15820 | loss 2.8305 | ppl 17.0 | 1546.5K tok/s | lr 4.51e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15840 | loss 2.8600 | ppl 17.5 | 1546.9K tok/s | lr 4.49e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15860 | loss 2.8382 | ppl 17.1 | 1546.3K tok/s | lr 4.47e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15880 | loss 2.7606 | ppl 15.8 | 1546.8K tok/s | lr 4.45e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15900 | loss 2.8062 | ppl 16.5 | 1546.5K tok/s | lr 4.42e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 15920 | loss 2.7717 | ppl 16.0 | 1547.2K tok/s | lr 4.40e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15940 | loss 2.8324 | ppl 17.0 | 1546.9K tok/s | lr 4.38e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15960 | loss 2.7568 | ppl 15.7 | 1547.7K tok/s | lr 4.35e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 15980 | loss 2.7780 | ppl 16.1 | 1546.6K tok/s | lr 4.33e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16000 | loss 2.7554 | ppl 15.7 | 1547.5K tok/s | lr 4.31e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7488 | ppl 15.6 | assistant_tokens 10,736,711 ★ best\n", " >>> saved checkpoints/sft/sft_step_0016000.pt\n", " step 16020 | loss 2.7931 | ppl 16.3 | 1250.2K tok/s | lr 4.29e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16040 | loss 2.8191 | ppl 16.8 | 1549.0K tok/s | lr 4.26e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16060 | loss 2.7888 | ppl 16.3 | 1548.6K tok/s | lr 4.24e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16080 | loss 2.8401 | ppl 17.1 | 1548.9K tok/s | lr 4.22e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16100 | loss 2.7920 | ppl 16.3 | 1548.8K tok/s | lr 4.20e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16120 | loss 2.7389 | ppl 15.5 | 1547.6K tok/s | lr 4.17e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16140 | loss 2.7709 | ppl 16.0 | 1547.0K tok/s | lr 4.15e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16160 | loss 2.7788 | ppl 16.1 | 1546.6K tok/s | lr 4.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16180 | loss 2.8326 | ppl 17.0 | 1546.2K tok/s | lr 4.11e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16200 | loss 2.8089 | ppl 16.6 | 1546.3K tok/s | lr 4.08e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16220 | loss 2.7954 | ppl 16.4 | 1546.4K tok/s | lr 4.06e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16240 | loss 2.8213 | ppl 16.8 | 1545.8K tok/s | lr 4.04e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16260 | loss 2.7940 | ppl 16.3 | 1545.8K tok/s | lr 4.02e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16280 | loss 2.7905 | ppl 16.3 | 1546.2K tok/s | lr 4.00e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16300 | loss 2.7884 | ppl 16.3 | 1545.3K tok/s | lr 3.97e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16320 | loss 2.7836 | ppl 16.2 | 1545.8K tok/s | lr 3.95e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16340 | loss 2.7420 | ppl 15.5 | 1546.2K tok/s | lr 3.93e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16360 | loss 2.7728 | ppl 16.0 | 1545.6K tok/s | lr 3.91e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16380 | loss 2.7858 | ppl 16.2 | 1546.3K tok/s | lr 3.89e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16400 | loss 2.8010 | ppl 16.5 | 1546.4K tok/s | lr 3.87e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16420 | loss 2.7785 | ppl 16.1 | 1546.3K tok/s | lr 3.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16440 | loss 2.7999 | ppl 16.4 | 1546.9K tok/s | lr 3.82e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16460 | loss 2.7992 | ppl 16.4 | 1546.4K tok/s | lr 3.80e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16480 | loss 2.7634 | ppl 15.9 | 1547.8K tok/s | lr 3.78e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16500 | loss 2.7644 | ppl 15.9 | 1547.8K tok/s | lr 3.76e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16520 | loss 2.7923 | ppl 16.3 | 1547.7K tok/s | lr 3.74e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16540 | loss 2.8066 | ppl 16.6 | 1548.9K tok/s | lr 3.71e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16560 | loss 2.7812 | ppl 16.1 | 1548.3K tok/s | lr 3.69e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16580 | loss 2.8103 | ppl 16.6 | 1548.1K tok/s | lr 3.67e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16600 | loss 2.7460 | ppl 15.6 | 1547.6K tok/s | lr 3.65e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16620 | loss 2.7579 | ppl 15.8 | 1546.9K tok/s | lr 3.63e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16640 | loss 2.7925 | ppl 16.3 | 1546.8K tok/s | lr 3.61e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16660 | loss 2.7712 | ppl 16.0 | 1546.1K tok/s | lr 3.59e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16680 | loss 2.7766 | ppl 16.1 | 1546.0K tok/s | lr 3.57e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16700 | loss 2.8238 | ppl 16.8 | 1546.6K tok/s | lr 3.54e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16720 | loss 2.7702 | ppl 16.0 | 1545.8K tok/s | lr 3.52e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16740 | loss 2.8077 | ppl 16.6 | 1545.4K tok/s | lr 3.50e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16760 | loss 2.7625 | ppl 15.8 | 1545.6K tok/s | lr 3.48e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16780 | loss 2.8667 | ppl 17.6 | 1545.4K tok/s | lr 3.46e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16800 | loss 2.7892 | ppl 16.3 | 1546.2K tok/s | lr 3.44e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16820 | loss 2.7537 | ppl 15.7 | 1545.8K tok/s | lr 3.42e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16840 | loss 2.8246 | ppl 16.9 | 1545.2K tok/s | lr 3.40e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16860 | loss 2.7644 | ppl 15.9 | 1546.2K tok/s | lr 3.38e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16880 | loss 2.7702 | ppl 16.0 | 1545.9K tok/s | lr 3.36e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16900 | loss 2.7686 | ppl 15.9 | 1546.0K tok/s | lr 3.34e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16920 | loss 2.7639 | ppl 15.9 | 1546.5K tok/s | lr 3.32e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16940 | loss 2.8240 | ppl 16.8 | 1547.0K tok/s | lr 3.30e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 16960 | loss 2.7705 | ppl 16.0 | 1547.5K tok/s | lr 3.27e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 16980 | loss 2.7735 | ppl 16.0 | 1547.6K tok/s | lr 3.25e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17000 | loss 2.8119 | ppl 16.6 | 1547.8K tok/s | lr 3.23e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7472 | ppl 15.6 | assistant_tokens 10,736,711 ★ best\n", " step 17020 | loss 2.8168 | ppl 16.7 | 1254.1K tok/s | lr 3.21e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17040 | loss 2.7685 | ppl 15.9 | 1547.9K tok/s | lr 3.19e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17060 | loss 2.8346 | ppl 17.0 | 1547.6K tok/s | lr 3.17e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17080 | loss 2.8163 | ppl 16.7 | 1546.8K tok/s | lr 3.15e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17100 | loss 2.7817 | ppl 16.1 | 1546.7K tok/s | lr 3.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17120 | loss 2.8522 | ppl 17.3 | 1545.6K tok/s | lr 3.11e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17140 | loss 2.8290 | ppl 16.9 | 1545.3K tok/s | lr 3.09e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17160 | loss 2.8533 | ppl 17.3 | 1545.5K tok/s | lr 3.07e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17180 | loss 2.7578 | ppl 15.8 | 1545.1K tok/s | lr 3.05e-04 | asst 8% | VRAM 31442MB (gpu0)\n", " step 17200 | loss 2.7787 | ppl 16.1 | 1544.7K tok/s | lr 3.03e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17220 | loss 2.8018 | ppl 16.5 | 1545.6K tok/s | lr 3.01e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17240 | loss 2.8306 | ppl 17.0 | 1545.0K tok/s | lr 2.99e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17260 | loss 2.8009 | ppl 16.5 | 1545.9K tok/s | lr 2.97e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17280 | loss 2.8191 | ppl 16.8 | 1545.9K tok/s | lr 2.96e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17300 | loss 2.7892 | ppl 16.3 | 1545.3K tok/s | lr 2.94e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17320 | loss 2.7915 | ppl 16.3 | 1545.9K tok/s | lr 2.92e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17340 | loss 2.7531 | ppl 15.7 | 1545.4K tok/s | lr 2.90e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17360 | loss 2.7661 | ppl 15.9 | 1545.7K tok/s | lr 2.88e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17380 | loss 2.8099 | ppl 16.6 | 1546.0K tok/s | lr 2.86e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17400 | loss 2.7772 | ppl 16.1 | 1545.6K tok/s | lr 2.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17420 | loss 2.7630 | ppl 15.8 | 1546.2K tok/s | lr 2.82e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17440 | loss 2.8524 | ppl 17.3 | 1547.5K tok/s | lr 2.80e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17460 | loss 2.7970 | ppl 16.4 | 1548.1K tok/s | lr 2.78e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17480 | loss 2.8110 | ppl 16.6 | 1548.3K tok/s | lr 2.76e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17500 | loss 2.7772 | ppl 16.1 | 1548.9K tok/s | lr 2.74e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17520 | loss 2.8363 | ppl 17.1 | 1548.2K tok/s | lr 2.72e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17540 | loss 2.7647 | ppl 15.9 | 1549.1K tok/s | lr 2.71e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17560 | loss 2.7534 | ppl 15.7 | 1547.5K tok/s | lr 2.69e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17580 | loss 2.8413 | ppl 17.1 | 1546.3K tok/s | lr 2.67e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17600 | loss 2.8258 | ppl 16.9 | 1547.3K tok/s | lr 2.65e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17620 | loss 2.8609 | ppl 17.5 | 1546.7K tok/s | lr 2.63e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17640 | loss 2.8079 | ppl 16.6 | 1546.5K tok/s | lr 2.61e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17660 | loss 2.8040 | ppl 16.5 | 1546.3K tok/s | lr 2.59e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17680 | loss 2.7653 | ppl 15.9 | 1545.7K tok/s | lr 2.58e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17700 | loss 2.7941 | ppl 16.3 | 1546.1K tok/s | lr 2.56e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17720 | loss 2.8485 | ppl 17.3 | 1545.5K tok/s | lr 2.54e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17740 | loss 2.7695 | ppl 16.0 | 1546.0K tok/s | lr 2.52e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17760 | loss 2.7857 | ppl 16.2 | 1545.9K tok/s | lr 2.50e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17780 | loss 2.7178 | ppl 15.1 | 1546.4K tok/s | lr 2.48e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17800 | loss 2.8460 | ppl 17.2 | 1546.5K tok/s | lr 2.47e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17820 | loss 2.7530 | ppl 15.7 | 1546.5K tok/s | lr 2.45e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17840 | loss 2.7558 | ppl 15.7 | 1546.6K tok/s | lr 2.43e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17860 | loss 2.7926 | ppl 16.3 | 1546.3K tok/s | lr 2.41e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17880 | loss 2.7836 | ppl 16.2 | 1547.5K tok/s | lr 2.39e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17900 | loss 2.8163 | ppl 16.7 | 1547.4K tok/s | lr 2.38e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17920 | loss 2.8139 | ppl 16.7 | 1547.5K tok/s | lr 2.36e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 17940 | loss 2.7524 | ppl 15.7 | 1548.4K tok/s | lr 2.34e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17960 | loss 2.8488 | ppl 17.3 | 1548.6K tok/s | lr 2.32e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 17980 | loss 2.8231 | ppl 16.8 | 1549.3K tok/s | lr 2.30e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18000 | loss 2.7997 | ppl 16.4 | 1547.8K tok/s | lr 2.29e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7455 | ppl 15.6 | assistant_tokens 10,736,711 ★ best\n", " step 18020 | loss 2.7955 | ppl 16.4 | 1253.7K tok/s | lr 2.27e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18040 | loss 2.8009 | ppl 16.5 | 1546.6K tok/s | lr 2.25e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18060 | loss 2.7862 | ppl 16.2 | 1546.0K tok/s | lr 2.23e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18080 | loss 2.7988 | ppl 16.4 | 1545.3K tok/s | lr 2.22e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18100 | loss 2.7843 | ppl 16.2 | 1546.1K tok/s | lr 2.20e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18120 | loss 2.8492 | ppl 17.3 | 1546.6K tok/s | lr 2.18e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18140 | loss 2.7681 | ppl 15.9 | 1546.1K tok/s | lr 2.17e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18160 | loss 2.7545 | ppl 15.7 | 1546.1K tok/s | lr 2.15e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18180 | loss 2.7671 | ppl 15.9 | 1546.6K tok/s | lr 2.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18200 | loss 2.7583 | ppl 15.8 | 1546.2K tok/s | lr 2.11e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18220 | loss 2.7919 | ppl 16.3 | 1546.0K tok/s | lr 2.10e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18240 | loss 2.7984 | ppl 16.4 | 1546.6K tok/s | lr 2.08e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18260 | loss 2.8030 | ppl 16.5 | 1547.0K tok/s | lr 2.06e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18280 | loss 2.8551 | ppl 17.4 | 1547.1K tok/s | lr 2.05e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18300 | loss 2.8095 | ppl 16.6 | 1547.0K tok/s | lr 2.03e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18320 | loss 2.7662 | ppl 15.9 | 1547.0K tok/s | lr 2.01e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18340 | loss 2.7425 | ppl 15.5 | 1547.3K tok/s | lr 2.00e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18360 | loss 2.8342 | ppl 17.0 | 1548.1K tok/s | lr 1.98e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18380 | loss 2.7508 | ppl 15.7 | 1548.3K tok/s | lr 1.96e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18400 | loss 2.8112 | ppl 16.6 | 1548.4K tok/s | lr 1.95e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18420 | loss 2.7580 | ppl 15.8 | 1548.9K tok/s | lr 1.93e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18440 | loss 2.8105 | ppl 16.6 | 1548.7K tok/s | lr 1.92e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18460 | loss 2.8215 | ppl 16.8 | 1548.8K tok/s | lr 1.90e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18480 | loss 2.7706 | ppl 16.0 | 1546.8K tok/s | lr 1.88e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18500 | loss 2.7761 | ppl 16.1 | 1547.0K tok/s | lr 1.87e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18520 | loss 2.7844 | ppl 16.2 | 1546.8K tok/s | lr 1.85e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18540 | loss 2.7774 | ppl 16.1 | 1546.4K tok/s | lr 1.84e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18560 | loss 2.7794 | ppl 16.1 | 1546.0K tok/s | lr 1.82e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18580 | loss 2.8179 | ppl 16.7 | 1545.9K tok/s | lr 1.80e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18600 | loss 2.7677 | ppl 15.9 | 1545.8K tok/s | lr 1.79e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18620 | loss 2.8498 | ppl 17.3 | 1545.7K tok/s | lr 1.77e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18640 | loss 2.7714 | ppl 16.0 | 1545.2K tok/s | lr 1.76e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18660 | loss 2.7575 | ppl 15.8 | 1545.0K tok/s | lr 1.74e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18680 | loss 2.8187 | ppl 16.8 | 1545.5K tok/s | lr 1.73e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18700 | loss 2.7520 | ppl 15.7 | 1545.8K tok/s | lr 1.71e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18720 | loss 2.7822 | ppl 16.2 | 1544.8K tok/s | lr 1.69e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18740 | loss 2.8475 | ppl 17.2 | 1545.8K tok/s | lr 1.68e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18760 | loss 2.7715 | ppl 16.0 | 1546.1K tok/s | lr 1.66e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18780 | loss 2.8563 | ppl 17.4 | 1546.8K tok/s | lr 1.65e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18800 | loss 2.7748 | ppl 16.0 | 1546.8K tok/s | lr 1.63e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18820 | loss 2.8054 | ppl 16.5 | 1546.8K tok/s | lr 1.62e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18840 | loss 2.7983 | ppl 16.4 | 1547.1K tok/s | lr 1.60e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18860 | loss 2.7969 | ppl 16.4 | 1547.7K tok/s | lr 1.59e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18880 | loss 2.8236 | ppl 16.8 | 1548.0K tok/s | lr 1.57e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18900 | loss 2.8163 | ppl 16.7 | 1547.5K tok/s | lr 1.56e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18920 | loss 2.8179 | ppl 16.7 | 1549.3K tok/s | lr 1.54e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18940 | loss 2.7898 | ppl 16.3 | 1548.9K tok/s | lr 1.53e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 18960 | loss 2.7341 | ppl 15.4 | 1547.3K tok/s | lr 1.52e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 18980 | loss 2.8430 | ppl 17.2 | 1547.1K tok/s | lr 1.50e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19000 | loss 2.7447 | ppl 15.6 | 1546.6K tok/s | lr 1.49e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7437 | ppl 15.5 | assistant_tokens 10,736,711 ★ best\n", " step 19020 | loss 2.8352 | ppl 17.0 | 1251.8K tok/s | lr 1.47e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19040 | loss 2.8066 | ppl 16.6 | 1545.8K tok/s | lr 1.46e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19060 | loss 2.7931 | ppl 16.3 | 1545.5K tok/s | lr 1.44e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19080 | loss 2.8041 | ppl 16.5 | 1545.8K tok/s | lr 1.43e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19100 | loss 2.7497 | ppl 15.6 | 1544.9K tok/s | lr 1.41e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19120 | loss 2.7779 | ppl 16.1 | 1545.6K tok/s | lr 1.40e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19140 | loss 2.7295 | ppl 15.3 | 1544.8K tok/s | lr 1.39e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19160 | loss 2.8025 | ppl 16.5 | 1545.8K tok/s | lr 1.37e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19180 | loss 2.7959 | ppl 16.4 | 1546.0K tok/s | lr 1.36e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19200 | loss 2.7827 | ppl 16.2 | 1545.9K tok/s | lr 1.35e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19220 | loss 2.7777 | ppl 16.1 | 1546.2K tok/s | lr 1.33e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19240 | loss 2.8266 | ppl 16.9 | 1546.7K tok/s | lr 1.32e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19260 | loss 2.7966 | ppl 16.4 | 1547.4K tok/s | lr 1.30e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19280 | loss 2.8364 | ppl 17.1 | 1547.5K tok/s | lr 1.29e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19300 | loss 2.8180 | ppl 16.7 | 1546.8K tok/s | lr 1.28e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19320 | loss 2.7722 | ppl 16.0 | 1548.4K tok/s | lr 1.26e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19340 | loss 2.8359 | ppl 17.0 | 1548.6K tok/s | lr 1.25e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19360 | loss 2.7861 | ppl 16.2 | 1549.5K tok/s | lr 1.24e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19380 | loss 2.8269 | ppl 16.9 | 1549.0K tok/s | lr 1.22e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19400 | loss 2.7944 | ppl 16.4 | 1549.0K tok/s | lr 1.21e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19420 | loss 2.7577 | ppl 15.8 | 1549.6K tok/s | lr 1.20e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19440 | loss 2.7727 | ppl 16.0 | 1548.7K tok/s | lr 1.18e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19460 | loss 2.8333 | ppl 17.0 | 1547.7K tok/s | lr 1.17e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19480 | loss 2.7910 | ppl 16.3 | 1547.6K tok/s | lr 1.16e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19500 | loss 2.8128 | ppl 16.7 | 1546.2K tok/s | lr 1.15e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19520 | loss 2.8179 | ppl 16.7 | 1546.5K tok/s | lr 1.13e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19540 | loss 2.8196 | ppl 16.8 | 1546.7K tok/s | lr 1.12e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19560 | loss 2.8289 | ppl 16.9 | 1545.6K tok/s | lr 1.11e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19580 | loss 2.7811 | ppl 16.1 | 1545.9K tok/s | lr 1.09e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19600 | loss 2.8014 | ppl 16.5 | 1546.5K tok/s | lr 1.08e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19620 | loss 2.8013 | ppl 16.5 | 1532.1K tok/s | lr 1.07e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19640 | loss 2.8730 | ppl 17.7 | 1546.2K tok/s | lr 1.06e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19660 | loss 2.8044 | ppl 16.5 | 1545.4K tok/s | lr 1.05e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19680 | loss 2.7768 | ppl 16.1 | 1546.8K tok/s | lr 1.03e-04 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19700 | loss 2.8283 | ppl 16.9 | 1546.0K tok/s | lr 1.02e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19720 | loss 2.8391 | ppl 17.1 | 1546.3K tok/s | lr 1.01e-04 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19740 | loss 2.8232 | ppl 16.8 | 1546.6K tok/s | lr 9.97e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19760 | loss 2.7940 | ppl 16.3 | 1502.2K tok/s | lr 9.85e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19780 | loss 2.8167 | ppl 16.7 | 1547.0K tok/s | lr 9.73e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19800 | loss 2.8578 | ppl 17.4 | 1547.1K tok/s | lr 9.61e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19820 | loss 2.8120 | ppl 16.6 | 1548.5K tok/s | lr 9.49e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19840 | loss 2.7861 | ppl 16.2 | 1549.5K tok/s | lr 9.38e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19860 | loss 2.7517 | ppl 15.7 | 1549.8K tok/s | lr 9.26e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19880 | loss 2.7873 | ppl 16.2 | 1520.9K tok/s | lr 9.14e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19900 | loss 2.7880 | ppl 16.2 | 1548.7K tok/s | lr 9.03e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19920 | loss 2.7725 | ppl 16.0 | 1546.9K tok/s | lr 8.92e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19940 | loss 2.7597 | ppl 15.8 | 1546.2K tok/s | lr 8.80e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 19960 | loss 2.8266 | ppl 16.9 | 1546.8K tok/s | lr 8.69e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 19980 | loss 2.7712 | ppl 16.0 | 1546.0K tok/s | lr 8.58e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20000 | loss 2.7605 | ppl 15.8 | 1546.0K tok/s | lr 8.47e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7424 | ppl 15.5 | assistant_tokens 10,736,711 ★ best\n", " >>> saved checkpoints/sft/sft_step_0020000.pt\n", " step 20020 | loss 2.8061 | ppl 16.5 | 1248.4K tok/s | lr 8.36e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20040 | loss 2.8080 | ppl 16.6 | 1545.7K tok/s | lr 8.25e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20060 | loss 2.7920 | ppl 16.3 | 1545.8K tok/s | lr 8.14e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20080 | loss 2.8989 | ppl 18.2 | 1545.8K tok/s | lr 8.03e-05 | asst 5% | VRAM 31442MB (gpu0)\n", " step 20100 | loss 2.7992 | ppl 16.4 | 1546.7K tok/s | lr 7.92e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20120 | loss 2.8535 | ppl 17.3 | 1545.7K tok/s | lr 7.81e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20140 | loss 2.8504 | ppl 17.3 | 1546.4K tok/s | lr 7.71e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20160 | loss 2.7717 | ppl 16.0 | 1546.6K tok/s | lr 7.60e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20180 | loss 2.7461 | ppl 15.6 | 1546.6K tok/s | lr 7.50e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20200 | loss 2.8372 | ppl 17.1 | 1545.9K tok/s | lr 7.39e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20220 | loss 2.8087 | ppl 16.6 | 1546.4K tok/s | lr 7.29e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20240 | loss 2.8338 | ppl 17.0 | 1547.0K tok/s | lr 7.19e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20260 | loss 2.8375 | ppl 17.1 | 1547.8K tok/s | lr 7.08e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20280 | loss 2.7651 | ppl 15.9 | 1547.7K tok/s | lr 6.98e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20300 | loss 2.7892 | ppl 16.3 | 1549.1K tok/s | lr 6.88e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20320 | loss 2.7851 | ppl 16.2 | 1549.0K tok/s | lr 6.78e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20340 | loss 2.7549 | ppl 15.7 | 1549.2K tok/s | lr 6.68e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20360 | loss 2.7838 | ppl 16.2 | 1548.1K tok/s | lr 6.58e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20380 | loss 2.7989 | ppl 16.4 | 1547.6K tok/s | lr 6.49e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20400 | loss 2.8553 | ppl 17.4 | 1546.6K tok/s | lr 6.39e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20420 | loss 2.7915 | ppl 16.3 | 1545.8K tok/s | lr 6.29e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20440 | loss 2.7921 | ppl 16.3 | 1545.8K tok/s | lr 6.20e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20460 | loss 2.8199 | ppl 16.8 | 1545.8K tok/s | lr 6.10e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20480 | loss 2.7615 | ppl 15.8 | 1545.9K tok/s | lr 6.01e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20500 | loss 2.7623 | ppl 15.8 | 1545.9K tok/s | lr 5.91e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20520 | loss 2.8053 | ppl 16.5 | 1545.6K tok/s | lr 5.82e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20540 | loss 2.8099 | ppl 16.6 | 1545.1K tok/s | lr 5.73e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20560 | loss 2.7571 | ppl 15.8 | 1545.4K tok/s | lr 5.64e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20580 | loss 2.7828 | ppl 16.2 | 1546.1K tok/s | lr 5.54e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20600 | loss 2.8416 | ppl 17.1 | 1546.5K tok/s | lr 5.45e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20620 | loss 2.7873 | ppl 16.2 | 1546.5K tok/s | lr 5.37e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20640 | loss 2.8250 | ppl 16.9 | 1546.6K tok/s | lr 5.28e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20660 | loss 2.7480 | ppl 15.6 | 1546.7K tok/s | lr 5.19e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20680 | loss 2.7496 | ppl 15.6 | 1546.3K tok/s | lr 5.10e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20700 | loss 2.7900 | ppl 16.3 | 1547.5K tok/s | lr 5.02e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20720 | loss 2.7828 | ppl 16.2 | 1547.5K tok/s | lr 4.93e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20740 | loss 2.7931 | ppl 16.3 | 1548.4K tok/s | lr 4.84e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20760 | loss 2.7637 | ppl 15.9 | 1549.0K tok/s | lr 4.76e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20780 | loss 2.7710 | ppl 16.0 | 1548.9K tok/s | lr 4.68e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20800 | loss 2.7500 | ppl 15.6 | 1549.3K tok/s | lr 4.59e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20820 | loss 2.7773 | ppl 16.1 | 1549.1K tok/s | lr 4.51e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20840 | loss 2.7884 | ppl 16.3 | 1548.2K tok/s | lr 4.43e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20860 | loss 2.7993 | ppl 16.4 | 1547.6K tok/s | lr 4.35e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20880 | loss 2.7701 | ppl 16.0 | 1545.9K tok/s | lr 4.27e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20900 | loss 2.8062 | ppl 16.5 | 1546.0K tok/s | lr 4.19e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 20920 | loss 2.7938 | ppl 16.3 | 1546.2K tok/s | lr 4.11e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20940 | loss 2.7904 | ppl 16.3 | 1545.3K tok/s | lr 4.03e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20960 | loss 2.7778 | ppl 16.1 | 1546.2K tok/s | lr 3.96e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 20980 | loss 2.8175 | ppl 16.7 | 1545.8K tok/s | lr 3.88e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21000 | loss 2.7673 | ppl 15.9 | 1546.8K tok/s | lr 3.80e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7417 | ppl 15.5 | assistant_tokens 10,736,711 ★ best\n", " step 21020 | loss 2.7834 | ppl 16.2 | 1252.2K tok/s | lr 3.73e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21040 | loss 2.7753 | ppl 16.0 | 1546.6K tok/s | lr 3.66e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21060 | loss 2.8302 | ppl 16.9 | 1546.5K tok/s | lr 3.58e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21080 | loss 2.7928 | ppl 16.3 | 1546.2K tok/s | lr 3.51e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21100 | loss 2.7684 | ppl 15.9 | 1547.2K tok/s | lr 3.44e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21120 | loss 2.7690 | ppl 15.9 | 1546.9K tok/s | lr 3.37e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21140 | loss 2.8001 | ppl 16.4 | 1547.7K tok/s | lr 3.30e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21160 | loss 2.7922 | ppl 16.3 | 1547.5K tok/s | lr 3.23e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21180 | loss 2.8303 | ppl 17.0 | 1547.1K tok/s | lr 3.16e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21200 | loss 2.7812 | ppl 16.1 | 1548.9K tok/s | lr 3.09e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21220 | loss 2.7674 | ppl 15.9 | 1549.5K tok/s | lr 3.02e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21240 | loss 2.7684 | ppl 15.9 | 1548.9K tok/s | lr 2.95e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21260 | loss 2.7989 | ppl 16.4 | 1550.2K tok/s | lr 2.89e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21280 | loss 2.8205 | ppl 16.8 | 1550.4K tok/s | lr 2.82e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21300 | loss 2.8217 | ppl 16.8 | 1550.1K tok/s | lr 2.76e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21320 | loss 2.8033 | ppl 16.5 | 1548.3K tok/s | lr 2.69e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21340 | loss 2.7504 | ppl 15.6 | 1547.9K tok/s | lr 2.63e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21360 | loss 2.8185 | ppl 16.8 | 1547.1K tok/s | lr 2.57e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21380 | loss 2.8342 | ppl 17.0 | 1546.5K tok/s | lr 2.51e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21400 | loss 2.8009 | ppl 16.5 | 1546.1K tok/s | lr 2.45e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21420 | loss 2.7514 | ppl 15.7 | 1546.1K tok/s | lr 2.39e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21440 | loss 2.7524 | ppl 15.7 | 1546.0K tok/s | lr 2.33e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21460 | loss 2.8116 | ppl 16.6 | 1546.3K tok/s | lr 2.27e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21480 | loss 2.8075 | ppl 16.6 | 1546.3K tok/s | lr 2.21e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21500 | loss 2.8288 | ppl 16.9 | 1545.9K tok/s | lr 2.15e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21520 | loss 2.7671 | ppl 15.9 | 1546.3K tok/s | lr 2.10e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21540 | loss 2.7982 | ppl 16.4 | 1546.7K tok/s | lr 2.04e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21560 | loss 2.8367 | ppl 17.1 | 1546.1K tok/s | lr 1.99e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21580 | loss 2.7770 | ppl 16.1 | 1546.9K tok/s | lr 1.93e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21600 | loss 2.7861 | ppl 16.2 | 1546.9K tok/s | lr 1.88e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21620 | loss 2.8137 | ppl 16.7 | 1547.3K tok/s | lr 1.82e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21640 | loss 2.7479 | ppl 15.6 | 1547.1K tok/s | lr 1.77e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21660 | loss 2.7899 | ppl 16.3 | 1548.2K tok/s | lr 1.72e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21680 | loss 2.8146 | ppl 16.7 | 1548.2K tok/s | lr 1.67e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21700 | loss 2.7987 | ppl 16.4 | 1548.4K tok/s | lr 1.62e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21720 | loss 2.7964 | ppl 16.4 | 1548.5K tok/s | lr 1.57e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21740 | loss 2.7793 | ppl 16.1 | 1549.1K tok/s | lr 1.52e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21760 | loss 2.7883 | ppl 16.3 | 1548.9K tok/s | lr 1.48e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21780 | loss 2.7803 | ppl 16.1 | 1548.5K tok/s | lr 1.43e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21800 | loss 2.8333 | ppl 17.0 | 1547.3K tok/s | lr 1.38e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21820 | loss 2.7495 | ppl 15.6 | 1547.3K tok/s | lr 1.34e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21840 | loss 2.8097 | ppl 16.6 | 1546.7K tok/s | lr 1.29e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21860 | loss 2.7884 | ppl 16.3 | 1546.8K tok/s | lr 1.25e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21880 | loss 2.8289 | ppl 16.9 | 1546.4K tok/s | lr 1.21e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21900 | loss 2.8029 | ppl 16.5 | 1546.4K tok/s | lr 1.16e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21920 | loss 2.8389 | ppl 17.1 | 1546.0K tok/s | lr 1.12e-05 | asst 6% | VRAM 31442MB (gpu0)\n", " step 21940 | loss 2.7648 | ppl 15.9 | 1545.4K tok/s | lr 1.08e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21960 | loss 2.7792 | ppl 16.1 | 1545.8K tok/s | lr 1.04e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 21980 | loss 2.7949 | ppl 16.4 | 1546.5K tok/s | lr 1.00e-05 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22000 | loss 2.8198 | ppl 16.8 | 1546.1K tok/s | lr 9.64e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7412 | ppl 15.5 | assistant_tokens 10,736,711 ★ best\n", " step 22020 | loss 2.7801 | ppl 16.1 | 1252.5K tok/s | lr 9.26e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22040 | loss 2.7954 | ppl 16.4 | 1546.2K tok/s | lr 8.89e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22060 | loss 2.8522 | ppl 17.3 | 1546.7K tok/s | lr 8.53e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22080 | loss 2.8251 | ppl 16.9 | 1547.3K tok/s | lr 8.18e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22100 | loss 2.7643 | ppl 15.9 | 1547.1K tok/s | lr 7.83e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22120 | loss 2.7809 | ppl 16.1 | 1547.7K tok/s | lr 7.49e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22140 | loss 2.7942 | ppl 16.3 | 1548.1K tok/s | lr 7.16e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22160 | loss 2.7744 | ppl 16.0 | 1548.6K tok/s | lr 6.83e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22180 | loss 2.8040 | ppl 16.5 | 1548.6K tok/s | lr 6.51e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22200 | loss 2.7789 | ppl 16.1 | 1549.5K tok/s | lr 6.20e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22220 | loss 2.7767 | ppl 16.1 | 1548.5K tok/s | lr 5.90e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22240 | loss 2.7642 | ppl 15.9 | 1549.4K tok/s | lr 5.60e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22260 | loss 2.8457 | ppl 17.2 | 1547.4K tok/s | lr 5.32e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22280 | loss 2.8027 | ppl 16.5 | 1547.3K tok/s | lr 5.04e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22300 | loss 2.7499 | ppl 15.6 | 1547.4K tok/s | lr 4.76e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22320 | loss 2.8332 | ppl 17.0 | 1546.4K tok/s | lr 4.50e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22340 | loss 2.8003 | ppl 16.4 | 1546.6K tok/s | lr 4.24e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22360 | loss 2.7616 | ppl 15.8 | 1546.1K tok/s | lr 3.99e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22380 | loss 2.7603 | ppl 15.8 | 1546.0K tok/s | lr 3.75e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22400 | loss 2.7444 | ppl 15.6 | 1546.3K tok/s | lr 3.52e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22420 | loss 2.7731 | ppl 16.0 | 1545.6K tok/s | lr 3.29e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22440 | loss 2.7836 | ppl 16.2 | 1546.6K tok/s | lr 3.07e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22460 | loss 2.7871 | ppl 16.2 | 1545.7K tok/s | lr 2.86e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22480 | loss 2.8267 | ppl 16.9 | 1545.3K tok/s | lr 2.65e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22500 | loss 2.8365 | ppl 17.1 | 1546.6K tok/s | lr 2.46e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22520 | loss 2.7665 | ppl 15.9 | 1545.7K tok/s | lr 2.27e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22540 | loss 2.7787 | ppl 16.1 | 1546.8K tok/s | lr 2.09e-06 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22560 | loss 2.8012 | ppl 16.5 | 1546.1K tok/s | lr 1.91e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22580 | loss 2.7521 | ppl 15.7 | 1547.0K tok/s | lr 1.75e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22600 | loss 2.7824 | ppl 16.2 | 1546.8K tok/s | lr 1.59e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22620 | loss 2.7567 | ppl 15.7 | 1546.9K tok/s | lr 1.44e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22640 | loss 2.8085 | ppl 16.6 | 1548.8K tok/s | lr 1.29e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22660 | loss 2.7880 | ppl 16.2 | 1547.4K tok/s | lr 1.16e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22680 | loss 2.8117 | ppl 16.6 | 1548.7K tok/s | lr 1.03e-06 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22700 | loss 2.8480 | ppl 17.3 | 1547.9K tok/s | lr 9.06e-07 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22720 | loss 2.7854 | ppl 16.2 | 1549.4K tok/s | lr 7.92e-07 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22740 | loss 2.7699 | ppl 16.0 | 1548.5K tok/s | lr 6.86e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22760 | loss 2.7577 | ppl 15.8 | 1548.0K tok/s | lr 5.88e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22780 | loss 2.7608 | ppl 15.8 | 1546.9K tok/s | lr 4.97e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22800 | loss 2.7631 | ppl 15.8 | 1546.7K tok/s | lr 4.14e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22820 | loss 2.8000 | ppl 16.4 | 1546.6K tok/s | lr 3.39e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22840 | loss 2.8283 | ppl 16.9 | 1546.3K tok/s | lr 2.71e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22860 | loss 2.7946 | ppl 16.4 | 1546.2K tok/s | lr 2.11e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22880 | loss 2.7627 | ppl 15.8 | 1545.8K tok/s | lr 1.58e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22900 | loss 2.7710 | ppl 16.0 | 1546.2K tok/s | lr 1.13e-07 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22920 | loss 2.7956 | ppl 16.4 | 1546.0K tok/s | lr 7.51e-08 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22940 | loss 2.8047 | ppl 16.5 | 1547.1K tok/s | lr 4.52e-08 | asst 6% | VRAM 31442MB (gpu0)\n", " step 22960 | loss 2.8014 | ppl 16.5 | 1546.8K tok/s | lr 2.28e-08 | asst 7% | VRAM 31442MB (gpu0)\n", " step 22980 | loss 2.8370 | ppl 17.1 | 1546.5K tok/s | lr 7.98e-09 | asst 6% | VRAM 31442MB (gpu0)\n", " step 23000 | loss 2.7796 | ppl 16.1 | 1546.6K tok/s | lr 7.68e-10 | asst 6% | VRAM 31442MB (gpu0)\n", " >>> eval loss 2.7412 | ppl 15.5 | assistant_tokens 10,736,711 ★ best\n", "\n", "Final eval: loss 2.7412 | ppl 15.5\n", "Final SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", "\n", "======================================================================\n", "SFT complete. 36,188,454,912 tokens processed.\n", "Best eval loss: 2.7412\n", "======================================================================\n" ] } ], "source": [ "!torchrun --nproc_per_node=8 -m freqformer.sft_train \\\n", " --preset small \\\n", " --distributed ddp \\\n", " --data_dir sft \\\n", " --pretrain_checkpoint checkpoints/pretrain/step_0007827.pt \\\n", " --batch_size 3 \\\n", " --seq_len 16384 \\\n", " --grad_accum_steps 4 \\\n", " --num_epochs 1 \\\n", " --warmup_steps 200 \\\n", " --optimizer muon \\\n", " --lr 0.07 \\\n", " --lr_schedule cosine \\\n", " --log_every 20 \\\n", " --eval_every 1000 \\\n", " --checkpoint_every 4000 \\\n", " --checkpoint_dir checkpoints/sft" ] }, { "cell_type": "code", "execution_count": 14, "id": "bb561b6c-566f-48f1-baa5-46eac13d1176", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "W0214 02:34:16.673000 43122 torch/distributed/run.py:774] \n", "W0214 02:34:16.673000 43122 torch/distributed/run.py:774] *****************************************\n", "W0214 02:34:16.673000 43122 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. \n", "W0214 02:34:16.673000 43122 torch/distributed/run.py:774] *****************************************\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "======================================================================\n", "FreqFormer DPO (8 GPUs, DDP)\n", "======================================================================\n", "DPO data vocab_size=32003, pad_id=32000\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", "Policy model: 33.2M parameters (vocab=32003)\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", "Loading SFT checkpoint: checkpoints/sft/sft_step_0023008.pt\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "Creating reference model (frozen copy)...\n", " Loaded SFT model: step 23008, best loss 2.74121767830018\n", "Compiling blocks with torch.compile (mode=default)...\n", "Policy wrapped with DDP (8 GPUs)\n", "DPO Train: 196,000 pairs, seq_len=16384\n", "DPO Val: 4,000 pairs, seq_len=16384\n", "Epoch-based training: 2 epochs x 3062 steps = 6124 total\n", "Optimizer: splus\n", " total: 33.2M\n", "\n", "DPO Training: steps 0→6124, B=2x8gpu, T=16384, GA=4, beta=0.3\n", "Effective batch: 2,097,152 tokens/step (chosen+rejected)\n", "Device: NVIDIA GeForce RTX 5090\n", "======================================================================\n", " step 10 | loss 0.8676 | acc 39% | margin -0.178 | 996.2K tok/s | lr 9.22e-05 | VRAM 29630MB (gpu0)\n", " step 20 | loss 0.7508 | acc 39% | margin -0.029 | 1184.8K tok/s | lr 1.95e-04 | VRAM 29630MB (gpu0)\n", " step 30 | loss 0.7099 | acc 56% | margin 0.048 | 1192.7K tok/s | lr 2.97e-04 | VRAM 29630MB (gpu0)\n", " step 40 | loss 0.8009 | acc 49% | margin -0.082 | 1192.7K tok/s | lr 3.99e-04 | VRAM 29630MB (gpu0)\n", " step 50 | loss 0.7292 | acc 51% | margin 0.004 | 1191.8K tok/s | lr 5.02e-04 | VRAM 29630MB (gpu0)\n", " step 60 | loss 0.7806 | acc 46% | margin -0.110 | 1192.3K tok/s | lr 6.04e-04 | VRAM 29630MB (gpu0)\n", " step 70 | loss 0.7461 | acc 46% | margin 0.001 | 1192.4K tok/s | lr 7.07e-04 | VRAM 29630MB (gpu0)\n", " step 80 | loss 0.7638 | acc 46% | margin -0.057 | 1192.9K tok/s | lr 8.09e-04 | VRAM 29630MB (gpu0)\n", " step 90 | loss 0.7708 | acc 42% | margin -0.069 | 1192.9K tok/s | lr 9.11e-04 | VRAM 29630MB (gpu0)\n", " step 100 | loss 0.7558 | acc 46% | margin -0.042 | 1136.6K tok/s | lr 1.01e-03 | VRAM 29630MB (gpu0)\n", " step 110 | loss 0.7612 | acc 55% | margin -0.022 | 1192.8K tok/s | lr 1.12e-03 | VRAM 29630MB (gpu0)\n", " step 120 | loss 0.7212 | acc 42% | margin 0.022 | 1193.0K tok/s | lr 1.22e-03 | VRAM 29630MB (gpu0)\n", " step 130 | loss 0.6950 | acc 57% | margin 0.076 | 1193.1K tok/s | lr 1.32e-03 | VRAM 29630MB (gpu0)\n", " step 140 | loss 0.7292 | acc 48% | margin 0.007 | 1193.2K tok/s | lr 1.42e-03 | VRAM 29630MB (gpu0)\n", " step 150 | loss 0.7204 | acc 55% | margin 0.030 | 1193.3K tok/s | lr 1.53e-03 | VRAM 29630MB (gpu0)\n", " step 160 | loss 0.7455 | acc 54% | margin -0.014 | 1193.2K tok/s | lr 1.63e-03 | VRAM 29630MB (gpu0)\n", " step 170 | loss 0.6852 | acc 65% | margin 0.110 | 1192.9K tok/s | lr 1.73e-03 | VRAM 29630MB (gpu0)\n", " step 180 | loss 0.6825 | acc 52% | margin 0.108 | 1193.0K tok/s | lr 1.83e-03 | VRAM 29630MB (gpu0)\n", " step 190 | loss 0.6449 | acc 55% | margin 0.193 | 1193.1K tok/s | lr 1.94e-03 | VRAM 29630MB (gpu0)\n", " step 200 | loss 0.7463 | acc 56% | margin 0.054 | 1137.1K tok/s | lr 2.04e-03 | VRAM 29630MB (gpu0)\n", " step 210 | loss 0.7144 | acc 61% | margin 0.022 | 1192.6K tok/s | lr 2.04e-03 | VRAM 29630MB (gpu0)\n", " step 220 | loss 0.7327 | acc 52% | margin 0.051 | 1192.8K tok/s | lr 2.04e-03 | VRAM 29630MB (gpu0)\n", " step 230 | loss 0.7185 | acc 60% | margin 0.029 | 1193.0K tok/s | lr 2.04e-03 | VRAM 29630MB (gpu0)\n", " step 240 | loss 0.7046 | acc 54% | margin 0.093 | 1193.2K tok/s | lr 2.03e-03 | VRAM 29630MB (gpu0)\n", " step 250 | loss 0.7292 | acc 56% | margin 0.053 | 1193.0K tok/s | lr 2.03e-03 | VRAM 29630MB (gpu0)\n", " step 260 | loss 0.7161 | acc 52% | margin 0.045 | 1192.9K tok/s | lr 2.03e-03 | VRAM 29630MB (gpu0)\n", " step 270 | loss 0.6701 | acc 54% | margin 0.167 | 1193.0K tok/s | lr 2.02e-03 | VRAM 29630MB (gpu0)\n", " step 280 | loss 0.6352 | acc 69% | margin 0.256 | 1192.9K tok/s | lr 2.02e-03 | VRAM 29630MB (gpu0)\n", " step 290 | loss 0.6334 | acc 68% | margin 0.250 | 1193.2K tok/s | lr 2.02e-03 | VRAM 29630MB (gpu0)\n", " step 300 | loss 0.6500 | acc 61% | margin 0.231 | 1138.4K tok/s | lr 2.01e-03 | VRAM 29630MB (gpu0)\n", " step 310 | loss 0.7357 | acc 50% | margin 0.053 | 1193.4K tok/s | lr 2.01e-03 | VRAM 29630MB (gpu0)\n", " step 320 | loss 0.6706 | acc 61% | margin 0.159 | 1193.1K tok/s | lr 2.01e-03 | VRAM 29630MB (gpu0)\n", " step 330 | loss 0.6552 | acc 64% | margin 0.149 | 1193.2K tok/s | lr 2.00e-03 | VRAM 29630MB (gpu0)\n", " step 340 | loss 0.6817 | acc 57% | margin 0.091 | 1193.6K tok/s | lr 2.00e-03 | VRAM 29630MB (gpu0)\n", " step 350 | loss 0.6014 | acc 64% | margin 0.315 | 1193.5K tok/s | lr 2.00e-03 | VRAM 29630MB (gpu0)\n", " step 360 | loss 0.6462 | acc 69% | margin 0.276 | 1193.6K tok/s | lr 1.99e-03 | VRAM 29630MB (gpu0)\n", " step 370 | loss 0.6162 | acc 61% | margin 0.302 | 1193.7K tok/s | lr 1.99e-03 | VRAM 29630MB (gpu0)\n", " step 380 | loss 0.5975 | acc 66% | margin 0.361 | 1193.4K tok/s | lr 1.99e-03 | VRAM 29630MB (gpu0)\n", " step 390 | loss 0.6786 | acc 61% | margin 0.182 | 1193.2K tok/s | lr 1.98e-03 | VRAM 29630MB (gpu0)\n", " step 400 | loss 0.6489 | acc 65% | margin 0.216 | 1138.6K tok/s | lr 1.98e-03 | VRAM 29630MB (gpu0)\n", " step 410 | loss 0.6096 | acc 61% | margin 0.290 | 1193.0K tok/s | lr 1.98e-03 | VRAM 29630MB (gpu0)\n", " step 420 | loss 0.6402 | acc 64% | margin 0.262 | 1193.0K tok/s | lr 1.97e-03 | VRAM 29630MB (gpu0)\n", " step 430 | loss 0.5755 | acc 65% | margin 0.407 | 1193.3K tok/s | lr 1.97e-03 | VRAM 29630MB (gpu0)\n", " step 440 | loss 0.6318 | acc 61% | margin 0.688 | 1192.9K tok/s | lr 1.97e-03 | VRAM 29630MB (gpu0)\n", " step 450 | loss 0.6044 | acc 64% | margin 0.328 | 1193.0K tok/s | lr 1.96e-03 | VRAM 29630MB (gpu0)\n", " step 460 | loss 0.6730 | acc 64% | margin 0.141 | 1193.2K tok/s | lr 1.96e-03 | VRAM 29630MB (gpu0)\n", " step 470 | loss 0.6581 | acc 68% | margin 0.201 | 1193.0K tok/s | lr 1.96e-03 | VRAM 29630MB (gpu0)\n", " step 480 | loss 0.6364 | acc 59% | margin 0.245 | 1192.8K tok/s | lr 1.95e-03 | VRAM 29630MB (gpu0)\n", " step 490 | loss 0.5706 | acc 76% | margin 0.422 | 1192.9K tok/s | lr 1.95e-03 | VRAM 29630MB (gpu0)\n", " step 500 | loss 0.7333 | acc 66% | margin 0.161 | 1138.7K tok/s | lr 1.94e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.6414 | acc 63% | margin 0.293 ★ best\n", " step 510 | loss 0.6634 | acc 61% | margin 0.182 | 426.3K tok/s | lr 1.94e-03 | VRAM 29630MB (gpu0)\n", " step 520 | loss 0.5643 | acc 79% | margin 0.365 | 1192.5K tok/s | lr 1.94e-03 | VRAM 29630MB (gpu0)\n", " step 530 | loss 0.5978 | acc 68% | margin 0.393 | 1192.5K tok/s | lr 1.93e-03 | VRAM 29630MB (gpu0)\n", " step 540 | loss 0.5982 | acc 62% | margin 0.419 | 1192.6K tok/s | lr 1.93e-03 | VRAM 29630MB (gpu0)\n", " step 550 | loss 0.6056 | acc 61% | margin 0.322 | 1192.8K tok/s | lr 1.93e-03 | VRAM 29630MB (gpu0)\n", " step 560 | loss 0.7217 | acc 64% | margin 0.211 | 1192.8K tok/s | lr 1.92e-03 | VRAM 29630MB (gpu0)\n", " step 570 | loss 0.7007 | acc 59% | margin 0.158 | 1192.7K tok/s | lr 1.92e-03 | VRAM 29630MB (gpu0)\n", " step 580 | loss 0.6196 | acc 60% | margin 0.267 | 1192.9K tok/s | lr 1.92e-03 | VRAM 29630MB (gpu0)\n", " step 590 | loss 0.6030 | acc 66% | margin 0.360 | 1192.8K tok/s | lr 1.91e-03 | VRAM 29630MB (gpu0)\n", " step 600 | loss 0.5939 | acc 72% | margin 0.359 | 1139.3K tok/s | lr 1.91e-03 | VRAM 29630MB (gpu0)\n", " step 610 | loss 0.6327 | acc 64% | margin 0.323 | 1193.3K tok/s | lr 1.91e-03 | VRAM 29630MB (gpu0)\n", " step 620 | loss 0.6304 | acc 70% | margin 0.323 | 1192.9K tok/s | lr 1.90e-03 | VRAM 29630MB (gpu0)\n", " step 630 | loss 0.6169 | acc 68% | margin 0.429 | 1192.7K tok/s | lr 1.90e-03 | VRAM 29630MB (gpu0)\n", " step 640 | loss 0.6549 | acc 62% | margin 0.217 | 1192.9K tok/s | lr 1.90e-03 | VRAM 29630MB (gpu0)\n", " step 650 | loss 0.5830 | acc 70% | margin 0.377 | 1192.8K tok/s | lr 1.89e-03 | VRAM 29630MB (gpu0)\n", " step 660 | loss 0.6532 | acc 61% | margin 0.251 | 1193.0K tok/s | lr 1.89e-03 | VRAM 29630MB (gpu0)\n", " step 670 | loss 0.5622 | acc 66% | margin 0.493 | 1192.9K tok/s | lr 1.89e-03 | VRAM 29630MB (gpu0)\n", " step 680 | loss 0.5929 | acc 65% | margin 0.452 | 1193.5K tok/s | lr 1.88e-03 | VRAM 29630MB (gpu0)\n", " step 690 | loss 0.5188 | acc 71% | margin 0.576 | 1193.3K tok/s | lr 1.88e-03 | VRAM 29630MB (gpu0)\n", " step 700 | loss 0.5847 | acc 72% | margin 0.362 | 1139.7K tok/s | lr 1.88e-03 | VRAM 29630MB (gpu0)\n", " step 710 | loss 0.5856 | acc 71% | margin 0.489 | 1194.2K tok/s | lr 1.87e-03 | VRAM 29630MB (gpu0)\n", " step 720 | loss 0.6226 | acc 65% | margin 0.369 | 1193.4K tok/s | lr 1.87e-03 | VRAM 29630MB (gpu0)\n", " step 730 | loss 0.6092 | acc 65% | margin 0.370 | 1193.5K tok/s | lr 1.87e-03 | VRAM 29630MB (gpu0)\n", " step 740 | loss 0.5656 | acc 70% | margin 0.467 | 1193.3K tok/s | lr 1.86e-03 | VRAM 29630MB (gpu0)\n", " step 750 | loss 0.6042 | acc 74% | margin 0.406 | 1193.4K tok/s | lr 1.86e-03 | VRAM 29630MB (gpu0)\n", " step 760 | loss 0.6493 | acc 62% | margin 0.215 | 1193.7K tok/s | lr 1.85e-03 | VRAM 29630MB (gpu0)\n", " step 770 | loss 0.7689 | acc 65% | margin 0.137 | 1193.2K tok/s | lr 1.85e-03 | VRAM 29630MB (gpu0)\n", " step 780 | loss 0.7754 | acc 62% | margin 0.044 | 1193.2K tok/s | lr 1.85e-03 | VRAM 29630MB (gpu0)\n", " step 790 | loss 0.5863 | acc 70% | margin 0.454 | 1192.7K tok/s | lr 1.84e-03 | VRAM 29630MB (gpu0)\n", " step 800 | loss 0.5349 | acc 75% | margin 0.601 | 1139.4K tok/s | lr 1.84e-03 | VRAM 29630MB (gpu0)\n", " step 810 | loss 0.5695 | acc 70% | margin 0.463 | 1192.8K tok/s | lr 1.84e-03 | VRAM 29630MB (gpu0)\n", " step 820 | loss 0.5794 | acc 72% | margin 0.479 | 1192.8K tok/s | lr 1.83e-03 | VRAM 29630MB (gpu0)\n", " step 830 | loss 0.5582 | acc 74% | margin 0.443 | 1192.9K tok/s | lr 1.83e-03 | VRAM 29630MB (gpu0)\n", " step 840 | loss 0.6890 | acc 64% | margin 0.243 | 1192.9K tok/s | lr 1.83e-03 | VRAM 29630MB (gpu0)\n", " step 850 | loss 0.6047 | acc 65% | margin 0.316 | 1193.2K tok/s | lr 1.82e-03 | VRAM 29630MB (gpu0)\n", " step 860 | loss 0.5745 | acc 72% | margin 0.525 | 1193.1K tok/s | lr 1.82e-03 | VRAM 29630MB (gpu0)\n", " step 870 | loss 0.5568 | acc 71% | margin 0.542 | 1193.0K tok/s | lr 1.82e-03 | VRAM 29630MB (gpu0)\n", " step 880 | loss 0.5795 | acc 71% | margin 0.400 | 1193.2K tok/s | lr 1.81e-03 | VRAM 29630MB (gpu0)\n", " step 890 | loss 0.5488 | acc 70% | margin 0.515 | 1193.2K tok/s | lr 1.81e-03 | VRAM 29630MB (gpu0)\n", " step 900 | loss 0.6244 | acc 68% | margin 0.499 | 1139.6K tok/s | lr 1.81e-03 | VRAM 29630MB (gpu0)\n", " step 910 | loss 0.5652 | acc 74% | margin 0.536 | 1193.4K tok/s | lr 1.80e-03 | VRAM 29630MB (gpu0)\n", " step 920 | loss 0.5926 | acc 66% | margin 0.441 | 1193.5K tok/s | lr 1.80e-03 | VRAM 29630MB (gpu0)\n", " step 930 | loss 0.6785 | acc 68% | margin 0.287 | 1193.6K tok/s | lr 1.80e-03 | VRAM 29630MB (gpu0)\n", " step 940 | loss 0.6113 | acc 65% | margin 0.460 | 1193.1K tok/s | lr 1.79e-03 | VRAM 29630MB (gpu0)\n", " step 950 | loss 0.6062 | acc 68% | margin 0.538 | 1193.4K tok/s | lr 1.79e-03 | VRAM 29630MB (gpu0)\n", " step 960 | loss 0.5746 | acc 75% | margin 0.482 | 1193.4K tok/s | lr 1.79e-03 | VRAM 29630MB (gpu0)\n", " step 970 | loss 0.5668 | acc 72% | margin 0.489 | 1193.5K tok/s | lr 1.78e-03 | VRAM 29630MB (gpu0)\n", " step 980 | loss 0.5837 | acc 69% | margin 0.523 | 1193.5K tok/s | lr 1.78e-03 | VRAM 29630MB (gpu0)\n", " step 990 | loss 0.8042 | acc 68% | margin 0.252 | 1193.8K tok/s | lr 1.78e-03 | VRAM 29630MB (gpu0)\n", " step 1000 | loss 0.5883 | acc 70% | margin 0.524 | 1140.1K tok/s | lr 1.77e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5860 | acc 68% | margin 0.521 ★ best\n", " step 1010 | loss 0.5660 | acc 69% | margin 0.434 | 430.8K tok/s | lr 1.77e-03 | VRAM 29630MB (gpu0)\n", " step 1020 | loss 0.6351 | acc 64% | margin 0.547 | 1192.9K tok/s | lr 1.76e-03 | VRAM 29630MB (gpu0)\n", " step 1030 | loss 0.5541 | acc 70% | margin 0.485 | 1192.6K tok/s | lr 1.76e-03 | VRAM 29630MB (gpu0)\n", " step 1040 | loss 0.5930 | acc 70% | margin 0.420 | 1192.6K tok/s | lr 1.76e-03 | VRAM 29630MB (gpu0)\n", " step 1050 | loss 0.5818 | acc 75% | margin 0.465 | 1192.5K tok/s | lr 1.75e-03 | VRAM 29630MB (gpu0)\n", " step 1060 | loss 0.5577 | acc 70% | margin 0.450 | 1192.5K tok/s | lr 1.75e-03 | VRAM 29630MB (gpu0)\n", " step 1070 | loss 0.5571 | acc 61% | margin 0.565 | 1192.5K tok/s | lr 1.75e-03 | VRAM 29630MB (gpu0)\n", " step 1080 | loss 0.6485 | acc 65% | margin 0.246 | 1192.6K tok/s | lr 1.74e-03 | VRAM 29630MB (gpu0)\n", " step 1090 | loss 0.5057 | acc 78% | margin 0.763 | 1192.2K tok/s | lr 1.74e-03 | VRAM 29630MB (gpu0)\n", " step 1100 | loss 0.4928 | acc 79% | margin 0.730 | 1139.1K tok/s | lr 1.74e-03 | VRAM 29630MB (gpu0)\n", " step 1110 | loss 0.5962 | acc 74% | margin 0.458 | 1193.0K tok/s | lr 1.73e-03 | VRAM 29630MB (gpu0)\n", " step 1120 | loss 0.8588 | acc 65% | margin 0.103 | 1192.9K tok/s | lr 1.73e-03 | VRAM 29630MB (gpu0)\n", " step 1130 | loss 0.5176 | acc 81% | margin 0.609 | 1192.8K tok/s | lr 1.73e-03 | VRAM 29630MB (gpu0)\n", " step 1140 | loss 0.5662 | acc 75% | margin 0.484 | 1192.7K tok/s | lr 1.72e-03 | VRAM 29630MB (gpu0)\n", " step 1150 | loss 0.4934 | acc 79% | margin 0.709 | 1192.9K tok/s | lr 1.72e-03 | VRAM 29630MB (gpu0)\n", " step 1160 | loss 0.5785 | acc 69% | margin 0.472 | 1193.2K tok/s | lr 1.72e-03 | VRAM 29630MB (gpu0)\n", " step 1170 | loss 0.6097 | acc 76% | margin 0.459 | 1193.2K tok/s | lr 1.71e-03 | VRAM 29630MB (gpu0)\n", " step 1180 | loss 0.6352 | acc 62% | margin 0.278 | 1193.2K tok/s | lr 1.71e-03 | VRAM 29630MB (gpu0)\n", " step 1190 | loss 0.5893 | acc 61% | margin 0.492 | 1193.7K tok/s | lr 1.71e-03 | VRAM 29630MB (gpu0)\n", " step 1200 | loss 0.5712 | acc 66% | margin 0.618 | 1140.3K tok/s | lr 1.70e-03 | VRAM 29630MB (gpu0)\n", " step 1210 | loss 0.6118 | acc 65% | margin 0.497 | 1193.8K tok/s | lr 1.70e-03 | VRAM 29630MB (gpu0)\n", " step 1220 | loss 0.6089 | acc 69% | margin 0.339 | 1193.7K tok/s | lr 1.70e-03 | VRAM 29630MB (gpu0)\n", " step 1230 | loss 0.5619 | acc 68% | margin 0.652 | 1193.2K tok/s | lr 1.69e-03 | VRAM 29630MB (gpu0)\n", " step 1240 | loss 0.6067 | acc 70% | margin 0.435 | 1193.4K tok/s | lr 1.69e-03 | VRAM 29630MB (gpu0)\n", " step 1250 | loss 0.5774 | acc 69% | margin 0.538 | 1193.3K tok/s | lr 1.69e-03 | VRAM 29630MB (gpu0)\n", " step 1260 | loss 0.6998 | acc 75% | margin 0.376 | 1193.2K tok/s | lr 1.68e-03 | VRAM 29630MB (gpu0)\n", " step 1270 | loss 0.5191 | acc 71% | margin 0.835 | 1193.5K tok/s | lr 1.68e-03 | VRAM 29630MB (gpu0)\n", " step 1280 | loss 0.5750 | acc 62% | margin 0.489 | 1193.0K tok/s | lr 1.67e-03 | VRAM 29630MB (gpu0)\n", " step 1290 | loss 0.5486 | acc 72% | margin 0.655 | 1192.7K tok/s | lr 1.67e-03 | VRAM 29630MB (gpu0)\n", " step 1300 | loss 0.5278 | acc 75% | margin 0.695 | 1139.9K tok/s | lr 1.67e-03 | VRAM 29630MB (gpu0)\n", " step 1310 | loss 0.5538 | acc 72% | margin 0.573 | 1192.8K tok/s | lr 1.66e-03 | VRAM 29630MB (gpu0)\n", " step 1320 | loss 0.5462 | acc 75% | margin 0.506 | 1193.0K tok/s | lr 1.66e-03 | VRAM 29630MB (gpu0)\n", " step 1330 | loss 0.5572 | acc 75% | margin 0.440 | 1192.9K tok/s | lr 1.66e-03 | VRAM 29630MB (gpu0)\n", " step 1340 | loss 0.7779 | acc 71% | margin 0.422 | 1192.8K tok/s | lr 1.65e-03 | VRAM 29630MB (gpu0)\n", " step 1350 | loss 0.5467 | acc 78% | margin 0.547 | 1193.0K tok/s | lr 1.65e-03 | VRAM 29630MB (gpu0)\n", " step 1360 | loss 0.5084 | acc 80% | margin 0.773 | 1192.8K tok/s | lr 1.65e-03 | VRAM 29630MB (gpu0)\n", " step 1370 | loss 0.6039 | acc 66% | margin 0.701 | 1192.7K tok/s | lr 1.64e-03 | VRAM 29630MB (gpu0)\n", " step 1380 | loss 0.5091 | acc 76% | margin 0.837 | 1192.7K tok/s | lr 1.64e-03 | VRAM 29630MB (gpu0)\n", " step 1390 | loss 0.5107 | acc 76% | margin 0.805 | 1193.0K tok/s | lr 1.64e-03 | VRAM 29630MB (gpu0)\n", " step 1400 | loss 0.5094 | acc 80% | margin 0.727 | 1139.6K tok/s | lr 1.63e-03 | VRAM 29630MB (gpu0)\n", " step 1410 | loss 0.4928 | acc 81% | margin 0.670 | 1192.9K tok/s | lr 1.63e-03 | VRAM 29630MB (gpu0)\n", " step 1420 | loss 0.5114 | acc 71% | margin 0.761 | 1193.0K tok/s | lr 1.63e-03 | VRAM 29630MB (gpu0)\n", " step 1430 | loss 0.5287 | acc 69% | margin 0.842 | 1192.8K tok/s | lr 1.62e-03 | VRAM 29630MB (gpu0)\n", " step 1440 | loss 0.5331 | acc 70% | margin 0.625 | 1193.0K tok/s | lr 1.62e-03 | VRAM 29630MB (gpu0)\n", " step 1450 | loss 0.4882 | acc 78% | margin 0.774 | 1193.1K tok/s | lr 1.62e-03 | VRAM 29630MB (gpu0)\n", " step 1460 | loss 0.5134 | acc 76% | margin 0.678 | 1193.2K tok/s | lr 1.61e-03 | VRAM 29630MB (gpu0)\n", " step 1470 | loss 0.4891 | acc 75% | margin 0.933 | 1193.3K tok/s | lr 1.61e-03 | VRAM 29630MB (gpu0)\n", " step 1480 | loss 0.5034 | acc 81% | margin 0.619 | 1193.2K tok/s | lr 1.61e-03 | VRAM 29630MB (gpu0)\n", " step 1490 | loss 0.5492 | acc 72% | margin 0.627 | 1193.8K tok/s | lr 1.60e-03 | VRAM 29630MB (gpu0)\n", " step 1500 | loss 0.5277 | acc 72% | margin 0.631 | 1140.3K tok/s | lr 1.60e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5500 | acc 72% | margin 0.652 ★ best\n", " step 1510 | loss 0.4858 | acc 80% | margin 0.725 | 432.2K tok/s | lr 1.60e-03 | VRAM 29630MB (gpu0)\n", " step 1520 | loss 0.5779 | acc 78% | margin 0.434 | 1194.0K tok/s | lr 1.59e-03 | VRAM 29630MB (gpu0)\n", " step 1530 | loss 0.5213 | acc 70% | margin 0.741 | 1193.4K tok/s | lr 1.59e-03 | VRAM 29630MB (gpu0)\n", " step 1540 | loss 0.5882 | acc 68% | margin 0.417 | 1193.3K tok/s | lr 1.59e-03 | VRAM 29630MB (gpu0)\n", " step 1550 | loss 0.4977 | acc 78% | margin 0.744 | 1193.3K tok/s | lr 1.58e-03 | VRAM 29630MB (gpu0)\n", " step 1560 | loss 0.4575 | acc 80% | margin 0.930 | 1193.1K tok/s | lr 1.58e-03 | VRAM 29630MB (gpu0)\n", " step 1570 | loss 0.5640 | acc 74% | margin 0.792 | 1193.2K tok/s | lr 1.57e-03 | VRAM 29630MB (gpu0)\n", " step 1580 | loss 0.5317 | acc 76% | margin 0.601 | 1193.2K tok/s | lr 1.57e-03 | VRAM 29630MB (gpu0)\n", " step 1590 | loss 0.5858 | acc 74% | margin 0.392 | 1193.3K tok/s | lr 1.57e-03 | VRAM 29630MB (gpu0)\n", " step 1600 | loss 0.5489 | acc 68% | margin 0.586 | 1140.1K tok/s | lr 1.56e-03 | VRAM 29630MB (gpu0)\n", " step 1610 | loss 0.5569 | acc 80% | margin 0.426 | 1193.2K tok/s | lr 1.56e-03 | VRAM 29630MB (gpu0)\n", " step 1620 | loss 0.5946 | acc 71% | margin 0.594 | 1192.9K tok/s | lr 1.56e-03 | VRAM 29630MB (gpu0)\n", " step 1630 | loss 0.6099 | acc 59% | margin 0.386 | 1193.1K tok/s | lr 1.55e-03 | VRAM 29630MB (gpu0)\n", " step 1640 | loss 0.5507 | acc 75% | margin 0.590 | 1192.9K tok/s | lr 1.55e-03 | VRAM 29630MB (gpu0)\n", " step 1650 | loss 0.4821 | acc 76% | margin 0.741 | 1192.8K tok/s | lr 1.55e-03 | VRAM 29630MB (gpu0)\n", " step 1660 | loss 0.5139 | acc 74% | margin 0.776 | 1192.9K tok/s | lr 1.54e-03 | VRAM 29630MB (gpu0)\n", " step 1670 | loss 0.4907 | acc 75% | margin 0.635 | 1192.7K tok/s | lr 1.54e-03 | VRAM 29630MB (gpu0)\n", " step 1680 | loss 0.4946 | acc 76% | margin 0.741 | 1192.7K tok/s | lr 1.54e-03 | VRAM 29630MB (gpu0)\n", " step 1690 | loss 0.5251 | acc 79% | margin 0.781 | 1193.0K tok/s | lr 1.53e-03 | VRAM 29630MB (gpu0)\n", " step 1700 | loss 0.5902 | acc 78% | margin 0.713 | 1139.5K tok/s | lr 1.53e-03 | VRAM 29630MB (gpu0)\n", " step 1710 | loss 0.5008 | acc 75% | margin 0.752 | 1193.2K tok/s | lr 1.53e-03 | VRAM 29630MB (gpu0)\n", " step 1720 | loss 0.5076 | acc 82% | margin 0.810 | 1192.9K tok/s | lr 1.52e-03 | VRAM 29630MB (gpu0)\n", " step 1730 | loss 0.4836 | acc 76% | margin 0.731 | 1193.4K tok/s | lr 1.52e-03 | VRAM 29630MB (gpu0)\n", " step 1740 | loss 0.5029 | acc 75% | margin 0.683 | 1193.5K tok/s | lr 1.52e-03 | VRAM 29630MB (gpu0)\n", " step 1750 | loss 0.4812 | acc 88% | margin 0.811 | 1193.2K tok/s | lr 1.51e-03 | VRAM 29630MB (gpu0)\n", " step 1760 | loss 0.5523 | acc 74% | margin 0.657 | 1193.4K tok/s | lr 1.51e-03 | VRAM 29630MB (gpu0)\n", " step 1770 | loss 0.4612 | acc 86% | margin 0.746 | 1193.3K tok/s | lr 1.51e-03 | VRAM 29630MB (gpu0)\n", " step 1780 | loss 0.5365 | acc 76% | margin 0.605 | 1193.7K tok/s | lr 1.50e-03 | VRAM 29630MB (gpu0)\n", " step 1790 | loss 0.4865 | acc 80% | margin 0.754 | 1193.1K tok/s | lr 1.50e-03 | VRAM 29630MB (gpu0)\n", " step 1800 | loss 0.4310 | acc 80% | margin 0.968 | 1139.8K tok/s | lr 1.50e-03 | VRAM 29630MB (gpu0)\n", " step 1810 | loss 0.5407 | acc 74% | margin 0.615 | 1192.9K tok/s | lr 1.49e-03 | VRAM 29630MB (gpu0)\n", " step 1820 | loss 0.5302 | acc 71% | margin 0.663 | 1192.7K tok/s | lr 1.49e-03 | VRAM 29630MB (gpu0)\n", " step 1830 | loss 0.4828 | acc 82% | margin 0.693 | 1192.6K tok/s | lr 1.48e-03 | VRAM 29630MB (gpu0)\n", " step 1840 | loss 0.5471 | acc 72% | margin 0.697 | 1193.1K tok/s | lr 1.48e-03 | VRAM 29630MB (gpu0)\n", " step 1850 | loss 0.5753 | acc 69% | margin 0.574 | 1192.8K tok/s | lr 1.48e-03 | VRAM 29630MB (gpu0)\n", " step 1860 | loss 0.5077 | acc 81% | margin 0.733 | 1192.8K tok/s | lr 1.47e-03 | VRAM 29630MB (gpu0)\n", " step 1870 | loss 0.5501 | acc 75% | margin 0.708 | 1192.9K tok/s | lr 1.47e-03 | VRAM 29630MB (gpu0)\n", " step 1880 | loss 0.4786 | acc 78% | margin 1.032 | 1193.1K tok/s | lr 1.47e-03 | VRAM 29630MB (gpu0)\n", " step 1890 | loss 0.5530 | acc 69% | margin 0.591 | 1193.2K tok/s | lr 1.46e-03 | VRAM 29630MB (gpu0)\n", " step 1900 | loss 0.4874 | acc 79% | margin 0.823 | 1139.8K tok/s | lr 1.46e-03 | VRAM 29630MB (gpu0)\n", " step 1910 | loss 0.4874 | acc 79% | margin 0.941 | 1193.1K tok/s | lr 1.46e-03 | VRAM 29630MB (gpu0)\n", " step 1920 | loss 0.4268 | acc 85% | margin 1.245 | 1193.0K tok/s | lr 1.45e-03 | VRAM 29630MB (gpu0)\n", " step 1930 | loss 0.5209 | acc 76% | margin 0.601 | 1193.0K tok/s | lr 1.45e-03 | VRAM 29630MB (gpu0)\n", " step 1940 | loss 0.5433 | acc 69% | margin 0.531 | 1192.7K tok/s | lr 1.45e-03 | VRAM 29630MB (gpu0)\n", " step 1950 | loss 0.4280 | acc 88% | margin 1.022 | 1192.9K tok/s | lr 1.44e-03 | VRAM 29630MB (gpu0)\n", " step 1960 | loss 0.5442 | acc 71% | margin 0.659 | 1193.5K tok/s | lr 1.44e-03 | VRAM 29630MB (gpu0)\n", " step 1970 | loss 0.4796 | acc 76% | margin 0.944 | 1193.4K tok/s | lr 1.44e-03 | VRAM 29630MB (gpu0)\n", " step 1980 | loss 0.5170 | acc 72% | margin 0.654 | 1193.4K tok/s | lr 1.43e-03 | VRAM 29630MB (gpu0)\n", " step 1990 | loss 0.4858 | acc 79% | margin 0.789 | 1193.6K tok/s | lr 1.43e-03 | VRAM 29630MB (gpu0)\n", " step 2000 | loss 0.5468 | acc 74% | margin 0.551 | 1140.7K tok/s | lr 1.43e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5227 | acc 74% | margin 0.782 ★ best\n", " >>> saved checkpoints/dpo/dpo_step_0002000.pt\n", " step 2010 | loss 0.5300 | acc 72% | margin 0.614 | 428.9K tok/s | lr 1.42e-03 | VRAM 29630MB (gpu0)\n", " step 2020 | loss 0.5681 | acc 72% | margin 0.580 | 1194.2K tok/s | lr 1.42e-03 | VRAM 29630MB (gpu0)\n", " step 2030 | loss 0.4784 | acc 76% | margin 0.900 | 1193.1K tok/s | lr 1.42e-03 | VRAM 29630MB (gpu0)\n", " step 2040 | loss 0.5366 | acc 68% | margin 0.745 | 1193.1K tok/s | lr 1.41e-03 | VRAM 29630MB (gpu0)\n", " step 2050 | loss 0.5731 | acc 69% | margin 0.724 | 1193.0K tok/s | lr 1.41e-03 | VRAM 29630MB (gpu0)\n", " step 2060 | loss 0.4950 | acc 80% | margin 0.669 | 1192.8K tok/s | lr 1.41e-03 | VRAM 29630MB (gpu0)\n", " step 2070 | loss 0.4759 | acc 80% | margin 0.856 | 1192.9K tok/s | lr 1.40e-03 | VRAM 29630MB (gpu0)\n", " step 2080 | loss 0.4480 | acc 80% | margin 0.879 | 1192.6K tok/s | lr 1.40e-03 | VRAM 29630MB (gpu0)\n", " step 2090 | loss 0.4839 | acc 76% | margin 1.117 | 1192.7K tok/s | lr 1.39e-03 | VRAM 29630MB (gpu0)\n", " step 2100 | loss 0.4818 | acc 78% | margin 0.722 | 1140.3K tok/s | lr 1.39e-03 | VRAM 29630MB (gpu0)\n", " step 2110 | loss 0.5295 | acc 75% | margin 0.581 | 1193.4K tok/s | lr 1.39e-03 | VRAM 29630MB (gpu0)\n", " step 2120 | loss 0.5442 | acc 80% | margin 0.604 | 1193.5K tok/s | lr 1.38e-03 | VRAM 29630MB (gpu0)\n", " step 2130 | loss 0.5372 | acc 69% | margin 0.547 | 1193.2K tok/s | lr 1.38e-03 | VRAM 29630MB (gpu0)\n", " step 2140 | loss 0.4607 | acc 81% | margin 1.075 | 1192.9K tok/s | lr 1.38e-03 | VRAM 29630MB (gpu0)\n", " step 2150 | loss 0.5132 | acc 80% | margin 0.763 | 1193.2K tok/s | lr 1.37e-03 | VRAM 29630MB (gpu0)\n", " step 2160 | loss 0.4912 | acc 81% | margin 0.851 | 1193.3K tok/s | lr 1.37e-03 | VRAM 29630MB (gpu0)\n", " step 2170 | loss 0.7600 | acc 80% | margin 0.395 | 1192.8K tok/s | lr 1.37e-03 | VRAM 29630MB (gpu0)\n", " step 2180 | loss 0.5199 | acc 78% | margin 0.663 | 1192.9K tok/s | lr 1.36e-03 | VRAM 29630MB (gpu0)\n", " step 2190 | loss 0.5517 | acc 76% | margin 0.609 | 1193.1K tok/s | lr 1.36e-03 | VRAM 29630MB (gpu0)\n", " step 2200 | loss 0.5933 | acc 70% | margin 0.413 | 1139.5K tok/s | lr 1.36e-03 | VRAM 29630MB (gpu0)\n", " step 2210 | loss 0.5236 | acc 81% | margin 0.610 | 1192.8K tok/s | lr 1.35e-03 | VRAM 29630MB (gpu0)\n", " step 2220 | loss 0.5072 | acc 85% | margin 0.755 | 1192.7K tok/s | lr 1.35e-03 | VRAM 29630MB (gpu0)\n", " step 2230 | loss 0.5694 | acc 84% | margin 0.636 | 1192.7K tok/s | lr 1.35e-03 | VRAM 29630MB (gpu0)\n", " step 2240 | loss 0.5443 | acc 80% | margin 0.759 | 1192.8K tok/s | lr 1.34e-03 | VRAM 29630MB (gpu0)\n", " step 2250 | loss 0.5458 | acc 72% | margin 0.635 | 1193.1K tok/s | lr 1.34e-03 | VRAM 29630MB (gpu0)\n", " step 2260 | loss 0.5380 | acc 72% | margin 0.624 | 1193.3K tok/s | lr 1.34e-03 | VRAM 29630MB (gpu0)\n", " step 2270 | loss 0.5502 | acc 76% | margin 0.565 | 1193.4K tok/s | lr 1.33e-03 | VRAM 29630MB (gpu0)\n", " step 2280 | loss 0.5141 | acc 78% | margin 0.791 | 1193.3K tok/s | lr 1.33e-03 | VRAM 29630MB (gpu0)\n", " step 2290 | loss 0.4797 | acc 84% | margin 0.857 | 1193.3K tok/s | lr 1.33e-03 | VRAM 29630MB (gpu0)\n", " step 2300 | loss 0.4795 | acc 78% | margin 0.829 | 1140.1K tok/s | lr 1.32e-03 | VRAM 29630MB (gpu0)\n", " step 2310 | loss 0.4832 | acc 80% | margin 0.824 | 1193.7K tok/s | lr 1.32e-03 | VRAM 29630MB (gpu0)\n", " step 2320 | loss 0.4965 | acc 80% | margin 1.013 | 1193.3K tok/s | lr 1.32e-03 | VRAM 29630MB (gpu0)\n", " step 2330 | loss 0.4686 | acc 85% | margin 1.166 | 1193.3K tok/s | lr 1.31e-03 | VRAM 29630MB (gpu0)\n", " step 2340 | loss 0.6010 | acc 78% | margin 0.672 | 1193.3K tok/s | lr 1.31e-03 | VRAM 29630MB (gpu0)\n", " step 2350 | loss 0.4874 | acc 78% | margin 0.816 | 1193.1K tok/s | lr 1.31e-03 | VRAM 29630MB (gpu0)\n", " step 2360 | loss 0.5526 | acc 78% | margin 0.720 | 1192.6K tok/s | lr 1.30e-03 | VRAM 29630MB (gpu0)\n", " step 2370 | loss 0.5260 | acc 75% | margin 0.766 | 1193.0K tok/s | lr 1.30e-03 | VRAM 29630MB (gpu0)\n", " step 2380 | loss 0.5399 | acc 80% | margin 0.656 | 1192.8K tok/s | lr 1.29e-03 | VRAM 29630MB (gpu0)\n", " step 2390 | loss 0.4496 | acc 84% | margin 0.927 | 1192.7K tok/s | lr 1.29e-03 | VRAM 29630MB (gpu0)\n", " step 2400 | loss 0.4893 | acc 82% | margin 0.903 | 1139.8K tok/s | lr 1.29e-03 | VRAM 29630MB (gpu0)\n", " step 2410 | loss 0.5453 | acc 71% | margin 0.794 | 1192.7K tok/s | lr 1.28e-03 | VRAM 29630MB (gpu0)\n", " step 2420 | loss 0.4781 | acc 82% | margin 0.974 | 1192.5K tok/s | lr 1.28e-03 | VRAM 29630MB (gpu0)\n", " step 2430 | loss 0.5011 | acc 74% | margin 0.773 | 1192.5K tok/s | lr 1.28e-03 | VRAM 29630MB (gpu0)\n", " step 2440 | loss 0.4959 | acc 75% | margin 0.712 | 1192.9K tok/s | lr 1.27e-03 | VRAM 29630MB (gpu0)\n", " step 2450 | loss 0.5196 | acc 74% | margin 0.660 | 1192.6K tok/s | lr 1.27e-03 | VRAM 29630MB (gpu0)\n", " step 2460 | loss 0.4599 | acc 79% | margin 0.930 | 1192.7K tok/s | lr 1.27e-03 | VRAM 29630MB (gpu0)\n", " step 2470 | loss 0.5189 | acc 76% | margin 0.718 | 1192.8K tok/s | lr 1.26e-03 | VRAM 29630MB (gpu0)\n", " step 2480 | loss 0.5201 | acc 74% | margin 0.711 | 1193.0K tok/s | lr 1.26e-03 | VRAM 29630MB (gpu0)\n", " step 2490 | loss 0.4363 | acc 84% | margin 0.943 | 1192.5K tok/s | lr 1.26e-03 | VRAM 29630MB (gpu0)\n", " step 2500 | loss 0.4588 | acc 84% | margin 0.933 | 1139.6K tok/s | lr 1.25e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5097 | acc 75% | margin 0.834 ★ best\n", " step 2510 | loss 0.4938 | acc 76% | margin 0.859 | 431.5K tok/s | lr 1.25e-03 | VRAM 29630MB (gpu0)\n", " step 2520 | loss 0.6595 | acc 72% | margin 0.473 | 1192.9K tok/s | lr 1.25e-03 | VRAM 29630MB (gpu0)\n", " step 2530 | loss 0.4873 | acc 75% | margin 1.004 | 1193.1K tok/s | lr 1.24e-03 | VRAM 29630MB (gpu0)\n", " step 2540 | loss 0.5205 | acc 76% | margin 0.750 | 1192.5K tok/s | lr 1.24e-03 | VRAM 29630MB (gpu0)\n", " step 2550 | loss 0.5154 | acc 81% | margin 0.676 | 1192.4K tok/s | lr 1.24e-03 | VRAM 29630MB (gpu0)\n", " step 2560 | loss 0.4586 | acc 82% | margin 0.997 | 1192.5K tok/s | lr 1.23e-03 | VRAM 29630MB (gpu0)\n", " step 2570 | loss 0.4590 | acc 79% | margin 1.168 | 1192.3K tok/s | lr 1.23e-03 | VRAM 29630MB (gpu0)\n", " step 2580 | loss 0.4493 | acc 82% | margin 0.888 | 1192.4K tok/s | lr 1.23e-03 | VRAM 29630MB (gpu0)\n", " step 2590 | loss 0.5146 | acc 76% | margin 0.781 | 1192.3K tok/s | lr 1.22e-03 | VRAM 29630MB (gpu0)\n", " step 2600 | loss 0.5012 | acc 76% | margin 0.642 | 1139.4K tok/s | lr 1.22e-03 | VRAM 29630MB (gpu0)\n", " step 2610 | loss 0.5973 | acc 78% | margin 0.811 | 1192.3K tok/s | lr 1.22e-03 | VRAM 29630MB (gpu0)\n", " step 2620 | loss 0.4499 | acc 80% | margin 1.071 | 1192.5K tok/s | lr 1.21e-03 | VRAM 29630MB (gpu0)\n", " step 2630 | loss 0.4946 | acc 76% | margin 0.898 | 1192.6K tok/s | lr 1.21e-03 | VRAM 29630MB (gpu0)\n", " step 2640 | loss 0.5474 | acc 72% | margin 0.743 | 1192.8K tok/s | lr 1.20e-03 | VRAM 29630MB (gpu0)\n", " step 2650 | loss 0.5020 | acc 71% | margin 0.741 | 1192.5K tok/s | lr 1.20e-03 | VRAM 29630MB (gpu0)\n", " step 2660 | loss 0.4681 | acc 81% | margin 0.899 | 1192.6K tok/s | lr 1.20e-03 | VRAM 29630MB (gpu0)\n", " step 2670 | loss 0.4534 | acc 81% | margin 0.938 | 1192.6K tok/s | lr 1.19e-03 | VRAM 29630MB (gpu0)\n", " step 2680 | loss 0.4324 | acc 88% | margin 0.931 | 1192.6K tok/s | lr 1.19e-03 | VRAM 29630MB (gpu0)\n", " step 2690 | loss 0.4818 | acc 80% | margin 0.802 | 1193.1K tok/s | lr 1.19e-03 | VRAM 29630MB (gpu0)\n", " step 2700 | loss 0.4546 | acc 79% | margin 1.120 | 1140.4K tok/s | lr 1.18e-03 | VRAM 29630MB (gpu0)\n", " step 2710 | loss 0.4619 | acc 82% | margin 0.842 | 1192.9K tok/s | lr 1.18e-03 | VRAM 29630MB (gpu0)\n", " step 2720 | loss 0.4919 | acc 71% | margin 0.938 | 1192.8K tok/s | lr 1.18e-03 | VRAM 29630MB (gpu0)\n", " step 2730 | loss 0.4955 | acc 79% | margin 0.998 | 1192.8K tok/s | lr 1.17e-03 | VRAM 29630MB (gpu0)\n", " step 2740 | loss 0.4931 | acc 76% | margin 0.808 | 1192.8K tok/s | lr 1.17e-03 | VRAM 29630MB (gpu0)\n", " step 2750 | loss 0.4618 | acc 79% | margin 0.976 | 1192.7K tok/s | lr 1.17e-03 | VRAM 29630MB (gpu0)\n", " step 2760 | loss 0.4355 | acc 79% | margin 1.148 | 1192.9K tok/s | lr 1.16e-03 | VRAM 29630MB (gpu0)\n", " step 2770 | loss 0.4918 | acc 78% | margin 0.834 | 1193.3K tok/s | lr 1.16e-03 | VRAM 29630MB (gpu0)\n", " step 2780 | loss 0.5002 | acc 79% | margin 0.988 | 1193.5K tok/s | lr 1.16e-03 | VRAM 29630MB (gpu0)\n", " step 2790 | loss 0.4671 | acc 85% | margin 0.875 | 1193.4K tok/s | lr 1.15e-03 | VRAM 29630MB (gpu0)\n", " step 2800 | loss 0.5605 | acc 74% | margin 0.626 | 1140.3K tok/s | lr 1.15e-03 | VRAM 29630MB (gpu0)\n", " step 2810 | loss 0.4521 | acc 80% | margin 0.876 | 1193.4K tok/s | lr 1.15e-03 | VRAM 29630MB (gpu0)\n", " step 2820 | loss 0.5311 | acc 82% | margin 0.768 | 1193.5K tok/s | lr 1.14e-03 | VRAM 29630MB (gpu0)\n", " step 2830 | loss 0.5703 | acc 75% | margin 0.631 | 1193.1K tok/s | lr 1.14e-03 | VRAM 29630MB (gpu0)\n", " step 2840 | loss 0.4923 | acc 79% | margin 0.824 | 1193.2K tok/s | lr 1.14e-03 | VRAM 29630MB (gpu0)\n", " step 2850 | loss 0.4962 | acc 81% | margin 0.703 | 1193.1K tok/s | lr 1.13e-03 | VRAM 29630MB (gpu0)\n", " step 2860 | loss 0.5008 | acc 79% | margin 0.603 | 1193.0K tok/s | lr 1.13e-03 | VRAM 29630MB (gpu0)\n", " step 2870 | loss 0.5355 | acc 69% | margin 0.860 | 1192.6K tok/s | lr 1.13e-03 | VRAM 29630MB (gpu0)\n", " step 2880 | loss 0.5087 | acc 80% | margin 0.726 | 1192.6K tok/s | lr 1.12e-03 | VRAM 29630MB (gpu0)\n", " step 2890 | loss 0.4400 | acc 84% | margin 0.816 | 1192.8K tok/s | lr 1.12e-03 | VRAM 29630MB (gpu0)\n", " step 2900 | loss 0.4823 | acc 82% | margin 0.853 | 1139.5K tok/s | lr 1.11e-03 | VRAM 29630MB (gpu0)\n", " step 2910 | loss 0.5375 | acc 81% | margin 0.775 | 1192.8K tok/s | lr 1.11e-03 | VRAM 29630MB (gpu0)\n", " step 2920 | loss 0.6571 | acc 74% | margin 0.625 | 1192.4K tok/s | lr 1.11e-03 | VRAM 29630MB (gpu0)\n", " step 2930 | loss 0.5121 | acc 85% | margin 1.025 | 1192.4K tok/s | lr 1.10e-03 | VRAM 29630MB (gpu0)\n", " step 2940 | loss 0.5444 | acc 72% | margin 0.609 | 1192.5K tok/s | lr 1.10e-03 | VRAM 29630MB (gpu0)\n", " step 2950 | loss 0.4807 | acc 79% | margin 0.885 | 1193.0K tok/s | lr 1.10e-03 | VRAM 29630MB (gpu0)\n", " step 2960 | loss 0.4668 | acc 80% | margin 0.850 | 1193.0K tok/s | lr 1.09e-03 | VRAM 29630MB (gpu0)\n", " step 2970 | loss 0.4671 | acc 85% | margin 0.865 | 1193.0K tok/s | lr 1.09e-03 | VRAM 29630MB (gpu0)\n", " step 2980 | loss 0.5268 | acc 75% | margin 0.697 | 1193.3K tok/s | lr 1.09e-03 | VRAM 29630MB (gpu0)\n", " step 2990 | loss 0.5383 | acc 76% | margin 0.548 | 1193.1K tok/s | lr 1.08e-03 | VRAM 29630MB (gpu0)\n", " step 3000 | loss 0.4677 | acc 80% | margin 1.047 | 1140.1K tok/s | lr 1.08e-03 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5017 | acc 76% | margin 0.885 ★ best\n", " step 3010 | loss 0.4953 | acc 76% | margin 0.959 | 433.4K tok/s | lr 1.08e-03 | VRAM 29630MB (gpu0)\n", " step 3020 | loss 0.5022 | acc 78% | margin 0.700 | 1192.9K tok/s | lr 1.07e-03 | VRAM 29630MB (gpu0)\n", " step 3030 | loss 0.4798 | acc 78% | margin 0.891 | 1193.1K tok/s | lr 1.07e-03 | VRAM 29630MB (gpu0)\n", " step 3040 | loss 0.4514 | acc 88% | margin 0.924 | 1193.0K tok/s | lr 1.07e-03 | VRAM 29630MB (gpu0)\n", " step 3050 | loss 0.5273 | acc 76% | margin 0.765 | 1193.0K tok/s | lr 1.06e-03 | VRAM 29630MB (gpu0)\n", " step 3060 | loss 0.4222 | acc 84% | margin 1.042 | 1193.0K tok/s | lr 1.06e-03 | VRAM 29630MB (gpu0)\n", " step 3070 | loss 0.3866 | acc 86% | margin 1.219 | 1191.1K tok/s | lr 1.06e-03 | VRAM 29630MB (gpu0)\n", " step 3080 | loss 0.4485 | acc 85% | margin 0.965 | 1192.9K tok/s | lr 1.05e-03 | VRAM 29630MB (gpu0)\n", " step 3090 | loss 0.4552 | acc 74% | margin 1.121 | 1192.9K tok/s | lr 1.05e-03 | VRAM 29630MB (gpu0)\n", " step 3100 | loss 0.5922 | acc 72% | margin 0.610 | 1139.6K tok/s | lr 1.05e-03 | VRAM 29630MB (gpu0)\n", " step 3110 | loss 0.5022 | acc 74% | margin 0.858 | 1192.5K tok/s | lr 1.04e-03 | VRAM 29630MB (gpu0)\n", " step 3120 | loss 0.4851 | acc 78% | margin 0.812 | 1192.7K tok/s | lr 1.04e-03 | VRAM 29630MB (gpu0)\n", " step 3130 | loss 0.5043 | acc 78% | margin 0.654 | 1192.7K tok/s | lr 1.04e-03 | VRAM 29630MB (gpu0)\n", " step 3140 | loss 0.5372 | acc 81% | margin 0.694 | 1192.6K tok/s | lr 1.03e-03 | VRAM 29630MB (gpu0)\n", " step 3150 | loss 0.4966 | acc 80% | margin 0.801 | 1192.8K tok/s | lr 1.03e-03 | VRAM 29630MB (gpu0)\n", " step 3160 | loss 0.5042 | acc 78% | margin 0.745 | 1192.7K tok/s | lr 1.03e-03 | VRAM 29630MB (gpu0)\n", " step 3170 | loss 0.5110 | acc 76% | margin 0.715 | 1192.7K tok/s | lr 1.02e-03 | VRAM 29630MB (gpu0)\n", " step 3180 | loss 0.4653 | acc 80% | margin 0.917 | 1192.3K tok/s | lr 1.02e-03 | VRAM 29630MB (gpu0)\n", " step 3190 | loss 0.4770 | acc 78% | margin 0.857 | 1192.6K tok/s | lr 1.01e-03 | VRAM 29630MB (gpu0)\n", " step 3200 | loss 0.4636 | acc 80% | margin 1.058 | 1139.5K tok/s | lr 1.01e-03 | VRAM 29630MB (gpu0)\n", " step 3210 | loss 0.4917 | acc 76% | margin 0.742 | 1192.7K tok/s | lr 1.01e-03 | VRAM 29630MB (gpu0)\n", " step 3220 | loss 0.5143 | acc 80% | margin 0.656 | 1192.5K tok/s | lr 1.00e-03 | VRAM 29630MB (gpu0)\n", " step 3230 | loss 0.7859 | acc 71% | margin 0.504 | 1192.8K tok/s | lr 1.00e-03 | VRAM 29630MB (gpu0)\n", " step 3240 | loss 0.4540 | acc 84% | margin 0.954 | 1192.9K tok/s | lr 9.97e-04 | VRAM 29630MB (gpu0)\n", " step 3250 | loss 0.4446 | acc 81% | margin 1.002 | 1192.7K tok/s | lr 9.94e-04 | VRAM 29630MB (gpu0)\n", " step 3260 | loss 0.5121 | acc 74% | margin 0.746 | 1192.7K tok/s | lr 9.90e-04 | VRAM 29630MB (gpu0)\n", " step 3270 | loss 0.6483 | acc 71% | margin 0.401 | 1192.6K tok/s | lr 9.87e-04 | VRAM 29630MB (gpu0)\n", " step 3280 | loss 0.6187 | acc 74% | margin 0.516 | 1192.6K tok/s | lr 9.84e-04 | VRAM 29630MB (gpu0)\n", " step 3290 | loss 0.6044 | acc 66% | margin 0.767 | 1192.6K tok/s | lr 9.80e-04 | VRAM 29630MB (gpu0)\n", " step 3300 | loss 0.4885 | acc 80% | margin 0.884 | 1139.9K tok/s | lr 9.77e-04 | VRAM 29630MB (gpu0)\n", " step 3310 | loss 0.4887 | acc 81% | margin 0.885 | 1192.8K tok/s | lr 9.73e-04 | VRAM 29630MB (gpu0)\n", " step 3320 | loss 0.5702 | acc 74% | margin 0.799 | 1193.1K tok/s | lr 9.70e-04 | VRAM 29630MB (gpu0)\n", " step 3330 | loss 0.4394 | acc 74% | margin 0.900 | 1192.8K tok/s | lr 9.66e-04 | VRAM 29630MB (gpu0)\n", " step 3340 | loss 0.4444 | acc 82% | margin 1.023 | 1192.9K tok/s | lr 9.63e-04 | VRAM 29630MB (gpu0)\n", " step 3350 | loss 0.5208 | acc 79% | margin 0.854 | 1193.0K tok/s | lr 9.59e-04 | VRAM 29630MB (gpu0)\n", " step 3360 | loss 0.5218 | acc 71% | margin 0.740 | 1193.1K tok/s | lr 9.56e-04 | VRAM 29630MB (gpu0)\n", " step 3370 | loss 0.5259 | acc 75% | margin 0.703 | 1192.8K tok/s | lr 9.52e-04 | VRAM 29630MB (gpu0)\n", " step 3380 | loss 0.4417 | acc 79% | margin 1.046 | 1192.7K tok/s | lr 9.49e-04 | VRAM 29630MB (gpu0)\n", " step 3390 | loss 0.4875 | acc 80% | margin 0.769 | 1192.2K tok/s | lr 9.46e-04 | VRAM 29630MB (gpu0)\n", " step 3400 | loss 0.5058 | acc 78% | margin 0.957 | 1139.6K tok/s | lr 9.42e-04 | VRAM 29630MB (gpu0)\n", " step 3410 | loss 0.4647 | acc 79% | margin 0.848 | 1192.7K tok/s | lr 9.39e-04 | VRAM 29630MB (gpu0)\n", " step 3420 | loss 0.4779 | acc 81% | margin 0.755 | 1192.4K tok/s | lr 9.35e-04 | VRAM 29630MB (gpu0)\n", " step 3430 | loss 0.5729 | acc 68% | margin 0.590 | 1192.7K tok/s | lr 9.32e-04 | VRAM 29630MB (gpu0)\n", " step 3440 | loss 0.4253 | acc 89% | margin 0.929 | 1192.5K tok/s | lr 9.28e-04 | VRAM 29630MB (gpu0)\n", " step 3450 | loss 0.4630 | acc 78% | margin 1.087 | 1192.5K tok/s | lr 9.25e-04 | VRAM 29630MB (gpu0)\n", " step 3460 | loss 0.4701 | acc 80% | margin 1.016 | 1192.8K tok/s | lr 9.21e-04 | VRAM 29630MB (gpu0)\n", " step 3470 | loss 0.4459 | acc 82% | margin 0.962 | 1192.6K tok/s | lr 9.18e-04 | VRAM 29630MB (gpu0)\n", " step 3480 | loss 0.4885 | acc 76% | margin 1.124 | 1192.2K tok/s | lr 9.14e-04 | VRAM 29630MB (gpu0)\n", " step 3490 | loss 0.4503 | acc 81% | margin 0.842 | 1192.7K tok/s | lr 9.11e-04 | VRAM 29630MB (gpu0)\n", " step 3500 | loss 0.4642 | acc 86% | margin 0.889 | 1139.3K tok/s | lr 9.07e-04 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.4996 | acc 75% | margin 0.917 ★ best\n", " step 3510 | loss 0.4676 | acc 79% | margin 0.938 | 436.1K tok/s | lr 9.04e-04 | VRAM 29630MB (gpu0)\n", " step 3520 | loss 0.5312 | acc 75% | margin 0.862 | 1193.1K tok/s | lr 9.01e-04 | VRAM 29630MB (gpu0)\n", " step 3530 | loss 0.4724 | acc 80% | margin 0.875 | 1193.4K tok/s | lr 8.97e-04 | VRAM 29630MB (gpu0)\n", " step 3540 | loss 0.4443 | acc 80% | margin 0.988 | 1193.0K tok/s | lr 8.94e-04 | VRAM 29630MB (gpu0)\n", " step 3550 | loss 0.4987 | acc 75% | margin 0.986 | 1193.1K tok/s | lr 8.90e-04 | VRAM 29630MB (gpu0)\n", " step 3560 | loss 0.4826 | acc 71% | margin 0.840 | 1193.0K tok/s | lr 8.87e-04 | VRAM 29630MB (gpu0)\n", " step 3570 | loss 0.5551 | acc 75% | margin 0.712 | 1192.8K tok/s | lr 8.83e-04 | VRAM 29630MB (gpu0)\n", " step 3580 | loss 0.4227 | acc 84% | margin 1.064 | 1193.2K tok/s | lr 8.80e-04 | VRAM 29630MB (gpu0)\n", " step 3590 | loss 0.5189 | acc 75% | margin 0.691 | 1192.9K tok/s | lr 8.76e-04 | VRAM 29630MB (gpu0)\n", " step 3600 | loss 0.4459 | acc 82% | margin 1.032 | 1139.7K tok/s | lr 8.73e-04 | VRAM 29630MB (gpu0)\n", " step 3610 | loss 0.5221 | acc 72% | margin 0.750 | 1193.2K tok/s | lr 8.69e-04 | VRAM 29630MB (gpu0)\n", " step 3620 | loss 0.4949 | acc 80% | margin 0.700 | 1193.3K tok/s | lr 8.66e-04 | VRAM 29630MB (gpu0)\n", " step 3630 | loss 0.4721 | acc 79% | margin 0.806 | 1193.2K tok/s | lr 8.63e-04 | VRAM 29630MB (gpu0)\n", " step 3640 | loss 0.4972 | acc 78% | margin 0.751 | 1193.1K tok/s | lr 8.59e-04 | VRAM 29630MB (gpu0)\n", " step 3650 | loss 0.5091 | acc 75% | margin 0.713 | 1193.0K tok/s | lr 8.56e-04 | VRAM 29630MB (gpu0)\n", " step 3660 | loss 0.4687 | acc 80% | margin 1.008 | 1193.2K tok/s | lr 8.52e-04 | VRAM 29630MB (gpu0)\n", " step 3670 | loss 0.4586 | acc 81% | margin 0.960 | 1192.8K tok/s | lr 8.49e-04 | VRAM 29630MB (gpu0)\n", " step 3680 | loss 0.4261 | acc 85% | margin 1.138 | 1192.8K tok/s | lr 8.45e-04 | VRAM 29630MB (gpu0)\n", " step 3690 | loss 0.4706 | acc 81% | margin 0.910 | 1191.7K tok/s | lr 8.42e-04 | VRAM 29630MB (gpu0)\n", " step 3700 | loss 0.4796 | acc 76% | margin 0.947 | 1139.8K tok/s | lr 8.38e-04 | VRAM 29630MB (gpu0)\n", " step 3710 | loss 0.4163 | acc 86% | margin 1.072 | 1192.7K tok/s | lr 8.35e-04 | VRAM 29630MB (gpu0)\n", " step 3720 | loss 0.4746 | acc 79% | margin 0.938 | 1193.0K tok/s | lr 8.31e-04 | VRAM 29630MB (gpu0)\n", " step 3730 | loss 0.5336 | acc 82% | margin 0.734 | 1193.0K tok/s | lr 8.28e-04 | VRAM 29630MB (gpu0)\n", " step 3740 | loss 0.5197 | acc 76% | margin 0.903 | 1193.0K tok/s | lr 8.25e-04 | VRAM 29630MB (gpu0)\n", " step 3750 | loss 0.4759 | acc 80% | margin 0.844 | 1192.8K tok/s | lr 8.21e-04 | VRAM 29630MB (gpu0)\n", " step 3760 | loss 0.4500 | acc 81% | margin 0.875 | 1192.9K tok/s | lr 8.18e-04 | VRAM 29630MB (gpu0)\n", " step 3770 | loss 0.4928 | acc 78% | margin 0.885 | 1192.7K tok/s | lr 8.14e-04 | VRAM 29630MB (gpu0)\n", " step 3780 | loss 0.4547 | acc 88% | margin 1.181 | 1192.5K tok/s | lr 8.11e-04 | VRAM 29630MB (gpu0)\n", " step 3790 | loss 0.4741 | acc 82% | margin 0.889 | 1193.2K tok/s | lr 8.07e-04 | VRAM 29630MB (gpu0)\n", " step 3800 | loss 0.4974 | acc 82% | margin 0.761 | 1140.1K tok/s | lr 8.04e-04 | VRAM 29630MB (gpu0)\n", " step 3810 | loss 0.4655 | acc 76% | margin 0.929 | 1193.2K tok/s | lr 8.00e-04 | VRAM 29630MB (gpu0)\n", " step 3820 | loss 0.6048 | acc 71% | margin 0.603 | 1193.3K tok/s | lr 7.97e-04 | VRAM 29630MB (gpu0)\n", " step 3830 | loss 0.4800 | acc 81% | margin 0.866 | 1193.3K tok/s | lr 7.93e-04 | VRAM 29630MB (gpu0)\n", " step 3840 | loss 0.4517 | acc 81% | margin 1.035 | 1193.3K tok/s | lr 7.90e-04 | VRAM 29630MB (gpu0)\n", " step 3850 | loss 0.5122 | acc 70% | margin 0.833 | 1193.4K tok/s | lr 7.86e-04 | VRAM 29630MB (gpu0)\n", " step 3860 | loss 0.4792 | acc 84% | margin 1.017 | 1193.3K tok/s | lr 7.83e-04 | VRAM 29630MB (gpu0)\n", " step 3870 | loss 0.5088 | acc 79% | margin 0.825 | 1193.3K tok/s | lr 7.80e-04 | VRAM 29630MB (gpu0)\n", " step 3880 | loss 0.5579 | acc 72% | margin 0.792 | 1193.0K tok/s | lr 7.76e-04 | VRAM 29630MB (gpu0)\n", " step 3890 | loss 0.5106 | acc 81% | margin 0.701 | 1193.2K tok/s | lr 7.73e-04 | VRAM 29630MB (gpu0)\n", " step 3900 | loss 0.5178 | acc 78% | margin 0.998 | 1139.7K tok/s | lr 7.69e-04 | VRAM 29630MB (gpu0)\n", " step 3910 | loss 0.5020 | acc 78% | margin 0.836 | 1192.9K tok/s | lr 7.66e-04 | VRAM 29630MB (gpu0)\n", " step 3920 | loss 0.5237 | acc 76% | margin 0.762 | 1192.9K tok/s | lr 7.62e-04 | VRAM 29630MB (gpu0)\n", " step 3930 | loss 0.5010 | acc 82% | margin 0.826 | 1192.5K tok/s | lr 7.59e-04 | VRAM 29630MB (gpu0)\n", " step 3940 | loss 0.4773 | acc 76% | margin 0.940 | 1192.8K tok/s | lr 7.55e-04 | VRAM 29630MB (gpu0)\n", " step 3950 | loss 0.4557 | acc 80% | margin 0.977 | 1192.7K tok/s | lr 7.52e-04 | VRAM 29630MB (gpu0)\n", " step 3960 | loss 0.5922 | acc 64% | margin 0.501 | 1193.0K tok/s | lr 7.48e-04 | VRAM 29630MB (gpu0)\n", " step 3970 | loss 0.4017 | acc 88% | margin 1.428 | 1192.8K tok/s | lr 7.45e-04 | VRAM 29630MB (gpu0)\n", " step 3980 | loss 0.5293 | acc 75% | margin 0.637 | 1192.7K tok/s | lr 7.42e-04 | VRAM 29630MB (gpu0)\n", " step 3990 | loss 0.5892 | acc 76% | margin 0.830 | 1192.4K tok/s | lr 7.38e-04 | VRAM 29630MB (gpu0)\n", " step 4000 | loss 0.4411 | acc 88% | margin 0.915 | 1139.7K tok/s | lr 7.35e-04 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.4988 | acc 74% | margin 0.896 ★ best\n", " >>> saved checkpoints/dpo/dpo_step_0004000.pt\n", " step 4010 | loss 0.4256 | acc 86% | margin 1.102 | 432.7K tok/s | lr 7.31e-04 | VRAM 29630MB (gpu0)\n", " step 4020 | loss 0.5406 | acc 68% | margin 0.783 | 1192.6K tok/s | lr 7.28e-04 | VRAM 29630MB (gpu0)\n", " step 4030 | loss 0.5005 | acc 76% | margin 0.742 | 1192.6K tok/s | lr 7.24e-04 | VRAM 29630MB (gpu0)\n", " step 4040 | loss 0.4575 | acc 79% | margin 0.925 | 1192.7K tok/s | lr 7.21e-04 | VRAM 29630MB (gpu0)\n", " step 4050 | loss 0.5022 | acc 85% | margin 0.855 | 1192.7K tok/s | lr 7.17e-04 | VRAM 29630MB (gpu0)\n", " step 4060 | loss 0.5225 | acc 81% | margin 0.933 | 1192.7K tok/s | lr 7.14e-04 | VRAM 29630MB (gpu0)\n", " step 4070 | loss 0.5923 | acc 75% | margin 0.583 | 1192.6K tok/s | lr 7.10e-04 | VRAM 29630MB (gpu0)\n", " step 4080 | loss 0.4454 | acc 81% | margin 1.406 | 1192.8K tok/s | lr 7.07e-04 | VRAM 29630MB (gpu0)\n", " step 4090 | loss 0.5202 | acc 76% | margin 0.936 | 1192.8K tok/s | lr 7.04e-04 | VRAM 29630MB (gpu0)\n", " step 4100 | loss 0.4382 | acc 85% | margin 0.907 | 1140.0K tok/s | lr 7.00e-04 | VRAM 29630MB (gpu0)\n", " step 4110 | loss 0.4811 | acc 80% | margin 0.922 | 1193.3K tok/s | lr 6.97e-04 | VRAM 29630MB (gpu0)\n", " step 4120 | loss 0.4471 | acc 86% | margin 0.972 | 1193.2K tok/s | lr 6.93e-04 | VRAM 29630MB (gpu0)\n", " step 4130 | loss 0.4756 | acc 75% | margin 1.019 | 1193.4K tok/s | lr 6.90e-04 | VRAM 29630MB (gpu0)\n", " step 4140 | loss 0.4936 | acc 72% | margin 1.142 | 1193.4K tok/s | lr 6.86e-04 | VRAM 29630MB (gpu0)\n", " step 4150 | loss 0.4845 | acc 76% | margin 0.910 | 1193.6K tok/s | lr 6.83e-04 | VRAM 29630MB (gpu0)\n", " step 4160 | loss 0.6253 | acc 62% | margin 0.666 | 1193.4K tok/s | lr 6.79e-04 | VRAM 29630MB (gpu0)\n", " step 4170 | loss 0.4541 | acc 86% | margin 1.133 | 1193.1K tok/s | lr 6.76e-04 | VRAM 29630MB (gpu0)\n", " step 4180 | loss 0.5024 | acc 81% | margin 0.770 | 1193.1K tok/s | lr 6.72e-04 | VRAM 29630MB (gpu0)\n", " step 4190 | loss 0.5429 | acc 74% | margin 0.736 | 1193.3K tok/s | lr 6.69e-04 | VRAM 29630MB (gpu0)\n", " step 4200 | loss 0.4495 | acc 82% | margin 0.916 | 1140.1K tok/s | lr 6.65e-04 | VRAM 29630MB (gpu0)\n", " step 4210 | loss 0.4642 | acc 84% | margin 0.949 | 1193.1K tok/s | lr 6.62e-04 | VRAM 29630MB (gpu0)\n", " step 4220 | loss 0.4427 | acc 80% | margin 0.901 | 1193.0K tok/s | lr 6.59e-04 | VRAM 29630MB (gpu0)\n", " step 4230 | loss 0.4519 | acc 79% | margin 1.012 | 1193.6K tok/s | lr 6.55e-04 | VRAM 29630MB (gpu0)\n", " step 4240 | loss 0.4835 | acc 80% | margin 0.974 | 1192.7K tok/s | lr 6.52e-04 | VRAM 29630MB (gpu0)\n", " step 4250 | loss 0.4575 | acc 81% | margin 0.851 | 1192.5K tok/s | lr 6.48e-04 | VRAM 29630MB (gpu0)\n", " step 4260 | loss 0.4354 | acc 85% | margin 1.062 | 1192.8K tok/s | lr 6.45e-04 | VRAM 29630MB (gpu0)\n", " step 4270 | loss 0.5575 | acc 76% | margin 0.676 | 1192.6K tok/s | lr 6.41e-04 | VRAM 29630MB (gpu0)\n", " step 4280 | loss 0.5541 | acc 81% | margin 0.838 | 1192.7K tok/s | lr 6.38e-04 | VRAM 29630MB (gpu0)\n", " step 4290 | loss 0.4862 | acc 89% | margin 0.910 | 1192.8K tok/s | lr 6.34e-04 | VRAM 29630MB (gpu0)\n", " step 4300 | loss 0.4742 | acc 76% | margin 0.924 | 1139.8K tok/s | lr 6.31e-04 | VRAM 29630MB (gpu0)\n", " step 4310 | loss 0.4634 | acc 80% | margin 1.137 | 1193.0K tok/s | lr 6.27e-04 | VRAM 29630MB (gpu0)\n", " step 4320 | loss 0.4341 | acc 86% | margin 0.956 | 1193.2K tok/s | lr 6.24e-04 | VRAM 29630MB (gpu0)\n", " step 4330 | loss 0.4634 | acc 79% | margin 1.084 | 1192.9K tok/s | lr 6.21e-04 | VRAM 29630MB (gpu0)\n", " step 4340 | loss 0.5020 | acc 80% | margin 0.657 | 1192.8K tok/s | lr 6.17e-04 | VRAM 29630MB (gpu0)\n", " step 4350 | loss 0.4655 | acc 79% | margin 0.895 | 1192.9K tok/s | lr 6.14e-04 | VRAM 29630MB (gpu0)\n", " step 4360 | loss 0.4695 | acc 76% | margin 1.018 | 1192.8K tok/s | lr 6.10e-04 | VRAM 29630MB (gpu0)\n", " step 4370 | loss 0.4303 | acc 79% | margin 1.159 | 1191.6K tok/s | lr 6.07e-04 | VRAM 29630MB (gpu0)\n", " step 4380 | loss 0.5258 | acc 72% | margin 0.870 | 1192.9K tok/s | lr 6.03e-04 | VRAM 29630MB (gpu0)\n", " step 4390 | loss 0.4832 | acc 74% | margin 0.981 | 1193.0K tok/s | lr 6.00e-04 | VRAM 29630MB (gpu0)\n", " step 4400 | loss 0.4605 | acc 75% | margin 1.032 | 1140.0K tok/s | lr 5.96e-04 | VRAM 29630MB (gpu0)\n", " step 4410 | loss 0.4946 | acc 74% | margin 1.194 | 1192.9K tok/s | lr 5.93e-04 | VRAM 29630MB (gpu0)\n", " step 4420 | loss 0.5221 | acc 75% | margin 0.709 | 1192.9K tok/s | lr 5.89e-04 | VRAM 29630MB (gpu0)\n", " step 4430 | loss 0.5468 | acc 75% | margin 0.732 | 1193.0K tok/s | lr 5.86e-04 | VRAM 29630MB (gpu0)\n", " step 4440 | loss 0.4352 | acc 84% | margin 1.166 | 1192.8K tok/s | lr 5.83e-04 | VRAM 29630MB (gpu0)\n", " step 4450 | loss 0.4267 | acc 82% | margin 1.185 | 1192.8K tok/s | lr 5.79e-04 | VRAM 29630MB (gpu0)\n", " step 4460 | loss 0.4363 | acc 80% | margin 1.197 | 1192.5K tok/s | lr 5.76e-04 | VRAM 29630MB (gpu0)\n", " step 4470 | loss 0.4438 | acc 86% | margin 0.980 | 1192.8K tok/s | lr 5.72e-04 | VRAM 29630MB (gpu0)\n", " step 4480 | loss 0.6370 | acc 72% | margin 0.638 | 1192.4K tok/s | lr 5.69e-04 | VRAM 29630MB (gpu0)\n", " step 4490 | loss 0.4707 | acc 74% | margin 0.913 | 1192.6K tok/s | lr 5.65e-04 | VRAM 29630MB (gpu0)\n", " step 4500 | loss 0.4491 | acc 85% | margin 1.101 | 1139.6K tok/s | lr 5.62e-04 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.4901 | acc 78% | margin 0.919 ★ best\n", " step 4510 | loss 0.5032 | acc 72% | margin 0.869 | 434.8K tok/s | lr 5.58e-04 | VRAM 29630MB (gpu0)\n", " step 4520 | loss 0.4093 | acc 89% | margin 1.068 | 1192.7K tok/s | lr 5.55e-04 | VRAM 29630MB (gpu0)\n", " step 4530 | loss 0.3995 | acc 85% | margin 1.160 | 1192.4K tok/s | lr 5.51e-04 | VRAM 29630MB (gpu0)\n", " step 4540 | loss 0.4206 | acc 88% | margin 1.112 | 1192.1K tok/s | lr 5.48e-04 | VRAM 29630MB (gpu0)\n", " step 4550 | loss 0.4962 | acc 72% | margin 0.858 | 1191.2K tok/s | lr 5.44e-04 | VRAM 29630MB (gpu0)\n", " step 4560 | loss 0.4442 | acc 82% | margin 1.060 | 1192.5K tok/s | lr 5.41e-04 | VRAM 29630MB (gpu0)\n", " step 4570 | loss 0.4743 | acc 89% | margin 0.811 | 1192.7K tok/s | lr 5.38e-04 | VRAM 29630MB (gpu0)\n", " step 4580 | loss 0.4409 | acc 81% | margin 1.001 | 1192.8K tok/s | lr 5.34e-04 | VRAM 29630MB (gpu0)\n", " step 4590 | loss 0.4526 | acc 82% | margin 1.026 | 1193.0K tok/s | lr 5.31e-04 | VRAM 29630MB (gpu0)\n", " step 4600 | loss 0.4529 | acc 85% | margin 0.939 | 1139.6K tok/s | lr 5.27e-04 | VRAM 29630MB (gpu0)\n", " step 4610 | loss 0.4597 | acc 81% | margin 0.972 | 1193.4K tok/s | lr 5.24e-04 | VRAM 29630MB (gpu0)\n", " step 4620 | loss 0.5309 | acc 70% | margin 0.813 | 1193.0K tok/s | lr 5.20e-04 | VRAM 29630MB (gpu0)\n", " step 4630 | loss 0.4680 | acc 79% | margin 0.971 | 1192.9K tok/s | lr 5.17e-04 | VRAM 29630MB (gpu0)\n", " step 4640 | loss 0.6277 | acc 78% | margin 0.860 | 1193.0K tok/s | lr 5.13e-04 | VRAM 29630MB (gpu0)\n", " step 4650 | loss 0.4642 | acc 78% | margin 1.025 | 1193.4K tok/s | lr 5.10e-04 | VRAM 29630MB (gpu0)\n", " step 4660 | loss 0.4654 | acc 78% | margin 0.809 | 1194.0K tok/s | lr 5.06e-04 | VRAM 29630MB (gpu0)\n", " step 4670 | loss 0.5027 | acc 74% | margin 0.738 | 1193.6K tok/s | lr 5.03e-04 | VRAM 29630MB (gpu0)\n", " step 4680 | loss 0.4549 | acc 78% | margin 1.066 | 1193.4K tok/s | lr 5.00e-04 | VRAM 29630MB (gpu0)\n", " step 4690 | loss 0.5218 | acc 75% | margin 0.803 | 1193.3K tok/s | lr 4.96e-04 | VRAM 29630MB (gpu0)\n", " step 4700 | loss 0.5260 | acc 76% | margin 0.790 | 1140.0K tok/s | lr 4.93e-04 | VRAM 29630MB (gpu0)\n", " step 4710 | loss 0.4902 | acc 78% | margin 0.942 | 1193.1K tok/s | lr 4.89e-04 | VRAM 29630MB (gpu0)\n", " step 4720 | loss 0.5888 | acc 79% | margin 0.748 | 1192.8K tok/s | lr 4.86e-04 | VRAM 29630MB (gpu0)\n", " step 4730 | loss 0.4681 | acc 84% | margin 0.846 | 1193.0K tok/s | lr 4.82e-04 | VRAM 29630MB (gpu0)\n", " step 4740 | loss 0.4607 | acc 79% | margin 1.112 | 1193.0K tok/s | lr 4.79e-04 | VRAM 29630MB (gpu0)\n", " step 4750 | loss 0.4273 | acc 82% | margin 1.141 | 1192.9K tok/s | lr 4.75e-04 | VRAM 29630MB (gpu0)\n", " step 4760 | loss 0.5340 | acc 71% | margin 0.703 | 1193.0K tok/s | lr 4.72e-04 | VRAM 29630MB (gpu0)\n", " step 4770 | loss 0.4423 | acc 80% | margin 0.931 | 1192.8K tok/s | lr 4.68e-04 | VRAM 29630MB (gpu0)\n", " step 4780 | loss 0.5145 | acc 74% | margin 0.922 | 1192.6K tok/s | lr 4.65e-04 | VRAM 29630MB (gpu0)\n", " step 4790 | loss 0.5250 | acc 79% | margin 0.794 | 1192.5K tok/s | lr 4.62e-04 | VRAM 29630MB (gpu0)\n", " step 4800 | loss 0.4753 | acc 84% | margin 0.824 | 1139.4K tok/s | lr 4.58e-04 | VRAM 29630MB (gpu0)\n", " step 4810 | loss 0.4338 | acc 84% | margin 0.943 | 1193.0K tok/s | lr 4.55e-04 | VRAM 29630MB (gpu0)\n", " step 4820 | loss 0.5476 | acc 76% | margin 0.545 | 1193.1K tok/s | lr 4.51e-04 | VRAM 29630MB (gpu0)\n", " step 4830 | loss 0.5117 | acc 81% | margin 0.721 | 1192.8K tok/s | lr 4.48e-04 | VRAM 29630MB (gpu0)\n", " step 4840 | loss 0.5161 | acc 79% | margin 0.636 | 1192.9K tok/s | lr 4.44e-04 | VRAM 29630MB (gpu0)\n", " step 4850 | loss 0.4498 | acc 85% | margin 0.933 | 1192.7K tok/s | lr 4.41e-04 | VRAM 29630MB (gpu0)\n", " step 4860 | loss 0.5247 | acc 76% | margin 0.771 | 1192.8K tok/s | lr 4.37e-04 | VRAM 29630MB (gpu0)\n", " step 4870 | loss 0.4482 | acc 82% | margin 1.092 | 1192.9K tok/s | lr 4.34e-04 | VRAM 29630MB (gpu0)\n", " step 4880 | loss 0.4515 | acc 84% | margin 1.179 | 1192.7K tok/s | lr 4.30e-04 | VRAM 29630MB (gpu0)\n", " step 4890 | loss 0.4886 | acc 76% | margin 0.846 | 1192.9K tok/s | lr 4.27e-04 | VRAM 29630MB (gpu0)\n", " step 4900 | loss 0.4467 | acc 86% | margin 1.063 | 1139.9K tok/s | lr 4.23e-04 | VRAM 29630MB (gpu0)\n", " step 4910 | loss 0.4907 | acc 80% | margin 0.964 | 1192.8K tok/s | lr 4.20e-04 | VRAM 29630MB (gpu0)\n", " step 4920 | loss 0.4899 | acc 79% | margin 0.737 | 1192.7K tok/s | lr 4.17e-04 | VRAM 29630MB (gpu0)\n", " step 4930 | loss 0.5764 | acc 75% | margin 0.542 | 1193.3K tok/s | lr 4.13e-04 | VRAM 29630MB (gpu0)\n", " step 4940 | loss 0.4630 | acc 85% | margin 0.832 | 1193.4K tok/s | lr 4.10e-04 | VRAM 29630MB (gpu0)\n", " step 4950 | loss 0.4599 | acc 86% | margin 0.907 | 1193.4K tok/s | lr 4.06e-04 | VRAM 29630MB (gpu0)\n", " step 4960 | loss 0.4373 | acc 82% | margin 1.027 | 1193.2K tok/s | lr 4.03e-04 | VRAM 29630MB (gpu0)\n", " step 4970 | loss 0.4344 | acc 85% | margin 1.082 | 1192.7K tok/s | lr 3.99e-04 | VRAM 29630MB (gpu0)\n", " step 4980 | loss 0.5290 | acc 80% | margin 0.818 | 1192.7K tok/s | lr 3.96e-04 | VRAM 29630MB (gpu0)\n", " step 4990 | loss 0.4254 | acc 80% | margin 1.208 | 1192.5K tok/s | lr 3.92e-04 | VRAM 29630MB (gpu0)\n", " step 5000 | loss 0.5259 | acc 74% | margin 0.675 | 1139.5K tok/s | lr 3.89e-04 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5048 | acc 77% | margin 0.916\n", " step 5010 | loss 0.4336 | acc 76% | margin 1.258 | 433.0K tok/s | lr 3.85e-04 | VRAM 29630MB (gpu0)\n", " step 5020 | loss 0.4769 | acc 82% | margin 0.797 | 1192.5K tok/s | lr 3.82e-04 | VRAM 29630MB (gpu0)\n", " step 5030 | loss 0.4420 | acc 81% | margin 1.041 | 1192.6K tok/s | lr 3.79e-04 | VRAM 29630MB (gpu0)\n", " step 5040 | loss 0.5071 | acc 72% | margin 0.717 | 1192.4K tok/s | lr 3.75e-04 | VRAM 29630MB (gpu0)\n", " step 5050 | loss 0.5156 | acc 74% | margin 0.837 | 1192.8K tok/s | lr 3.72e-04 | VRAM 29630MB (gpu0)\n", " step 5060 | loss 0.4979 | acc 82% | margin 0.771 | 1192.4K tok/s | lr 3.68e-04 | VRAM 29630MB (gpu0)\n", " step 5070 | loss 0.4856 | acc 79% | margin 0.862 | 1192.6K tok/s | lr 3.65e-04 | VRAM 29630MB (gpu0)\n", " step 5080 | loss 0.4479 | acc 81% | margin 0.995 | 1192.7K tok/s | lr 3.61e-04 | VRAM 29630MB (gpu0)\n", " step 5090 | loss 0.4885 | acc 85% | margin 0.810 | 1192.9K tok/s | lr 3.58e-04 | VRAM 29630MB (gpu0)\n", " step 5100 | loss 0.4650 | acc 74% | margin 1.312 | 1139.8K tok/s | lr 3.54e-04 | VRAM 29630MB (gpu0)\n", " step 5110 | loss 0.4749 | acc 79% | margin 0.835 | 1192.9K tok/s | lr 3.51e-04 | VRAM 29630MB (gpu0)\n", " step 5120 | loss 0.4098 | acc 92% | margin 1.182 | 1192.8K tok/s | lr 3.47e-04 | VRAM 29630MB (gpu0)\n", " step 5130 | loss 0.8907 | acc 74% | margin 0.346 | 1192.8K tok/s | lr 3.44e-04 | VRAM 29630MB (gpu0)\n", " step 5140 | loss 0.4254 | acc 85% | margin 1.498 | 1192.4K tok/s | lr 3.41e-04 | VRAM 29630MB (gpu0)\n", " step 5150 | loss 0.5433 | acc 71% | margin 0.770 | 1192.6K tok/s | lr 3.37e-04 | VRAM 29630MB (gpu0)\n", " step 5160 | loss 0.5474 | acc 76% | margin 0.721 | 1192.8K tok/s | lr 3.34e-04 | VRAM 29630MB (gpu0)\n", " step 5170 | loss 0.4941 | acc 79% | margin 0.852 | 1192.6K tok/s | lr 3.30e-04 | VRAM 29630MB (gpu0)\n", " step 5180 | loss 0.4681 | acc 81% | margin 0.898 | 1192.9K tok/s | lr 3.27e-04 | VRAM 29630MB (gpu0)\n", " step 5190 | loss 0.4646 | acc 82% | margin 0.924 | 1192.8K tok/s | lr 3.23e-04 | VRAM 29630MB (gpu0)\n", " step 5200 | loss 0.4775 | acc 76% | margin 0.985 | 1139.8K tok/s | lr 3.20e-04 | VRAM 29630MB (gpu0)\n", " step 5210 | loss 0.4202 | acc 81% | margin 1.387 | 1192.9K tok/s | lr 3.16e-04 | VRAM 29630MB (gpu0)\n", " step 5220 | loss 0.4788 | acc 79% | margin 1.063 | 1192.7K tok/s | lr 3.13e-04 | VRAM 29630MB (gpu0)\n", " step 5230 | loss 0.4792 | acc 79% | margin 0.931 | 1193.0K tok/s | lr 3.09e-04 | VRAM 29630MB (gpu0)\n", " step 5240 | loss 0.4858 | acc 84% | margin 0.935 | 1192.8K tok/s | lr 3.06e-04 | VRAM 29630MB (gpu0)\n", " step 5250 | loss 0.4732 | acc 86% | margin 0.777 | 1192.8K tok/s | lr 3.02e-04 | VRAM 29630MB (gpu0)\n", " step 5260 | loss 0.4754 | acc 84% | margin 0.982 | 1192.4K tok/s | lr 2.99e-04 | VRAM 29630MB (gpu0)\n", " step 5270 | loss 0.5645 | acc 78% | margin 0.762 | 1192.7K tok/s | lr 2.96e-04 | VRAM 29630MB (gpu0)\n", " step 5280 | loss 0.7575 | acc 78% | margin 0.791 | 1192.8K tok/s | lr 2.92e-04 | VRAM 29630MB (gpu0)\n", " step 5290 | loss 0.6849 | acc 69% | margin 0.498 | 1192.8K tok/s | lr 2.89e-04 | VRAM 29630MB (gpu0)\n", " step 5300 | loss 0.4204 | acc 88% | margin 1.050 | 1139.5K tok/s | lr 2.85e-04 | VRAM 29630MB (gpu0)\n", " step 5310 | loss 0.5295 | acc 75% | margin 0.724 | 1192.3K tok/s | lr 2.82e-04 | VRAM 29630MB (gpu0)\n", " step 5320 | loss 0.4862 | acc 79% | margin 1.008 | 1192.2K tok/s | lr 2.78e-04 | VRAM 29630MB (gpu0)\n", " step 5330 | loss 0.4301 | acc 85% | margin 0.987 | 1192.3K tok/s | lr 2.75e-04 | VRAM 29630MB (gpu0)\n", " step 5340 | loss 0.4986 | acc 76% | margin 0.862 | 1192.2K tok/s | lr 2.71e-04 | VRAM 29630MB (gpu0)\n", " step 5350 | loss 0.4404 | acc 81% | margin 1.041 | 1192.3K tok/s | lr 2.68e-04 | VRAM 29630MB (gpu0)\n", " step 5360 | loss 0.5329 | acc 81% | margin 0.848 | 1192.8K tok/s | lr 2.64e-04 | VRAM 29630MB (gpu0)\n", " step 5370 | loss 0.5031 | acc 76% | margin 0.932 | 1192.9K tok/s | lr 2.61e-04 | VRAM 29630MB (gpu0)\n", " step 5380 | loss 0.5076 | acc 76% | margin 0.795 | 1193.1K tok/s | lr 2.58e-04 | VRAM 29630MB (gpu0)\n", " step 5390 | loss 0.5220 | acc 76% | margin 0.654 | 1193.0K tok/s | lr 2.54e-04 | VRAM 29630MB (gpu0)\n", " step 5400 | loss 0.4477 | acc 81% | margin 1.006 | 1139.8K tok/s | lr 2.51e-04 | VRAM 29630MB (gpu0)\n", " step 5410 | loss 0.4705 | acc 76% | margin 0.956 | 1192.9K tok/s | lr 2.47e-04 | VRAM 29630MB (gpu0)\n", " step 5420 | loss 0.4315 | acc 81% | margin 1.125 | 1193.0K tok/s | lr 2.44e-04 | VRAM 29630MB (gpu0)\n", " step 5430 | loss 0.5131 | acc 81% | margin 0.652 | 1193.4K tok/s | lr 2.40e-04 | VRAM 29630MB (gpu0)\n", " step 5440 | loss 0.4305 | acc 84% | margin 1.040 | 1193.6K tok/s | lr 2.37e-04 | VRAM 29630MB (gpu0)\n", " step 5450 | loss 0.4888 | acc 74% | margin 0.741 | 1193.5K tok/s | lr 2.33e-04 | VRAM 29630MB (gpu0)\n", " step 5460 | loss 0.4608 | acc 80% | margin 1.061 | 1193.2K tok/s | lr 2.30e-04 | VRAM 29630MB (gpu0)\n", " step 5470 | loss 0.4670 | acc 81% | margin 1.117 | 1193.1K tok/s | lr 2.26e-04 | VRAM 29630MB (gpu0)\n", " step 5480 | loss 0.5116 | acc 78% | margin 0.760 | 1193.4K tok/s | lr 2.23e-04 | VRAM 29630MB (gpu0)\n", " step 5490 | loss 0.4570 | acc 78% | margin 1.216 | 1193.5K tok/s | lr 2.20e-04 | VRAM 29630MB (gpu0)\n", " step 5500 | loss 0.4632 | acc 80% | margin 0.802 | 1139.9K tok/s | lr 2.16e-04 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.5025 | acc 75% | margin 0.933\n", " step 5510 | loss 0.4751 | acc 82% | margin 0.769 | 430.7K tok/s | lr 2.13e-04 | VRAM 29630MB (gpu0)\n", " step 5520 | loss 0.5045 | acc 79% | margin 0.691 | 1192.8K tok/s | lr 2.09e-04 | VRAM 29630MB (gpu0)\n", " step 5530 | loss 0.4760 | acc 82% | margin 0.936 | 1192.8K tok/s | lr 2.06e-04 | VRAM 29630MB (gpu0)\n", " step 5540 | loss 0.4429 | acc 80% | margin 1.039 | 1192.8K tok/s | lr 2.02e-04 | VRAM 29630MB (gpu0)\n", " step 5550 | loss 0.4615 | acc 82% | margin 1.006 | 1192.7K tok/s | lr 1.99e-04 | VRAM 29630MB (gpu0)\n", " step 5560 | loss 0.4616 | acc 78% | margin 1.209 | 1192.6K tok/s | lr 1.95e-04 | VRAM 29630MB (gpu0)\n", " step 5570 | loss 0.4751 | acc 81% | margin 0.993 | 1192.6K tok/s | lr 1.92e-04 | VRAM 29630MB (gpu0)\n", " step 5580 | loss 0.5856 | acc 79% | margin 0.825 | 1192.5K tok/s | lr 1.88e-04 | VRAM 29630MB (gpu0)\n", " step 5590 | loss 0.4234 | acc 86% | margin 0.986 | 1192.5K tok/s | lr 1.85e-04 | VRAM 29630MB (gpu0)\n", " step 5600 | loss 0.4222 | acc 81% | margin 1.112 | 1139.6K tok/s | lr 1.81e-04 | VRAM 29630MB (gpu0)\n", " step 5610 | loss 0.6683 | acc 74% | margin 0.743 | 1192.5K tok/s | lr 1.78e-04 | VRAM 29630MB (gpu0)\n", " step 5620 | loss 0.4363 | acc 81% | margin 1.213 | 1192.7K tok/s | lr 1.75e-04 | VRAM 29630MB (gpu0)\n", " step 5630 | loss 0.5177 | acc 78% | margin 0.834 | 1192.5K tok/s | lr 1.71e-04 | VRAM 29630MB (gpu0)\n", " step 5640 | loss 0.5908 | acc 82% | margin 0.965 | 1192.6K tok/s | lr 1.68e-04 | VRAM 29630MB (gpu0)\n", " step 5650 | loss 0.4612 | acc 79% | margin 0.989 | 1192.7K tok/s | lr 1.64e-04 | VRAM 29630MB (gpu0)\n", " step 5660 | loss 0.4238 | acc 80% | margin 1.126 | 1192.8K tok/s | lr 1.61e-04 | VRAM 29630MB (gpu0)\n", " step 5670 | loss 0.4665 | acc 80% | margin 1.002 | 1192.6K tok/s | lr 1.57e-04 | VRAM 29630MB (gpu0)\n", " step 5680 | loss 0.4013 | acc 89% | margin 1.352 | 1192.9K tok/s | lr 1.54e-04 | VRAM 29630MB (gpu0)\n", " step 5690 | loss 0.4654 | acc 80% | margin 0.951 | 1193.1K tok/s | lr 1.50e-04 | VRAM 29630MB (gpu0)\n", " step 5700 | loss 0.4792 | acc 79% | margin 0.973 | 1140.0K tok/s | lr 1.47e-04 | VRAM 29630MB (gpu0)\n", " step 5710 | loss 0.4401 | acc 82% | margin 1.172 | 1193.5K tok/s | lr 1.43e-04 | VRAM 29630MB (gpu0)\n", " step 5720 | loss 0.5221 | acc 75% | margin 0.833 | 1192.8K tok/s | lr 1.40e-04 | VRAM 29630MB (gpu0)\n", " step 5730 | loss 0.4879 | acc 85% | margin 0.821 | 1192.9K tok/s | lr 1.37e-04 | VRAM 29630MB (gpu0)\n", " step 5740 | loss 0.4441 | acc 80% | margin 0.917 | 1193.0K tok/s | lr 1.33e-04 | VRAM 29630MB (gpu0)\n", " step 5750 | loss 0.4608 | acc 82% | margin 1.098 | 1192.9K tok/s | lr 1.30e-04 | VRAM 29630MB (gpu0)\n", " step 5760 | loss 0.4667 | acc 74% | margin 1.031 | 1192.7K tok/s | lr 1.26e-04 | VRAM 29630MB (gpu0)\n", " step 5770 | loss 0.4583 | acc 82% | margin 1.020 | 1192.6K tok/s | lr 1.23e-04 | VRAM 29630MB (gpu0)\n", " step 5780 | loss 0.4801 | acc 74% | margin 1.193 | 1192.7K tok/s | lr 1.19e-04 | VRAM 29630MB (gpu0)\n", " step 5790 | loss 0.4553 | acc 82% | margin 1.045 | 1192.7K tok/s | lr 1.16e-04 | VRAM 29630MB (gpu0)\n", " step 5800 | loss 0.5678 | acc 79% | margin 0.811 | 1139.6K tok/s | lr 1.12e-04 | VRAM 29630MB (gpu0)\n", " step 5810 | loss 0.4693 | acc 82% | margin 0.889 | 1192.3K tok/s | lr 1.09e-04 | VRAM 29630MB (gpu0)\n", " step 5820 | loss 0.4967 | acc 74% | margin 0.833 | 1192.0K tok/s | lr 1.05e-04 | VRAM 29630MB (gpu0)\n", " step 5830 | loss 0.5221 | acc 81% | margin 0.655 | 1192.3K tok/s | lr 1.02e-04 | VRAM 29630MB (gpu0)\n", " step 5840 | loss 0.4524 | acc 82% | margin 1.295 | 1192.0K tok/s | lr 9.85e-05 | VRAM 29630MB (gpu0)\n", " step 5850 | loss 0.5215 | acc 75% | margin 0.851 | 1192.2K tok/s | lr 9.51e-05 | VRAM 29630MB (gpu0)\n", " step 5860 | loss 0.4917 | acc 78% | margin 0.896 | 1192.5K tok/s | lr 9.16e-05 | VRAM 29630MB (gpu0)\n", " step 5870 | loss 0.5111 | acc 72% | margin 0.878 | 1192.6K tok/s | lr 8.82e-05 | VRAM 29630MB (gpu0)\n", " step 5880 | loss 0.5098 | acc 75% | margin 0.977 | 1192.6K tok/s | lr 8.47e-05 | VRAM 29630MB (gpu0)\n", " step 5890 | loss 0.4718 | acc 80% | margin 0.711 | 1192.7K tok/s | lr 8.12e-05 | VRAM 29630MB (gpu0)\n", " step 5900 | loss 0.4693 | acc 76% | margin 1.091 | 1140.1K tok/s | lr 7.78e-05 | VRAM 29630MB (gpu0)\n", " step 5910 | loss 0.4885 | acc 81% | margin 0.979 | 1192.8K tok/s | lr 7.43e-05 | VRAM 29630MB (gpu0)\n", " step 5920 | loss 0.3977 | acc 89% | margin 1.189 | 1192.3K tok/s | lr 7.09e-05 | VRAM 29630MB (gpu0)\n", " step 5930 | loss 0.3960 | acc 85% | margin 1.200 | 1192.5K tok/s | lr 6.74e-05 | VRAM 29630MB (gpu0)\n", " step 5940 | loss 0.4902 | acc 75% | margin 0.948 | 1192.4K tok/s | lr 6.40e-05 | VRAM 29630MB (gpu0)\n", " step 5950 | loss 0.6200 | acc 79% | margin 0.491 | 1192.9K tok/s | lr 6.05e-05 | VRAM 29630MB (gpu0)\n", " step 5960 | loss 0.4909 | acc 84% | margin 1.005 | 1193.0K tok/s | lr 5.70e-05 | VRAM 29630MB (gpu0)\n", " step 5970 | loss 0.4367 | acc 86% | margin 0.900 | 1193.0K tok/s | lr 5.36e-05 | VRAM 29630MB (gpu0)\n", " step 5980 | loss 0.5151 | acc 75% | margin 0.903 | 1193.3K tok/s | lr 5.01e-05 | VRAM 29630MB (gpu0)\n", " step 5990 | loss 0.4746 | acc 79% | margin 0.829 | 1193.4K tok/s | lr 4.67e-05 | VRAM 29630MB (gpu0)\n", " step 6000 | loss 0.4616 | acc 86% | margin 0.929 | 1140.3K tok/s | lr 4.32e-05 | VRAM 29630MB (gpu0)\n", " >>> eval loss 0.4942 | acc 76% | margin 0.938\n", " >>> saved checkpoints/dpo/dpo_step_0006000.pt\n", " step 6010 | loss 0.4146 | acc 82% | margin 1.302 | 432.6K tok/s | lr 3.98e-05 | VRAM 29630MB (gpu0)\n", " step 6020 | loss 0.4655 | acc 78% | margin 0.868 | 1193.2K tok/s | lr 3.63e-05 | VRAM 29630MB (gpu0)\n", " step 6030 | loss 0.5049 | acc 76% | margin 0.867 | 1193.1K tok/s | lr 3.28e-05 | VRAM 29630MB (gpu0)\n", " step 6040 | loss 0.5227 | acc 76% | margin 0.757 | 1192.9K tok/s | lr 2.94e-05 | VRAM 29630MB (gpu0)\n", " step 6050 | loss 0.4827 | acc 80% | margin 1.012 | 1192.8K tok/s | lr 2.59e-05 | VRAM 29630MB (gpu0)\n", " step 6060 | loss 0.4524 | acc 75% | margin 1.064 | 1193.2K tok/s | lr 2.25e-05 | VRAM 29630MB (gpu0)\n", " step 6070 | loss 0.4935 | acc 79% | margin 0.763 | 1192.8K tok/s | lr 1.90e-05 | VRAM 29630MB (gpu0)\n", " step 6080 | loss 0.4904 | acc 79% | margin 1.178 | 1193.1K tok/s | lr 1.56e-05 | VRAM 29630MB (gpu0)\n", " step 6090 | loss 0.4804 | acc 80% | margin 0.864 | 1192.8K tok/s | lr 1.21e-05 | VRAM 29630MB (gpu0)\n", " step 6100 | loss 0.4958 | acc 75% | margin 0.904 | 1139.8K tok/s | lr 8.64e-06 | VRAM 29630MB (gpu0)\n", " step 6110 | loss 0.4625 | acc 81% | margin 0.974 | 1193.1K tok/s | lr 5.19e-06 | VRAM 29630MB (gpu0)\n", " step 6120 | loss 0.5548 | acc 76% | margin 0.662 | 1193.2K tok/s | lr 1.73e-06 | VRAM 29630MB (gpu0)\n", "\n", "Final eval: loss 0.4960 | acc 76% | margin 0.927\n", "Final DPO checkpoint: checkpoints/dpo/dpo_step_0006124.pt\n", "\n", "======================================================================\n", "DPO complete. 12,842,958,848 tokens processed.\n", "Best eval loss: 0.4901\n", "======================================================================\n" ] } ], "source": [ "!torchrun --nproc_per_node=8 -m freqformer.dpo_train \\\n", " --preset small \\\n", " --distributed ddp \\\n", " --sft_checkpoint checkpoints/sft/sft_step_0023008.pt \\\n", " --data_dir dpo \\\n", " --beta 0.3 \\\n", " --label_smoothing 0.0 \\\n", " --optimizer splus \\\n", " --lr 2e-6 \\\n", " --weight_decay 0.0 \\\n", " --lr_schedule linear \\\n", " --warmup_steps 200 \\\n", " --batch_size 2 \\\n", " --seq_len 16384 \\\n", " --grad_accum_steps 4 \\\n", " --num_epochs 2 \\\n", " --log_every 10 \\\n", " --eval_every 500 \\\n", " --checkpoint_every 2000 \\\n", " --checkpoint_dir checkpoints/dpo" ] }, { "cell_type": "code", "execution_count": null, "id": "12d40460-2f70-4827-af22-400042d445a7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 5 }