Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- math_evaluation/__pycache__/evaluate.cpython-310.pyc +0 -0
- math_evaluation/__pycache__/trajectory.cpython-310.pyc +0 -0
- math_evaluation/outputs/Qwen/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
- math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/LUFFY/data/save_model/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1_qwen-boxed_metrics.json +13 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Full-solution/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2450/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-test-1_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-2/checkpoint-850/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-100/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-500/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full/checkpoint-2650/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
- math_evaluation/sh/eval_1.sh +111 -0
- train/ETrainer.py +599 -0
- train/test_1.py +662 -0
- train/test_on_math.py +71 -0
- train/test_ood_python_lora_rope.py +203 -0
- train/train_qwen_verl_46k.py +0 -0
- train/train_qwen_verl_46k.sh +0 -0
- train/wandb/run-20251113_165350-n56lk6p0/logs/debug-internal.log +6 -0
- train/wandb/run-20251113_171624-kgxigylp/files/wandb-metadata.json +117 -0
- train/wandb/run-20251114_040305-jb702f8e/files/wandb-summary.json +1 -0
- train/wandb/run-20251114_083110-mocjk23v/files/wandb-summary.json +1 -0
- train/wandb/run-20251114_085634-l1whc2fu/logs/debug-core.log +14 -0
- train/wandb/run-20251114_093516-syhj5u87/files/config.yaml +179 -0
- train/wandb/run-20251114_093516-syhj5u87/logs/debug-core.log +12 -0
- train/wandb/run-20251114_103643-cvm4116u/files/config.yaml +698 -0
- train/wandb/run-20251114_103643-cvm4116u/files/output.log +262 -0
- train/wandb/run-20251114_103643-cvm4116u/run-cvm4116u.wandb +0 -0
- train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-metadata.json +162 -0
- train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-summary.json +1 -0
- train/wandb/run-20251114_145219-w9xre5r3/files/config.yaml +659 -0
- train/wandb/run-20251114_145219-w9xre5r3/files/output.log +336 -0
- train/wandb/run-20251114_145219-w9xre5r3/files/requirements.txt +150 -0
- train/wandb/run-20251114_145219-w9xre5r3/files/wandb-metadata.json +129 -0
- train/wandb/run-20251114_145219-w9xre5r3/files/wandb-summary.json +1 -0
- train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log +9 -0
- train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log +26 -0
- train/wandb/run-20251114_145222-i8zbx8vz/files/config.yaml +142 -0
math_evaluation/__pycache__/evaluate.cpython-310.pyc
ADDED
|
Binary file (4.15 kB). View file
|
|
|
math_evaluation/__pycache__/trajectory.cpython-310.pyc
ADDED
|
Binary file (5.37 kB). View file
|
|
|
math_evaluation/outputs/Qwen/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 32,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 59.6,
|
| 7 |
+
"all_acc": [
|
| 8 |
+
59.6
|
| 9 |
+
],
|
| 10 |
+
"mean_acc": 59.6,
|
| 11 |
+
"time_use_in_second": 40.11754846572876,
|
| 12 |
+
"time_use_in_minite": "0:40"
|
| 13 |
+
}
|
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/LUFFY/data/save_model/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"all_acc": [
|
| 8 |
+
6.7
|
| 9 |
+
],
|
| 10 |
+
"mean_acc": 6.7,
|
| 11 |
+
"time_use_in_second": 8.051186084747314,
|
| 12 |
+
"time_use_in_minite": "0:08"
|
| 13 |
+
}
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1_qwen-boxed_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 54,
|
| 5 |
+
"empty_samples": 56,
|
| 6 |
+
"acc": 32.0,
|
| 7 |
+
"all_acc": [
|
| 8 |
+
32.0
|
| 9 |
+
],
|
| 10 |
+
"mean_acc": 32.0,
|
| 11 |
+
"time_use_in_second": 62.29504370689392,
|
| 12 |
+
"time_use_in_minite": "1:02"
|
| 13 |
+
}
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Full-solution/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2450/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 72.6,
|
| 7 |
+
"all_acc": [
|
| 8 |
+
72.6
|
| 9 |
+
],
|
| 10 |
+
"mean_acc": 72.6,
|
| 11 |
+
"time_use_in_second": 18.515631198883057,
|
| 12 |
+
"time_use_in_minite": "0:18"
|
| 13 |
+
}
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-test-1_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-2/checkpoint-850/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-100/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-500/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full/checkpoint-2650/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
math_evaluation/sh/eval_1.sh
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -ex
|
| 2 |
+
|
| 3 |
+
PROMPT_TYPE=$1
|
| 4 |
+
MODEL_NAME_OR_PATH=$2
|
| 5 |
+
OUTPUT_DIR=$3
|
| 6 |
+
n_sampling=$4
|
| 7 |
+
temperature=$5
|
| 8 |
+
|
| 9 |
+
SPLIT="test"
|
| 10 |
+
NUM_TEST_SAMPLE=-1
|
| 11 |
+
|
| 12 |
+
# English open datasets
|
| 13 |
+
DATA_NAME="math_oai,minerva_math,olympiadbench"
|
| 14 |
+
# DATA_NAME="gsm8k,math,svamp,asdiv,mawps,carp_en,tabmwp,minerva_math,gaokao2023en,olympiadbench,college_math"
|
| 15 |
+
TOKENIZERS_PARALLELISM=false \
|
| 16 |
+
python3 -u math_eval.py \
|
| 17 |
+
--model_name_or_path ${MODEL_NAME_OR_PATH} \
|
| 18 |
+
--data_name ${DATA_NAME} \
|
| 19 |
+
--output_dir ${OUTPUT_DIR} \
|
| 20 |
+
--split ${SPLIT} \
|
| 21 |
+
--prompt_type ${PROMPT_TYPE} \
|
| 22 |
+
--num_test_sample ${NUM_TEST_SAMPLE} \
|
| 23 |
+
--seed 0 \
|
| 24 |
+
--temperature ${temperature} \
|
| 25 |
+
--n_sampling ${n_sampling} \
|
| 26 |
+
--top_p 1 \
|
| 27 |
+
--start 0 \
|
| 28 |
+
--end -1 \
|
| 29 |
+
--use_vllm
|
| 30 |
+
|
| 31 |
+
# English multiple-choice datasets
|
| 32 |
+
# DATA_NAME="aqua,sat_math,mmlu_stem"
|
| 33 |
+
# TOKENIZERS_PARALLELISM=false \
|
| 34 |
+
# python3 -u math_eval.py \
|
| 35 |
+
# --model_name_or_path ${MODEL_NAME_OR_PATH} \
|
| 36 |
+
# --data_name ${DATA_NAME} \
|
| 37 |
+
# --output_dir ${OUTPUT_DIR} \
|
| 38 |
+
# --split ${SPLIT} \
|
| 39 |
+
# --prompt_type ${PROMPT_TYPE} \
|
| 40 |
+
# --num_test_sample ${NUM_TEST_SAMPLE} \
|
| 41 |
+
# --seed 0 \
|
| 42 |
+
# --temperature 0 \
|
| 43 |
+
# --n_sampling 1 \
|
| 44 |
+
# --top_p 1 \
|
| 45 |
+
# --start 0 \
|
| 46 |
+
# --end -1 \
|
| 47 |
+
# --use_vllm \
|
| 48 |
+
# --save_outputs \
|
| 49 |
+
# --overwrite \
|
| 50 |
+
# --num_shots 5
|
| 51 |
+
|
| 52 |
+
# Chinese gaokao collections
|
| 53 |
+
# DATA_NAME="gaokao2024_I,gaokao2024_II,gaokao2024_mix,gaokao_math_cloze,gaokao_math_qa"
|
| 54 |
+
# TOKENIZERS_PARALLELISM=false \
|
| 55 |
+
# python3 -u math_eval.py \
|
| 56 |
+
# --model_name_or_path ${MODEL_NAME_OR_PATH} \
|
| 57 |
+
# --data_name ${DATA_NAME} \
|
| 58 |
+
# --output_dir ${OUTPUT_DIR} \
|
| 59 |
+
# --split ${SPLIT} \
|
| 60 |
+
# --prompt_type ${PROMPT_TYPE} \
|
| 61 |
+
# --num_test_sample ${NUM_TEST_SAMPLE} \
|
| 62 |
+
# --seed 0 \
|
| 63 |
+
# --temperature 0 \
|
| 64 |
+
# --n_sampling 1 \
|
| 65 |
+
# --top_p 1 \
|
| 66 |
+
# --start 0 \
|
| 67 |
+
# --end -1 \
|
| 68 |
+
# --use_vllm \
|
| 69 |
+
# --save_outputs \
|
| 70 |
+
# --overwrite \
|
| 71 |
+
# --adapt_few_shot
|
| 72 |
+
|
| 73 |
+
# Chinese other datasets
|
| 74 |
+
# DATA_NAME="cmath,cn_middle_school"
|
| 75 |
+
# TOKENIZERS_PARALLELISM=false \
|
| 76 |
+
# python3 -u math_eval.py \
|
| 77 |
+
# --model_name_or_path ${MODEL_NAME_OR_PATH} \
|
| 78 |
+
# --data_name ${DATA_NAME} \
|
| 79 |
+
# --output_dir ${OUTPUT_DIR} \
|
| 80 |
+
# --split ${SPLIT} \
|
| 81 |
+
# --prompt_type ${PROMPT_TYPE} \
|
| 82 |
+
# --num_test_sample ${NUM_TEST_SAMPLE} \
|
| 83 |
+
# --seed 0 \
|
| 84 |
+
# --temperature 0 \
|
| 85 |
+
# --n_sampling 1 \
|
| 86 |
+
# --top_p 1 \
|
| 87 |
+
# --start 0 \
|
| 88 |
+
# --end -1 \
|
| 89 |
+
# --use_vllm \
|
| 90 |
+
# --save_outputs \
|
| 91 |
+
# --overwrite \
|
| 92 |
+
# --adapt_few_shot
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# English competition datasets
|
| 96 |
+
DATA_NAME="aime24,amc23"
|
| 97 |
+
TOKENIZERS_PARALLELISM=false \
|
| 98 |
+
python3 -u math_eval.py \
|
| 99 |
+
--model_name_or_path ${MODEL_NAME_OR_PATH} \
|
| 100 |
+
--data_name ${DATA_NAME} \
|
| 101 |
+
--output_dir ${OUTPUT_DIR} \
|
| 102 |
+
--split ${SPLIT} \
|
| 103 |
+
--prompt_type ${PROMPT_TYPE} \
|
| 104 |
+
--num_test_sample ${NUM_TEST_SAMPLE} \
|
| 105 |
+
--seed 0 \
|
| 106 |
+
--temperature ${temperature} \
|
| 107 |
+
--n_sampling ${n_sampling} \
|
| 108 |
+
--top_p 1 \
|
| 109 |
+
--start 0 \
|
| 110 |
+
--end -1 \
|
| 111 |
+
--use_vllm
|
train/ETrainer.py
ADDED
|
@@ -0,0 +1,599 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Entropy-weighted Trainer (ETrainer) for knowledge distillation with dynamic loss weighting.
|
| 17 |
+
Based on TRL's SFTTrainer with modifications for teacher-student entropy-based weighting.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
from typing import Any, Dict
|
| 23 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
+
from trl import SFTTrainer
|
| 25 |
+
from dataclasses import dataclass, field
|
| 26 |
+
from trl import SFTConfig
|
| 27 |
+
import transformers
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
|
| 31 |
+
class ETrainerConfig(SFTConfig):
|
| 32 |
+
"""
|
| 33 |
+
Extended SFTConfig with teacher model parameters for entropy-weighted training.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
teacher_model_path (`str`, *optional*):
|
| 37 |
+
Path to the teacher model (e.g., "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B").
|
| 38 |
+
If None, no entropy weighting is applied.
|
| 39 |
+
entropy_weight_alpha (`float`, defaults to 2.0):
|
| 40 |
+
Alpha parameter controlling the sensitivity of entropy difference weighting.
|
| 41 |
+
Higher values make the weighting more sensitive to entropy differences.
|
| 42 |
+
entropy_weight_beta (`float`, defaults to 0.3):
|
| 43 |
+
Beta parameter controlling the zero-point offset for entropy difference.
|
| 44 |
+
Determines the threshold for significant entropy differences.
|
| 45 |
+
use_entropy_weighting (`bool`, defaults to True):
|
| 46 |
+
Whether to apply entropy-based loss weighting. Set to False to use standard loss.
|
| 47 |
+
teacher_dtype (`str`, defaults to "bfloat16"):
|
| 48 |
+
Data type for teacher model ("float16", "bfloat16", or "float32").
|
| 49 |
+
entropy_top_k (`int`, defaults to 64):
|
| 50 |
+
Number of top tokens to use for entropy approximation. Set to None or vocab_size
|
| 51 |
+
to compute exact entropy. Smaller values save memory significantly.
|
| 52 |
+
"""
|
| 53 |
+
teacher_model_path: str | None = field(
|
| 54 |
+
default=None,
|
| 55 |
+
metadata={"help": "Path to teacher model for entropy weighting"}
|
| 56 |
+
)
|
| 57 |
+
entropy_weight_alpha: float = field(
|
| 58 |
+
default=2.0,
|
| 59 |
+
metadata={"help": "Alpha parameter for entropy weighting sensitivity"}
|
| 60 |
+
)
|
| 61 |
+
entropy_weight_beta: float = field(
|
| 62 |
+
default=0.3,
|
| 63 |
+
metadata={"help": "Beta parameter for entropy weighting offset"}
|
| 64 |
+
)
|
| 65 |
+
use_entropy_weighting: bool = field(
|
| 66 |
+
default=True,
|
| 67 |
+
metadata={"help": "Whether to apply entropy-based loss weighting"}
|
| 68 |
+
)
|
| 69 |
+
teacher_dtype: str = field(
|
| 70 |
+
default="bfloat16",
|
| 71 |
+
metadata={"help": "Teacher model dtype (float16/bfloat16/float32)"}
|
| 72 |
+
)
|
| 73 |
+
entropy_top_k: int = field(
|
| 74 |
+
default=64,
|
| 75 |
+
metadata={"help": "Top-K tokens for entropy approximation (reduces memory)"}
|
| 76 |
+
)
|
| 77 |
+
teacher_device_ids: str | None = field(
|
| 78 |
+
default=None,
|
| 79 |
+
metadata={
|
| 80 |
+
"help": "GPU device IDs for teacher model (e.g., '0', '0,1', 'cuda:2'). "
|
| 81 |
+
"If None, uses same device as student. Separate GPUs enable parallel computation."
|
| 82 |
+
}
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class ETrainer(SFTTrainer):
|
| 87 |
+
"""
|
| 88 |
+
Entropy-weighted Trainer for knowledge distillation.
|
| 89 |
+
|
| 90 |
+
This trainer extends SFTTrainer by adding dynamic per-token loss weighting based on
|
| 91 |
+
teacher-student entropy differences. The weight formula is:
|
| 92 |
+
|
| 93 |
+
w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
|
| 94 |
+
|
| 95 |
+
Where:
|
| 96 |
+
- H_t(j): Teacher's entropy at position j (lower = more confident)
|
| 97 |
+
- H_s(j): Student's entropy at position j
|
| 98 |
+
- α: Sensitivity parameter (default 2.0)
|
| 99 |
+
- β: Offset parameter (default 0.3)
|
| 100 |
+
|
| 101 |
+
Key features:
|
| 102 |
+
- Higher weight when teacher is confident (low H_t)
|
| 103 |
+
- Higher weight when student differs significantly from teacher (large |ΔH|)
|
| 104 |
+
- Smooth weighting via sigmoid to avoid gradient explosion
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
def __init__(self, *args, **kwargs):
|
| 108 |
+
# Extract ETrainer-specific arguments before passing to parent
|
| 109 |
+
args_obj = kwargs.get('args', None)
|
| 110 |
+
|
| 111 |
+
# Initialize parent SFTTrainer
|
| 112 |
+
super().__init__(*args, **kwargs)
|
| 113 |
+
|
| 114 |
+
# Load teacher model if specified
|
| 115 |
+
self.teacher_model = None
|
| 116 |
+
self.use_entropy_weighting = False
|
| 117 |
+
|
| 118 |
+
# Initialize entropy-specific metrics
|
| 119 |
+
# Note: Only initialize metrics we actually populate to avoid ZeroDivisionError
|
| 120 |
+
if not hasattr(self, '_metrics'):
|
| 121 |
+
self._metrics = {'train': {}, 'eval': {}}
|
| 122 |
+
|
| 123 |
+
# Add entropy-specific metric keys
|
| 124 |
+
for mode in ['train', 'eval']:
|
| 125 |
+
if mode not in self._metrics:
|
| 126 |
+
self._metrics[mode] = {}
|
| 127 |
+
# Only initialize avg_weight since we're not logging entropy to save memory
|
| 128 |
+
self._metrics[mode].setdefault('avg_weight', [])
|
| 129 |
+
|
| 130 |
+
if args_obj and hasattr(args_obj, 'teacher_model_path') and args_obj.teacher_model_path:
|
| 131 |
+
self.use_entropy_weighting = args_obj.use_entropy_weighting
|
| 132 |
+
self.entropy_weight_alpha = args_obj.entropy_weight_alpha
|
| 133 |
+
self.entropy_weight_beta = args_obj.entropy_weight_beta
|
| 134 |
+
self.entropy_top_k = getattr(args_obj, 'entropy_top_k', 64)
|
| 135 |
+
|
| 136 |
+
if self.use_entropy_weighting:
|
| 137 |
+
print(f"🎓 Loading teacher model: {args_obj.teacher_model_path}")
|
| 138 |
+
print(f"📊 Entropy weighting params: α={self.entropy_weight_alpha}, β={self.entropy_weight_beta}")
|
| 139 |
+
print(f"💾 Entropy computation: top-k={self.entropy_top_k} (memory-efficient mode)")
|
| 140 |
+
|
| 141 |
+
# Load teacher model to specified device(s)
|
| 142 |
+
self.teacher_device = self._load_teacher_model(args_obj)
|
| 143 |
+
|
| 144 |
+
print("✅ Teacher model loaded and frozen")
|
| 145 |
+
|
| 146 |
+
# Freeze teacher model
|
| 147 |
+
for param in self.teacher_model.parameters():
|
| 148 |
+
param.requires_grad = False
|
| 149 |
+
|
| 150 |
+
print("✅ Teacher model loaded and frozen")
|
| 151 |
+
|
| 152 |
+
# --- Fix teacher tokenizer/model vocab mismatch ---
|
| 153 |
+
print("\n🔍 Checking teacher tokenizer/model alignment...")
|
| 154 |
+
teacher_tokenizer = AutoTokenizer.from_pretrained(
|
| 155 |
+
args_obj.teacher_model_path,
|
| 156 |
+
trust_remote_code=True
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
tokenizer_vocab = len(teacher_tokenizer)
|
| 160 |
+
model_vocab = self.teacher_model.config.vocab_size
|
| 161 |
+
|
| 162 |
+
print(f"📌 Teacher tokenizer vocab: {tokenizer_vocab}")
|
| 163 |
+
print(f"📌 Teacher model vocab: {model_vocab}")
|
| 164 |
+
|
| 165 |
+
if tokenizer_vocab != model_vocab:
|
| 166 |
+
print("⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...")
|
| 167 |
+
self.teacher_model.resize_token_embeddings(tokenizer_vocab)
|
| 168 |
+
self.teacher_model.config.vocab_size = tokenizer_vocab
|
| 169 |
+
print(f"✅ Teacher embeddings resized to {tokenizer_vocab}")
|
| 170 |
+
else:
|
| 171 |
+
print("✅ Teacher tokenizer and model vocab already match")
|
| 172 |
+
|
| 173 |
+
# --- Now align student with teacher ---
|
| 174 |
+
print(f"\n📊 Student model vocab size: {self.model.config.vocab_size}")
|
| 175 |
+
print(f"📊 Teacher model vocab size (after alignment): {self.teacher_model.config.vocab_size}")
|
| 176 |
+
|
| 177 |
+
# Handle vocab size mismatch by resizing student embeddings to match teacher
|
| 178 |
+
if self.teacher_model.config.vocab_size != self.model.config.vocab_size:
|
| 179 |
+
print(f"\n⚠️ Student/Teacher vocab size mismatch detected!")
|
| 180 |
+
print(f" Teacher: {self.teacher_model.config.vocab_size}")
|
| 181 |
+
print(f" Student: {self.model.config.vocab_size}")
|
| 182 |
+
print(f"🔧 Resizing student embeddings to match teacher...")
|
| 183 |
+
|
| 184 |
+
# Resize student model embeddings (new tokens initialized with mean of existing)
|
| 185 |
+
self.model.resize_token_embeddings(self.teacher_model.config.vocab_size)
|
| 186 |
+
|
| 187 |
+
print(f"✅ Student embeddings resized to {self.teacher_model.config.vocab_size}")
|
| 188 |
+
print(f" New student vocab size: {self.model.config.vocab_size}")
|
| 189 |
+
else:
|
| 190 |
+
print(f"✅ Student and teacher vocab sizes already match: {self.model.config.vocab_size}")
|
| 191 |
+
|
| 192 |
+
print("\n" + "="*60)
|
| 193 |
+
print(f"🎯 Final Vocab Alignment Complete")
|
| 194 |
+
print(f"📊 Teacher vocab size: {self.teacher_model.config.vocab_size}")
|
| 195 |
+
print(f"📊 Student vocab size: {self.model.config.vocab_size}")
|
| 196 |
+
print("="*60)
|
| 197 |
+
|
| 198 |
+
def _parse_teacher_devices(self, teacher_device_ids: str | None) -> str | list:
|
| 199 |
+
"""
|
| 200 |
+
Parse teacher device IDs string into device specification.
|
| 201 |
+
|
| 202 |
+
Args:
|
| 203 |
+
teacher_device_ids: Device string like "0", "0,1", "cuda:2", or None
|
| 204 |
+
|
| 205 |
+
Returns:
|
| 206 |
+
Single device string (e.g., "cuda:0") or list of devices for multi-GPU
|
| 207 |
+
"""
|
| 208 |
+
if teacher_device_ids is None:
|
| 209 |
+
# Default: use cuda:0 if available
|
| 210 |
+
return "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 211 |
+
|
| 212 |
+
# Clean up the input
|
| 213 |
+
teacher_device_ids = teacher_device_ids.strip()
|
| 214 |
+
|
| 215 |
+
# Handle comma-separated multi-GPU case
|
| 216 |
+
if ',' in teacher_device_ids:
|
| 217 |
+
device_list = [f"cuda:{id.strip()}" if not id.strip().startswith('cuda:')
|
| 218 |
+
else id.strip()
|
| 219 |
+
for id in teacher_device_ids.split(',')]
|
| 220 |
+
return device_list
|
| 221 |
+
|
| 222 |
+
# Single device case
|
| 223 |
+
if not teacher_device_ids.startswith('cuda:'):
|
| 224 |
+
return f"cuda:{teacher_device_ids}"
|
| 225 |
+
return teacher_device_ids
|
| 226 |
+
|
| 227 |
+
def _load_teacher_model(self, args_obj):
    """
    Load the (frozen) teacher model onto the configured device(s).

    Handles both single-GPU and multi-GPU teacher configurations:
    a list of devices triggers HuggingFace `device_map="auto"` sharding;
    a single device string loads to CPU first and then moves the model.

    Args:
        args_obj: Config object providing `teacher_model_path`,
            `teacher_dtype`, and optionally `teacher_device_ids`.

    Returns:
        The parsed device spec (single device string, or list of devices),
        which the caller stores as `self.teacher_device`.
    """
    # Parse device IDs: "1" / "0,1" / "cuda:3"
    teacher_dev = getattr(args_obj, "teacher_device_ids", None)
    teacher_dev = self._parse_teacher_devices(teacher_dev)

    # Determine teacher dtype; unknown strings silently fall back to bfloat16.
    dtype_map = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    teacher_dtype = dtype_map.get(args_obj.teacher_dtype, torch.bfloat16)

    print(f"🖥️ Teacher target device(s): {teacher_dev}")

    # Case A: multi-GPU teacher — list of devices means sharded loading.
    if isinstance(teacher_dev, list):
        print(f"📡 Using multi-GPU teacher (HF auto parallel): {teacher_dev}")

        # Let HuggingFace accelerate handle device distribution automatically.
        # NOTE(review): device_map="auto" shards across ALL visible GPUs, not
        # only the ones listed in teacher_dev — confirm this is intended.
        self.teacher_model = AutoModelForCausalLM.from_pretrained(
            args_obj.teacher_model_path,
            torch_dtype=teacher_dtype,
            trust_remote_code=True,
            device_map="auto",  # Let accelerate decide optimal sharding
        )

    # Case B: single-GPU teacher.
    else:
        device_str = teacher_dev
        print(f"📡 Loading teacher on single GPU: {device_str}")

        # Load to CPU first, then move to the target GPU.
        self.teacher_model = AutoModelForCausalLM.from_pretrained(
            args_obj.teacher_model_path,
            torch_dtype=teacher_dtype,
            trust_remote_code=True,
        )

        # Move whole model to target GPU.
        self.teacher_model.to(device_str)

    # Freeze teacher model: eval mode plus no gradients on any parameter.
    self.teacher_model.eval()
    for param in self.teacher_model.parameters():
        param.requires_grad = False

    return teacher_dev
def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
    """
    Log metrics from rank 0 only, folding in the accumulated avg_weight.

    Flushes the pending `avg_weight` samples for the current mode into
    `logs` as their mean, then delegates to HF `Trainer.log` directly
    (deliberately bypassing TRL's `SFTTrainer.log`).
    """
    mode = "train" if self.model.training else "eval"

    # In distributed runs, non-zero ranks skip logging entirely.
    dist = torch.distributed
    if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
        return

    if self.use_entropy_weighting:
        pending = self._metrics[mode].get('avg_weight')
        if pending:
            logs["avg_weight"] = sum(pending) / len(pending)
            self._metrics[mode]['avg_weight'] = []

    # VERY IMPORTANT: call HF Trainer.log instead of TRL SFTTrainer.log.
    transformers.Trainer.log(self, logs)
def compute_entropy(self, logits: torch.Tensor, top_k: int = None) -> torch.Tensor:
    """
    Per-token Shannon entropy of the softmax distribution over `logits`.

    Supports a top-k approximation: when `top_k` is smaller than the vocab
    size, only the k largest logits are kept and renormalized before the
    entropy is taken, cutting memory from O(vocab_size) to O(k) with
    minimal impact on weight quality.

    Args:
        logits: Model logits of shape [batch_size, seq_len, vocab_size].
        top_k: Number of top tokens for the approximation. Defaults to
            `self.entropy_top_k` (64) when omitted.

    Returns:
        Per-token entropy of shape [batch_size, seq_len].
    """
    # Fall back to the instance-level setting when not explicitly given.
    if top_k is None:
        top_k = getattr(self, 'entropy_top_k', 64)

    with torch.no_grad():
        # float32 for numerical stability regardless of model dtype.
        work = logits.float()

        if top_k is not None and top_k < work.size(-1):
            # Keep only the k largest logits; softmax below renormalizes
            # within that slice (~2400x memory reduction for 151k vocab).
            work, _ = torch.topk(work, k=top_k, dim=-1)

        log_probs = torch.log_softmax(work, dim=-1)
        probs = torch.softmax(work, dim=-1)
        return -(probs * log_probs).sum(dim=-1)
def compute_entropy_topk(self, logits_topk: torch.Tensor) -> torch.Tensor:
    """
    Entropy of the softmax over already-extracted top-k logits.

    Ultra memory-efficient variant: the caller has done the top-k
    selection, so this only runs softmax over the k columns.

    Args:
        logits_topk: Top-k logits of shape [batch_size, seq_len, k].

    Returns:
        Per-token entropy of shape [batch_size, seq_len].
    """
    with torch.no_grad():
        # float32 for numerically stable softmax/log-softmax.
        scores = logits_topk.float()
        log_p = torch.log_softmax(scores, dim=-1)
        p = torch.softmax(scores, dim=-1)
        return -(p * log_p).sum(dim=-1)
def compute_entropy_weights_topk(
    self,
    student_logits_topk: torch.Tensor,
    teacher_logits_topk: torch.Tensor,
    labels: torch.Tensor,
) -> torch.Tensor:
    """
    Compute per-token weights using pre-extracted top-k logits (optimized version).

    Formula: w_j = exp(-H_t(j)) * softplus(α * (|H_s(j) - H_t(j)| - β))

    NOTE(review): the docstring previously claimed sigmoid, but the code uses
    softplus for the disagreement term — unlike `compute_entropy_weights`,
    which uses sigmoid. Confirm which variant is intended; behavior here is
    left unchanged (softplus, i.e. an unbounded disagreement factor).

    Args:
        student_logits_topk: Student top-k logits [batch_size, seq_len, k]
        teacher_logits_topk: Teacher top-k logits [batch_size, seq_len, k]
        labels: Ground truth labels [batch_size, seq_len]; positions equal
            to -100 (padding/prompt tokens) get weight 0.

    Returns:
        weights: Per-token weights [batch_size, seq_len]
    """
    # Entropies straight from the top-k slices (no full-vocab allocation).
    H_s = self.compute_entropy_topk(student_logits_topk)  # [batch_size, seq_len]
    H_t = self.compute_entropy_topk(teacher_logits_topk)  # [batch_size, seq_len]

    # Entropy disagreement between student and teacher.
    delta_H = torch.abs(H_s - H_t)  # [batch_size, seq_len]

    # Component 1: exp(-H_t) - emphasizes positions where teacher is confident.
    teacher_confidence = torch.exp(-H_t)

    # Component 2: softplus(α * (|ΔH| - β)) - smooth, unbounded disagreement term.
    disagreement_weight = torch.nn.functional.softplus(
        self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
    )

    # Combined weight.
    weights = teacher_confidence * disagreement_weight

    # Mask out positions where labels are -100 (padding or prompt tokens).
    mask = (labels != -100).float()
    weights = weights * mask

    return weights
def compute_entropy_weights(
    self,
    student_logits: torch.Tensor,
    teacher_logits: torch.Tensor,
    labels: torch.Tensor,
) -> torch.Tensor:
    """
    Per-token loss weights from teacher/student entropy disagreement.

    Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))

    Args:
        student_logits: Student model logits [batch_size, seq_len, vocab_size].
        teacher_logits: Teacher model logits [batch_size, seq_len, vocab_size].
        labels: Ground truth labels [batch_size, seq_len]; -100 marks
            padding/prompt positions.

    Returns:
        Per-token weights [batch_size, seq_len], zeroed where labels == -100.
    """
    # Per-token entropies for both models.
    student_entropy = self.compute_entropy(student_logits)  # [B, T]
    teacher_entropy = self.compute_entropy(teacher_logits)  # [B, T]

    # |ΔH|: how strongly student and teacher disagree in confidence.
    entropy_gap = torch.abs(student_entropy - teacher_entropy)

    # Factor 1: exp(-H_t) is largest where the teacher is confident.
    confidence_term = torch.exp(-teacher_entropy)

    # Factor 2: bounded sigmoid of the (offset, scaled) disagreement —
    # smooth weighting that avoids gradient explosion.
    gap_term = torch.sigmoid(
        self.entropy_weight_alpha * (entropy_gap - self.entropy_weight_beta)
    )

    # Combine and zero out padding/prompt tokens (label == -100).
    valid = (labels != -100).float()
    return confidence_term * gap_term * valid
def compute_loss(
    self,
    model: nn.Module,
    inputs: Dict[str, torch.Tensor | Any],
    return_outputs: bool = False,
    num_items_in_batch: torch.Tensor | None = None,
):
    """
    Compute weighted loss with teacher-student entropy weighting.

    Steps:
    1. Student forward pass (with gradient).
    2. Teacher forward pass under no_grad, possibly on a different device.
    3. Per-token weights from top-k entropy differences.
    4. Weighted cross-entropy, normalized by the sum of weights.
    5. Optional auxiliary loss (e.g. MoE) added on top.

    Falls back to the parent implementation when entropy weighting is
    disabled or no teacher model is loaded.
    """
    # Standard loss computation from parent class (no entropy weighting)
    if not self.use_entropy_weighting or self.teacher_model is None:
        return super().compute_loss(model, inputs, return_outputs, num_items_in_batch)

    # Get labels - SFTTrainer should have already set this up
    if "labels" not in inputs:
        raise ValueError(
            "Expected 'labels' in inputs but not found. This usually means your data collator "
            "is not properly configured. Please ensure you're using the correct collator."
        )
    labels = inputs["labels"]

    # ===== Entropy-weighted loss computation =====

    # 1. Student forward pass (with gradient); cache is useless in training.
    inputs["use_cache"] = False
    outputs = model(**inputs)
    student_logits = outputs.logits  # [batch_size, seq_len, vocab_size]

    # 2. Teacher forward pass (no gradient)
    with torch.no_grad():
        # Move inputs to the teacher device so teacher/student can run on
        # separate GPUs in parallel.
        teacher_input_ids = inputs["input_ids"]
        teacher_attention_mask = inputs.get("attention_mask", None)

        # Get teacher's primary device (first entry when teacher is sharded).
        if isinstance(self.teacher_device, list):
            primary_teacher_device = self.teacher_device[0]
        else:
            primary_teacher_device = self.teacher_device

        # Move to teacher device only when it differs from the student's.
        if str(primary_teacher_device) != str(teacher_input_ids.device):
            teacher_input_ids = teacher_input_ids.to(primary_teacher_device)
            if teacher_attention_mask is not None:
                teacher_attention_mask = teacher_attention_mask.to(primary_teacher_device)

        # Prepare teacher inputs - explicitly pass only what's needed
        teacher_outputs = self.teacher_model(
            input_ids=teacher_input_ids,
            attention_mask=teacher_attention_mask,
            use_cache=False,  # Explicitly disable cache for training
        )
        teacher_logits_full = teacher_outputs.logits  # [B, T, V] on teacher device

        # CRITICAL OPTIMIZATION: shift FIRST, then top-k.
        # This saves 1/seq_len of memory compared to top-k then shift.
        shift_teacher_logits_full = teacher_logits_full[..., :-1, :].contiguous()
        del teacher_logits_full  # Free full logits immediately

        # Now extract top-k from shifted teacher logits.
        # Reduces from [B, T-1, 151k] to [B, T-1, 64] — ~2400x reduction.
        top_k = getattr(self, 'entropy_top_k', 64)
        shift_teacher_logits_topk, _ = torch.topk(
            shift_teacher_logits_full, k=top_k, dim=-1
        )  # [batch_size, seq_len-1, top_k] on teacher device

        del shift_teacher_logits_full  # Free shifted full logits

        # Move teacher logits back to the student device for weight computation.
        if str(primary_teacher_device) != str(student_logits.device):
            shift_teacher_logits_topk = shift_teacher_logits_topk.to(student_logits.device)

    # 3. Compute entropy-based weights.
    # Shift student logits and labels (next-token prediction alignment).
    shift_student_logits = student_logits[..., :-1, :].contiguous()  # Full vocab for CE loss
    shift_labels = labels[..., 1:].contiguous()

    # OPTIMIZATION 2: extract top-k from student logits for entropy only.
    # The student still needs full logits for the CE loss below.
    with torch.no_grad():
        top_k = getattr(self, 'entropy_top_k', 64)
        # NOTE: student_indices_topk is currently unused.
        student_logits_topk, student_indices_topk = torch.topk(
            shift_student_logits, k=top_k, dim=-1
        )  # [batch_size, seq_len-1, top_k]

    # Calculate weights using top-k versions (massive memory savings).
    weights = self.compute_entropy_weights_topk(
        student_logits_topk,        # [B, T-1, k]
        shift_teacher_logits_topk,  # [B, T-1, k]
        shift_labels,
    )  # [batch_size, seq_len-1]

    # 4. Compute weighted cross-entropy loss.
    # Flatten for loss computation.
    shift_student_logits_flat = shift_student_logits.view(-1, shift_student_logits.size(-1))
    shift_labels_flat = shift_labels.view(-1)
    weights_flat = weights.view(-1)

    # Per-token cross-entropy (no reduction), so weights apply per token.
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    per_token_loss = loss_fct(shift_student_logits_flat, shift_labels_flat)

    # Apply the entropy-based weights.
    weighted_loss = per_token_loss * weights_flat

    # Build the validity mask from both labels and attention_mask.
    if "attention_mask" in inputs:
        # Shift attention mask to align with shifted labels.
        shift_attention_mask = inputs["attention_mask"][..., 1:].contiguous()
        valid_mask = (shift_labels != -100) & (shift_attention_mask == 1)
    else:
        # Fallback if no attention mask (e.g., padding-free training).
        valid_mask = (shift_labels != -100)

    valid_mask_flat = valid_mask.view(-1).float()

    # Final loss: normalize by the sum of weights (NOT by token count).
    # CRITICAL: normalizing by weight_sum ensures the relative importance
    # defined by the weights is actually respected; dividing by token count
    # would let small-weight tokens contribute equally in the denominator,
    # defeating the purpose of the weighting.
    weighted_loss_masked = weighted_loss * valid_mask_flat
    weight_sum = (weights_flat * valid_mask_flat).sum()

    # clamp avoids division by zero when every weight is masked out.
    loss = weighted_loss_masked.sum() / weight_sum.clamp(min=1e-8)

    # 5. Handle auxiliary loss if present (e.g., MoE routing loss).
    if self.aux_loss_enabled and hasattr(outputs, 'aux_loss'):
        loss = loss + outputs.aux_loss

    # Log statistics (monitoring only) — kept minimal to avoid recomputing
    # entropies, which would cost extra memory.
    if self.model.training:
        with torch.no_grad():
            # Mean weight over supervised (non -100) positions.
            weight_mean = weights[shift_labels != -100].mean().item()
            self._metrics['train']['avg_weight'].append(weight_mean)

            # Optional: enable these for entropy logging, at the cost of
            # recomputing entropy and extra memory:
            # H_s_mean = self.compute_entropy(shift_student_logits).mean().item()
            # H_t_mean = self.compute_entropy(shift_teacher_logits).mean().item()
            # self._metrics['train']['student_entropy'].append(H_s_mean)
            # self._metrics['train']['teacher_entropy'].append(H_t_mean)

    return (loss, outputs) if return_outputs else loss
|
train/test_1.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Entropy-weighted Trainer (ETrainer) for knowledge distillation with dynamic loss weighting.
|
| 17 |
+
Based on TRL's SFTTrainer with modifications for teacher-student entropy-based weighting.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
from typing import Any, Dict
|
| 23 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
+
from trl import SFTTrainer
|
| 25 |
+
from dataclasses import dataclass, field
|
| 26 |
+
from trl import SFTConfig
|
| 27 |
+
import transformers
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class ETrainerConfig(SFTConfig):
    """
    Extended SFTConfig with teacher model parameters for entropy-weighted training.

    Args:
        teacher_model_path (`str`, *optional*):
            Path to the teacher model (e.g., "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B").
            If None, no entropy weighting is applied.
        entropy_weight_alpha (`float`, defaults to 2.0):
            Alpha parameter controlling the sensitivity of entropy difference weighting.
            Higher values make the weighting more sensitive to entropy differences.
        entropy_weight_beta (`float`, defaults to 0.3):
            Beta parameter controlling the zero-point offset for entropy difference.
            Determines the threshold for significant entropy differences.
        use_entropy_weighting (`bool`, defaults to True):
            Whether to apply entropy-based loss weighting. Set to False to use standard loss.
        teacher_dtype (`str`, defaults to "bfloat16"):
            Data type for teacher model ("float16", "bfloat16", or "float32").
        entropy_top_k (`int`, defaults to 64):
            Number of top tokens to use for entropy approximation. Set to None or vocab_size
            to compute exact entropy. Smaller values save memory significantly.
        teacher_device_ids (`str`, *optional*):
            GPU device IDs for the teacher model (e.g., "0", "0,1", "cuda:2").
            If None, the teacher shares the student's device; separate GPUs
            enable parallel teacher/student computation.
    """
    teacher_model_path: str | None = field(
        default=None,
        metadata={"help": "Path to teacher model for entropy weighting"}
    )
    entropy_weight_alpha: float = field(
        default=2.0,
        metadata={"help": "Alpha parameter for entropy weighting sensitivity"}
    )
    entropy_weight_beta: float = field(
        default=0.3,
        metadata={"help": "Beta parameter for entropy weighting offset"}
    )
    use_entropy_weighting: bool = field(
        default=True,
        metadata={"help": "Whether to apply entropy-based loss weighting"}
    )
    teacher_dtype: str = field(
        default="bfloat16",
        metadata={"help": "Teacher model dtype (float16/bfloat16/float32)"}
    )
    entropy_top_k: int = field(
        default=64,
        metadata={"help": "Top-K tokens for entropy approximation (reduces memory)"}
    )
    teacher_device_ids: str | None = field(
        default=None,
        metadata={
            "help": "GPU device IDs for teacher model (e.g., '0', '0,1', 'cuda:2'). "
                    "If None, uses same device as student. Separate GPUs enable parallel computation."
        }
    )
|
| 86 |
+
class ETrainer(SFTTrainer):
|
| 87 |
+
"""
|
| 88 |
+
Entropy-weighted Trainer for knowledge distillation.
|
| 89 |
+
|
| 90 |
+
This trainer extends SFTTrainer by adding dynamic per-token loss weighting based on
|
| 91 |
+
teacher-student entropy differences. The weight formula is:
|
| 92 |
+
|
| 93 |
+
w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
|
| 94 |
+
|
| 95 |
+
Where:
|
| 96 |
+
- H_t(j): Teacher's entropy at position j (lower = more confident)
|
| 97 |
+
- H_s(j): Student's entropy at position j
|
| 98 |
+
- α: Sensitivity parameter (default 2.0)
|
| 99 |
+
- β: Offset parameter (default 0.3)
|
| 100 |
+
|
| 101 |
+
Key features:
|
| 102 |
+
- Higher weight when teacher is confident (low H_t)
|
| 103 |
+
- Higher weight when student differs significantly from teacher (large |ΔH|)
|
| 104 |
+
- Smooth weighting via sigmoid to avoid gradient explosion
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
def __init__(self, *args, **kwargs):
|
| 108 |
+
# Extract ETrainer-specific arguments before passing to parent
|
| 109 |
+
args_obj = kwargs.get('args', None)
|
| 110 |
+
|
| 111 |
+
# Initialize parent SFTTrainer
|
| 112 |
+
super().__init__(*args, **kwargs)
|
| 113 |
+
|
| 114 |
+
# Load teacher model if specified
|
| 115 |
+
self.teacher_model = None
|
| 116 |
+
self.use_entropy_weighting = False
|
| 117 |
+
|
| 118 |
+
# Initialize entropy-specific metrics
|
| 119 |
+
# Note: Only initialize metrics we actually populate to avoid ZeroDivisionError
|
| 120 |
+
if not hasattr(self, '_metrics'):
|
| 121 |
+
self._metrics = {'train': {}, 'eval': {}}
|
| 122 |
+
|
| 123 |
+
# Add entropy-specific metric keys
|
| 124 |
+
for mode in ['train', 'eval']:
|
| 125 |
+
if mode not in self._metrics:
|
| 126 |
+
self._metrics[mode] = {}
|
| 127 |
+
# Only initialize avg_weight since we're not logging entropy to save memory
|
| 128 |
+
self._metrics[mode].setdefault('avg_weight', [])
|
| 129 |
+
|
| 130 |
+
if args_obj and hasattr(args_obj, 'teacher_model_path') and args_obj.teacher_model_path:
|
| 131 |
+
self.use_entropy_weighting = args_obj.use_entropy_weighting
|
| 132 |
+
self.entropy_weight_alpha = args_obj.entropy_weight_alpha
|
| 133 |
+
self.entropy_weight_beta = args_obj.entropy_weight_beta
|
| 134 |
+
self.entropy_top_k = getattr(args_obj, 'entropy_top_k', 64)
|
| 135 |
+
|
| 136 |
+
if self.use_entropy_weighting:
|
| 137 |
+
# Check if we're in distributed training
|
| 138 |
+
is_main_process = True
|
| 139 |
+
if torch.distributed.is_available() and torch.distributed.is_initialized():
|
| 140 |
+
is_main_process = torch.distributed.get_rank() == 0
|
| 141 |
+
|
| 142 |
+
if is_main_process:
|
| 143 |
+
print(f"🎓 Loading teacher model: {args_obj.teacher_model_path}")
|
| 144 |
+
print(f"📊 Entropy weighting params: α={self.entropy_weight_alpha}, β={self.entropy_weight_beta}")
|
| 145 |
+
print(f"💾 Entropy computation: top-k={self.entropy_top_k} (memory-efficient mode)")
|
| 146 |
+
|
| 147 |
+
# Load teacher model to specified device(s)
|
| 148 |
+
self.teacher_device = self._load_teacher_model(args_obj)
|
| 149 |
+
|
| 150 |
+
if is_main_process:
|
| 151 |
+
print("✅ Teacher model loaded and frozen")
|
| 152 |
+
|
| 153 |
+
# Freeze teacher model
|
| 154 |
+
for param in self.teacher_model.parameters():
|
| 155 |
+
param.requires_grad = False
|
| 156 |
+
|
| 157 |
+
if is_main_process:
|
| 158 |
+
print("✅ Teacher model loaded and frozen")
|
| 159 |
+
|
| 160 |
+
# --- Fix teacher tokenizer/model vocab mismatch ---
|
| 161 |
+
print("\n🔍 Checking teacher tokenizer/model alignment...")
|
| 162 |
+
|
| 163 |
+
teacher_tokenizer = AutoTokenizer.from_pretrained(
|
| 164 |
+
args_obj.teacher_model_path,
|
| 165 |
+
trust_remote_code=True
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
tokenizer_vocab = len(teacher_tokenizer)
|
| 169 |
+
model_vocab = self.teacher_model.config.vocab_size
|
| 170 |
+
|
| 171 |
+
if is_main_process:
|
| 172 |
+
print(f"📌 Teacher tokenizer vocab: {tokenizer_vocab}")
|
| 173 |
+
print(f"📌 Teacher model vocab: {model_vocab}")
|
| 174 |
+
|
| 175 |
+
if tokenizer_vocab != model_vocab:
|
| 176 |
+
if is_main_process:
|
| 177 |
+
print("⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...")
|
| 178 |
+
self.teacher_model.resize_token_embeddings(tokenizer_vocab)
|
| 179 |
+
self.teacher_model.config.vocab_size = tokenizer_vocab
|
| 180 |
+
if is_main_process:
|
| 181 |
+
print(f"✅ Teacher embeddings resized to {tokenizer_vocab}")
|
| 182 |
+
else:
|
| 183 |
+
if is_main_process:
|
| 184 |
+
print("✅ Teacher tokenizer and model vocab already match")
|
| 185 |
+
|
| 186 |
+
# --- Now align student with teacher ---
|
| 187 |
+
if is_main_process:
|
| 188 |
+
print(f"\n📊 Student model vocab size: {self.model.config.vocab_size}")
|
| 189 |
+
print(f"📊 Teacher model vocab size (after alignment): {self.teacher_model.config.vocab_size}")
|
| 190 |
+
|
| 191 |
+
# Handle vocab size mismatch by resizing student embeddings to match teacher
|
| 192 |
+
if self.teacher_model.config.vocab_size != self.model.config.vocab_size:
|
| 193 |
+
if is_main_process:
|
| 194 |
+
print(f"\n⚠️ Student/Teacher vocab size mismatch detected!")
|
| 195 |
+
print(f" Teacher: {self.teacher_model.config.vocab_size}")
|
| 196 |
+
print(f" Student: {self.model.config.vocab_size}")
|
| 197 |
+
print(f"🔧 Resizing student embeddings to match teacher...")
|
| 198 |
+
|
| 199 |
+
# Resize student model embeddings (new tokens initialized with mean of existing)
|
| 200 |
+
self.model.resize_token_embeddings(self.teacher_model.config.vocab_size)
|
| 201 |
+
|
| 202 |
+
if is_main_process:
|
| 203 |
+
print(f"✅ Student embeddings resized to {self.teacher_model.config.vocab_size}")
|
| 204 |
+
print(f" New student vocab size: {self.model.config.vocab_size}")
|
| 205 |
+
else:
|
| 206 |
+
if is_main_process:
|
| 207 |
+
print(f"✅ Student and teacher vocab sizes already match: {self.model.config.vocab_size}")
|
| 208 |
+
|
| 209 |
+
if is_main_process:
|
| 210 |
+
print("\n" + "="*60)
|
| 211 |
+
print(f"🎯 Final Vocab Alignment Complete")
|
| 212 |
+
print(f"📊 Teacher vocab size: {self.teacher_model.config.vocab_size}")
|
| 213 |
+
print(f"📊 Student vocab size: {self.model.config.vocab_size}")
|
| 214 |
+
print("="*60)
|
| 215 |
+
|
| 216 |
+
def _parse_teacher_devices(self, teacher_device_ids: str | None) -> str | list:
|
| 217 |
+
"""
|
| 218 |
+
Parse teacher device IDs string into device specification.
|
| 219 |
+
|
| 220 |
+
Args:
|
| 221 |
+
teacher_device_ids: Device string like "0", "0,1", "cuda:2", or None
|
| 222 |
+
|
| 223 |
+
Returns:
|
| 224 |
+
Single device string (e.g., "cuda:0") or list of devices for multi-GPU
|
| 225 |
+
"""
|
| 226 |
+
if teacher_device_ids is None:
|
| 227 |
+
# Default: use cuda:0 if available
|
| 228 |
+
return "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 229 |
+
|
| 230 |
+
# Clean up the input
|
| 231 |
+
teacher_device_ids = teacher_device_ids.strip()
|
| 232 |
+
|
| 233 |
+
# Handle comma-separated multi-GPU case
|
| 234 |
+
if ',' in teacher_device_ids:
|
| 235 |
+
device_list = [f"cuda:{id.strip()}" if not id.strip().startswith('cuda:')
|
| 236 |
+
else id.strip()
|
| 237 |
+
for id in teacher_device_ids.split(',')]
|
| 238 |
+
return device_list
|
| 239 |
+
|
| 240 |
+
# Single device case
|
| 241 |
+
if not teacher_device_ids.startswith('cuda:'):
|
| 242 |
+
return f"cuda:{teacher_device_ids}"
|
| 243 |
+
return teacher_device_ids
|
| 244 |
+
|
| 245 |
+
def _load_teacher_model(self, args_obj):
    """
    Load the frozen teacher model onto its configured device(s).

    Handles both a single-GPU teacher (explicit .to(device)) and a
    multi-GPU teacher sharded automatically by HF accelerate
    (device_map="auto"). The teacher is put in eval mode with all
    parameters frozen. Returns the parsed device spec (str or list).
    """
    # Only rank 0 prints progress in distributed runs.
    rank_zero = True
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank_zero = torch.distributed.get_rank() == 0

    # Resolve the requested placement ("1" / "0,1" / "cuda:3").
    devices = self._parse_teacher_devices(
        getattr(args_obj, "teacher_device_ids", None)
    )

    # Map the configured dtype name onto a torch dtype (bfloat16 fallback).
    teacher_dtype = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }.get(args_obj.teacher_dtype, torch.bfloat16)

    if rank_zero:
        print(f"🖥️ Teacher target device(s): {devices}")

    if isinstance(devices, list):
        # Multi-GPU: let accelerate choose the sharding across the GPUs.
        if rank_zero:
            print(f"📡 Using multi-GPU teacher (HF auto parallel): {devices}")
        self.teacher_model = AutoModelForCausalLM.from_pretrained(
            args_obj.teacher_model_path,
            torch_dtype=teacher_dtype,
            trust_remote_code=True,
            device_map="auto",
        )
    else:
        # Single device: load first, then move the whole model over.
        if rank_zero:
            print(f"📡 Loading teacher on single GPU: {devices}")
        self.teacher_model = AutoModelForCausalLM.from_pretrained(
            args_obj.teacher_model_path,
            torch_dtype=teacher_dtype,
            trust_remote_code=True,
        )
        self.teacher_model.to(devices)

    # Freeze: the teacher is inference-only during distillation.
    self.teacher_model.eval()
    for p in self.teacher_model.parameters():
        p.requires_grad = False

    return devices
|
| 306 |
+
|
| 307 |
+
def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
    """Rank-0-only logging hook that injects the accumulated avg_weight.

    Deliberately calls transformers.Trainer.log rather than the TRL
    SFTTrainer override, so the base HF logging path is used.
    """
    mode = "train" if self.model.training else "eval"

    # Non-zero ranks never emit logs.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() != 0:
            return

    # Fold the buffered per-step weight means into one average, then reset.
    if self.use_entropy_weighting:
        if 'avg_weight' in self._metrics[mode] and len(self._metrics[mode]['avg_weight']) > 0:
            buffered = self._metrics[mode]['avg_weight']
            logs["avg_weight"] = sum(buffered) / len(buffered)
            self._metrics[mode]['avg_weight'] = []

    # VERY IMPORTANT: bypass TRL's SFTTrainer.log and use HF Trainer.log.
    transformers.Trainer.log(self, logs)
|
| 324 |
+
|
| 325 |
+
def compute_entropy(self, logits: torch.Tensor, top_k: int = None) -> torch.Tensor:
    """
    Per-token Shannon entropy of the predictive distribution.

    Args:
        logits: [batch_size, seq_len, vocab_size] raw logits.
        top_k: When set and smaller than the vocab size, the entropy is
            approximated over only the top-k logits (the softmax is
            renormalized over those k entries), shrinking memory from
            O(vocab) to O(k). Defaults to self.entropy_top_k (64).

    Returns:
        [batch_size, seq_len] tensor of entropies (computed under no_grad).
    """
    if top_k is None:
        top_k = getattr(self, 'entropy_top_k', 64)

    with torch.no_grad():
        # float32 keeps log_softmax stable for fp16/bf16 inputs.
        logits = logits.float()
        vocab = logits.size(-1)

        if top_k is not None and top_k < vocab:
            # Top-K approximation: ~vocab/k memory reduction with minimal
            # impact on the resulting weights.
            vals = torch.topk(logits, k=top_k, dim=-1).values
        else:
            # Exact entropy over the full vocabulary.
            vals = logits

        log_p = torch.log_softmax(vals, dim=-1)
        return -(vals.softmax(dim=-1) * log_p).sum(dim=-1)
|
| 365 |
+
|
| 366 |
+
def compute_entropy_topk(self, logits_topk: torch.Tensor) -> torch.Tensor:
    """
    Entropy of a distribution restricted to pre-extracted top-k logits.

    The input is assumed to already contain only the top-k logits, so no
    further selection happens here — just a softmax over the last axis.

    Args:
        logits_topk: [batch_size, seq_len, k] top-k logits.

    Returns:
        [batch_size, seq_len] per-token entropy (computed under no_grad).
    """
    with torch.no_grad():
        vals = logits_topk.float()  # float32 for numerical stability
        probs = vals.softmax(dim=-1)
        return -(probs * torch.log_softmax(vals, dim=-1)).sum(dim=-1)
|
| 386 |
+
|
| 387 |
+
def compute_entropy_weights_topk(
    self,
    student_logits_topk: torch.Tensor,
    teacher_logits_topk: torch.Tensor,
    labels: torch.Tensor,
) -> torch.Tensor:
    """
    Compute per-token weights from pre-extracted top-k logits (memory-optimized).

    Implemented formula: w_j = exp(-H_t(j)) * softplus(α * (|H_s(j) - H_t(j)| - β))

    NOTE(review): the sibling compute_entropy_weights uses sigmoid for the
    disagreement term (bounded in (0, 1)) while this variant uses softplus
    (unbounded, >= 0) — the two are NOT equivalent; confirm which gating
    function is intended before relying on weight magnitudes.

    Args:
        student_logits_topk: Student top-k logits [batch_size, seq_len, k]
        teacher_logits_topk: Teacher top-k logits [batch_size, seq_len, k]
        labels: Ground truth labels [batch_size, seq_len]; positions equal
            to -100 (padding / prompt tokens) receive weight 0.

    Returns:
        weights: Per-token weights [batch_size, seq_len]
    """
    # Entropies straight from the top-k logits (no full-vocab allocation).
    H_s = self.compute_entropy_topk(student_logits_topk)  # [batch_size, seq_len]
    H_t = self.compute_entropy_topk(teacher_logits_topk)  # [batch_size, seq_len]

    # Absolute student/teacher entropy gap.
    delta_H = torch.abs(H_s - H_t)  # [batch_size, seq_len]

    # Component 1: exp(-H_t) — emphasizes positions where the teacher is confident.
    teacher_confidence = torch.exp(-H_t)

    # Component 2: softplus(α * (|ΔH| - β)) — smooth weighting by disagreement
    # (see the NOTE above about softplus vs sigmoid).
    disagreement_weight = torch.nn.functional.softplus(
        self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
    )

    # Combined weight.
    weights = teacher_confidence * disagreement_weight

    # Zero out positions where labels are -100 (padding or prompt tokens).
    mask = (labels != -100).float()
    weights = weights * mask

    return weights
|
| 429 |
+
|
| 430 |
+
def compute_entropy_weights(
    self,
    student_logits: torch.Tensor,
    teacher_logits: torch.Tensor,
    labels: torch.Tensor,
) -> torch.Tensor:
    """
    Per-token weights from teacher-student entropy disagreement.

    Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))

    The first factor up-weights tokens where the teacher distribution is
    confident (low entropy); the second smoothly up-weights tokens where
    the two models disagree in entropy.

    Args:
        student_logits: [batch_size, seq_len, vocab_size] student logits.
        teacher_logits: [batch_size, seq_len, vocab_size] teacher logits.
        labels: [batch_size, seq_len]; positions equal to -100 (padding or
            prompt tokens) receive weight 0.

    Returns:
        [batch_size, seq_len] tensor of per-token weights.
    """
    student_H = self.compute_entropy(student_logits)  # [batch_size, seq_len]
    teacher_H = self.compute_entropy(teacher_logits)  # [batch_size, seq_len]

    # Smooth gate on the absolute entropy gap.
    gate = torch.sigmoid(
        self.entropy_weight_alpha
        * ((student_H - teacher_H).abs() - self.entropy_weight_beta)
    )

    # Teacher-confidence factor, zeroed on ignored (-100) positions.
    return torch.exp(-teacher_H) * gate * (labels != -100).float()
|
| 472 |
+
|
| 473 |
+
def compute_loss(
    self,
    model: nn.Module,
    inputs: Dict[str, torch.Tensor | Any],
    return_outputs: bool = False,
    num_items_in_batch: torch.Tensor | None = None,
):
    """
    Entropy-weighted SFT loss with a parallel teacher forward pass.

    Steps:
    1. Student forward (with grad) on a dedicated CUDA stream.
    2. Teacher forward (no grad) on a second CUDA stream, concurrently,
       keeping only the top-k logits to bound teacher-side memory.
    3. Per-token weights from teacher/student entropy disagreement
       (compute_entropy_weights_topk).
    4. Weighted cross-entropy, normalized by the sum of weights.

    Falls back to the parent class's loss when entropy weighting is
    disabled or no teacher model is loaded.

    NOTE(review): neither side stream waits on the default stream
    (no wait_stream(torch.cuda.current_stream())), so the forward passes
    may race against pending default-stream work that produced `inputs` —
    confirm inputs are already synchronized when this is called.
    """
    # Standard loss computation from parent class (no entropy weighting).
    if not self.use_entropy_weighting or self.teacher_model is None:
        return super().compute_loss(model, inputs, return_outputs, num_items_in_batch)

    # Labels must have been prepared by the collator upstream.
    if "labels" not in inputs:
        raise ValueError(
            "Expected 'labels' in inputs but not found. This usually means your data collator "
            "is not properly configured. Please ensure you're using the correct collator."
        )
    labels = inputs["labels"]

    # ===== Entropy-weighted loss computation with PARALLEL execution =====

    # Teacher may be a list of devices (multi-GPU); use the first as primary.
    if isinstance(self.teacher_device, list):
        primary_teacher_device = self.teacher_device[0]
    else:
        primary_teacher_device = self.teacher_device

    # Ship the inputs to the teacher device early (async copies).
    teacher_input_ids = inputs["input_ids"].to(primary_teacher_device, non_blocking=True)
    teacher_attention_mask = inputs.get("attention_mask", None)
    if teacher_attention_mask is not None:
        teacher_attention_mask = teacher_attention_mask.to(primary_teacher_device, non_blocking=True)

    # Student device inferred from the model's parameters.
    student_device = next(model.parameters()).device

    # One CUDA stream per model so the two forwards can overlap.
    student_stream = torch.cuda.Stream(device=student_device)
    teacher_stream = torch.cuda.Stream(device=primary_teacher_device)

    # Single-slot containers so results survive the stream context blocks.
    student_outputs_container = [None]
    teacher_logits_topk_container = [None]

    # ===== PARALLEL EXECUTION BLOCK =====

    # Containers for student top-k logits and the shifted full logits.
    student_logits_topk_container = [None]
    shift_student_logits_container = [None]

    # 1. Student forward pass (with gradient) on the student stream.
    with torch.cuda.stream(student_stream):
        inputs["use_cache"] = False
        student_outputs_container[0] = model(**inputs)

        # Extract logits and do shift + top-k inside the same stream so the
        # full-vocab tensor's lifetime stays as short as possible.
        student_logits_full = student_outputs_container[0].logits  # [B, T, V]

        # Shifted full-vocab logits are still needed for cross-entropy.
        shift_student_logits_container[0] = student_logits_full[..., :-1, :].contiguous()

        # Top-k of the shifted logits for entropy (memory efficient; the
        # weights are computed under no_grad, gradients flow only via CE).
        with torch.no_grad():
            top_k = getattr(self, 'entropy_top_k', 64)
            student_logits_topk_container[0], _ = torch.topk(
                shift_student_logits_container[0], k=top_k, dim=-1
            )  # [B, T-1, k]

    # 2. Teacher forward pass (no gradient) on the teacher stream — overlaps
    #    with the student forward above.
    with torch.cuda.stream(teacher_stream):
        with torch.no_grad():
            # Pass only what the teacher needs; caching is off for training.
            teacher_outputs = self.teacher_model(
                input_ids=teacher_input_ids,
                attention_mask=teacher_attention_mask,
                use_cache=False,  # Explicitly disable cache for training
            )
            teacher_logits_full = teacher_outputs.logits  # [batch_size, seq_len, vocab_size] on teacher device

            # MEMORY: take top-k BEFORE shifting to avoid materializing a
            # full [B, T, V] contiguous copy.
            top_k = getattr(self, 'entropy_top_k', 64)

            # [B, T, V] -> [B, T, k]
            teacher_logits_topk_full, _ = torch.topk(teacher_logits_full, k=top_k, dim=-1)

            # Free the full-vocab logits immediately (multi-GB savings).
            del teacher_logits_full
            del teacher_outputs  # Also free the outputs object

            # Shift the small top-k tensor [B, T, k] -> [B, T-1, k].
            shift_teacher_logits_topk = teacher_logits_topk_full[..., :-1, :].contiguous()

            # Free the unshifted top-k.
            del teacher_logits_topk_full

            # Async copy to the student device.
            # NOTE(review): the source is `del`eted right after a
            # non_blocking copy — ordering holds within this stream, but
            # confirm no cross-stream consumer can observe freed memory.
            teacher_logits_topk_container[0] = shift_teacher_logits_topk.to(
                student_device, non_blocking=True
            )

            # Explicitly delete to free teacher device memory immediately.
            del shift_teacher_logits_topk

    # ===== SYNCHRONIZATION POINT =====
    # Block the host until both forwards (and the async copy) are done.
    student_stream.synchronize()
    teacher_stream.synchronize()

    # Unpack results from the containers.
    outputs = student_outputs_container[0]
    shift_teacher_logits_topk = teacher_logits_topk_container[0]
    shift_student_logits = shift_student_logits_container[0]
    student_logits_topk = student_logits_topk_container[0]

    # 3. Compute entropy-based weights.
    # Labels shift right by one to align with next-token predictions.
    shift_labels = labels[..., 1:].contiguous()

    # Weights from the top-k logits (approximate entropies, tiny memory).
    weights = self.compute_entropy_weights_topk(
        student_logits_topk,        # [B, T-1, k]
        shift_teacher_logits_topk,  # [B, T-1, k]
        shift_labels,
    )  # [batch_size, seq_len-1]

    # 4. Weighted cross-entropy.
    # Flatten to [B*(T-1), V] / [B*(T-1)] for the loss function.
    shift_student_logits_flat = shift_student_logits.view(-1, shift_student_logits.size(-1))
    shift_labels_flat = shift_labels.view(-1)
    weights_flat = weights.view(-1)

    # Per-token cross-entropy with no reduction (so weights can be applied).
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    per_token_loss = loss_fct(shift_student_logits_flat, shift_labels_flat)

    # Apply the entropy weights.
    weighted_loss = per_token_loss * weights_flat

    # Valid-token mask from labels, intersected with the attention mask
    # when one exists.
    if "attention_mask" in inputs:
        # Shift the attention mask to align with the shifted labels.
        shift_attention_mask = inputs["attention_mask"][..., 1:].contiguous()
        valid_mask = (shift_labels != -100) & (shift_attention_mask == 1)
    else:
        # Fallback if no attention mask (e.g., padding-free training).
        valid_mask = (shift_labels != -100)

    valid_mask_flat = valid_mask.view(-1).float()

    # Normalize by the SUM OF WEIGHTS, not the token count: otherwise
    # low-weight tokens would still inflate the denominator and the
    # relative importance encoded by the weights would be diluted.
    weighted_loss_masked = weighted_loss * valid_mask_flat
    weight_sum = (weights_flat * valid_mask_flat).sum()

    # clamp guards against division by ~0 when every weight is masked out.
    loss = weighted_loss_masked.sum() / weight_sum.clamp(min=1e-8)

    # 5. Add auxiliary loss if present (e.g., MoE load-balancing loss).
    if self.aux_loss_enabled and hasattr(outputs, 'aux_loss'):
        loss = loss + outputs.aux_loss

    # Log the mean weight over valid tokens (entropies themselves are not
    # re-logged to avoid recomputation).
    if self.model.training:
        with torch.no_grad():
            weight_mean = weights[shift_labels != -100].mean().item()
            self._metrics['train']['avg_weight'].append(weight_mean)

            # Optional: Can enable these if you need entropy logging
            # but note this will recompute entropy and use extra memory
            # H_s_mean = self.compute_entropy(shift_student_logits).mean().item()
            # H_t_mean = self.compute_entropy(shift_teacher_logits).mean().item()
            # self._metrics['train']['student_entropy'].append(H_s_mean)
            # self._metrics['train']['teacher_entropy'].append(H_t_mean)

    return (loss, outputs) if return_outputs else loss
|
train/test_on_math.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
vLLM evaluation with correct parameter name: max_model_length
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import shutil
|
| 7 |
+
from datetime import timedelta
|
| 8 |
+
|
| 9 |
+
from lighteval.logging.evaluation_tracker import EvaluationTracker
|
| 10 |
+
from lighteval.models.vllm.vllm_model import VLLMModelConfig
|
| 11 |
+
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
|
| 12 |
+
from lighteval.utils.imports import is_package_available
|
| 13 |
+
|
| 14 |
+
# Build an Accelerator (with an extended 3000 s process-group init timeout)
# when accelerate is installed; otherwise run in single-process mode.
if is_package_available("accelerate"):
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(
        kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]
    )
else:
    accelerator = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def setup_environment():
    """Point CC/CXX at the conda cross-compilers (falling back to the system
    gcc/g++ when absent) and restrict the run to GPU 0."""
    for env_var, candidates in (
        ("CC", ("x86_64-conda-linux-gnu-cc", "gcc")),
        ("CXX", ("x86_64-conda-linux-gnu-c++", "g++")),
    ):
        for tool in candidates:
            found = shutil.which(tool)
            if found:
                os.environ[env_var] = found
                break
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print("✓ Environment configured")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def main():
    """Run a lighteval MATH-500 evaluation of Qwen2.5-Math-7B-Instruct via vLLM."""
    setup_environment()
    print("--- Starting vLLM evaluation ---")

    # lighteval's VLLMModelConfig takes `max_model_length` (not vLLM's own
    # `max_model_len`). 40960 comfortably covers the longest observed
    # sample (33658 tokens per the header comment).
    MAX_MODEL_LENGTH = 40960  # 40k tokens to handle 33658

    pipeline = Pipeline(
        tasks="lighteval|math_500|0|1",
        pipeline_parameters=PipelineParameters(
            # Accelerate launcher only when an Accelerator was created above.
            launcher_type=ParallelismManager.ACCELERATE if accelerator else ParallelismManager.NONE,
            # max_samples=10,  # For testing
        ),
        evaluation_tracker=EvaluationTracker(
            output_dir="./results",
            save_details=True,
        ),
        model_config=VLLMModelConfig(
            model_name="Qwen/Qwen2.5-Math-7B-Instruct",
            dtype="bfloat16",
            tensor_parallel_size=1,
            gpu_memory_utilization=0.92,
            trust_remote_code=True,
            max_model_length=MAX_MODEL_LENGTH,  # correct lighteval parameter name
        ),
    )

    print(f"📊 Config: max_model_length={MAX_MODEL_LENGTH}")

    # Evaluate, persist results, then print the summary table.
    pipeline.evaluate()
    pipeline.save_and_push_results()
    pipeline.show_results()

    print("✅ Done!")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
train/test_ood_python_lora_rope.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import lighteval
|
| 2 |
+
from lighteval.logging.evaluation_tracker import EvaluationTracker
|
| 3 |
+
from lighteval.models.vllm.vllm_model import VLLMModelConfig
|
| 4 |
+
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
|
| 5 |
+
from lighteval.utils.imports import is_package_available
|
| 6 |
+
from peft import PeftModel
|
| 7 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
|
| 8 |
+
import os
|
| 9 |
+
import torch
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
# Build an Accelerator (3000 s process-group init timeout) when accelerate
# is installed; otherwise fall back to single-process execution.
if is_package_available("accelerate"):
    from datetime import timedelta
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
else:
    accelerator = None
|
| 18 |
+
|
| 19 |
+
def merge_lora_if_needed():
    """Merge the LoRA adapter into the base model (if not already done) and
    ensure the merged checkpoint carries a linear RoPE-scaling config
    (4096 -> 8192 context).

    Returns:
        Path to the merged checkpoint directory.
    """
    merged_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora-Merged"

    if os.path.exists(os.path.join(merged_path, "config.json")):
        print(f"Merged model already exists at {merged_path}")

        # Verify RoPE scaling in the existing merged model.
        config_path = os.path.join(merged_path, "config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)
        if 'rope_scaling' in config:
            print(f"✓ Existing merged model has RoPE scaling: {config['rope_scaling']}")
            print(f"✓ Max position embeddings: {config.get('max_position_embeddings', 'N/A')}")
        else:
            # A stale merge without RoPE scaling is unusable for long-context
            # eval: wipe it and rebuild. The recursive call goes down the
            # merge path below because config.json no longer exists (only
            # one level of recursion is possible).
            print("⚠ Warning: Existing merged model does NOT have RoPE scaling config!")
            print(" Deleting and re-creating with RoPE scaling...")
            import shutil
            shutil.rmtree(merged_path)
            return merge_lora_if_needed()  # Recursive call to re-create

        return merged_path

    print("="*100)
    print("Merged model not found. Starting merge process...")
    print("="*100)

    lora_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora"

    # Step 1: Load base model.
    print("\n[1/5] Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-Math-1.5B",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto"
    )

    # Step 2: Attach the LoRA adapter on top of the base weights.
    print("\n[2/5] Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, lora_path)

    # Step 3: Fold the adapter deltas into the base weights.
    print("\n[3/5] Merging LoRA weights with base model...")
    merged_model = model.merge_and_unload()

    # Step 4: Save the merged model (safetensors format).
    print(f"\n[4/5] Saving merged model to {merged_path}...")
    os.makedirs(merged_path, exist_ok=True)
    merged_model.save_pretrained(merged_path, safe_serialization=True)

    # Step 5: Patch the RoPE-scaling configuration into the saved config.
    print("\n[5/5] Adding RoPE scaling configuration...")
    merged_config_path = os.path.join(merged_path, "config.json")
    with open(merged_config_path, 'r') as f:
        merged_config = json.load(f)

    # ========== Linear RoPE scaling: 4096 -> 8192, factor = 2.0 ==========
    merged_config['rope_scaling'] = {
        "type": "linear",
        "factor": 2.0
    }

    print(f"✓ Added RoPE scaling: {merged_config['rope_scaling']}")

    # Update max_position_embeddings from 4096 to 8192 to match the factor.
    original_max_pos = merged_config.get('max_position_embeddings', 4096)
    scaling_factor = merged_config['rope_scaling']['factor']
    new_max_pos = int(original_max_pos * scaling_factor)
    merged_config['max_position_embeddings'] = new_max_pos
    print(f"✓ Updated max_position_embeddings: {original_max_pos} -> {new_max_pos}")

    # Persist the patched config.
    with open(merged_config_path, 'w') as f:
        json.dump(merged_config, f, indent=2, ensure_ascii=False)

    # Save the tokenizer alongside the merged weights so the directory is
    # directly loadable.
    print("Saving tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B", trust_remote_code=True)
    tokenizer.save_pretrained(merged_path)

    # Release the intermediate models and return cached GPU memory.
    del base_model
    del model
    del merged_model
    torch.cuda.empty_cache()

    print("\n" + "="*100)
    print("✓ Merge completed successfully!")
    print(f"✓ Merged model saved to: {merged_path}")
    print(f"✓ RoPE scaling config: {merged_config['rope_scaling']}")
    print(f"✓ Max position embeddings: {merged_config['max_position_embeddings']}")
    print("="*100 + "\n")

    return merged_path
|
| 114 |
+
|
| 115 |
+
def main():
    """Evaluate the LoRA-merged, RoPE-extended Qwen2.5-Math-1.5B on MATH-500."""
    # GPU pinning and the vLLM length override must precede any CUDA work.
    os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

    print("Checking for merged model...")
    merged_model_path = merge_lora_if_needed()

    # Report every GPU visible to this process.
    num_gpus = torch.cuda.device_count()
    print(f"\n{'='*100}")
    print(f"Detected {num_gpus} GPU(s)")
    if num_gpus > 0:
        for i in range(num_gpus):
            print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"{'='*100}\n")

    # Pull the context-length settings out of the merged checkpoint config.
    with open(os.path.join(merged_model_path, "config.json"), 'r') as f:
        cfg = json.load(f)
    max_position_embeddings = cfg.get('max_position_embeddings', 4096)
    rope_scaling = cfg.get('rope_scaling', None)

    print(f"Model max_position_embeddings: {max_position_embeddings}")
    print(f"Model RoPE scaling config: {rope_scaling}")

    # 8192 is the RoPE-extended length used during training.
    max_model_length = 8192
    print(f"Using max_model_length: {max_model_length}\n")

    print("Setting up evaluation pipeline...")

    tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=False,
    )

    params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        custom_tasks_directory=None,
        max_samples=500
    )

    vllm_cfg = VLLMModelConfig(
        model_name=merged_model_path,
        dtype="bfloat16",
        max_model_length=max_model_length,  # the 8192 extended context
        trust_remote_code=True,
        tensor_parallel_size=num_gpus,
    )

    task = "lighteval|math_500|0"

    print(f"Using {num_gpus} GPU(s) with tensor parallelism")
    print(f"Task: {task}")
    print(f"Max model length: {max_model_length}\n")

    print("Creating pipeline...")
    pipeline = Pipeline(
        tasks=task,
        pipeline_parameters=params,
        evaluation_tracker=tracker,
        model_config=vllm_cfg,
    )

    # Cap the per-sample generation length on every loaded doc.
    print("Configuring generation parameters...")
    for task_obj in pipeline.tasks_dict.values():
        for doc in task_obj._docs:
            doc.generation_size = 2048

    print("\nStarting evaluation...")
    print("="*100)
    pipeline.evaluate()

    print("\nSaving results...")
    pipeline.save_and_push_results()

    print("\nShowing results...")
    pipeline.show_results()

    print("\n" + "="*100)
    print("✓ Evaluation completed!")
    print("="*100)
|
| 201 |
+
|
| 202 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
train/train_qwen_verl_46k.py
ADDED
|
File without changes
|
train/train_qwen_verl_46k.sh
ADDED
|
File without changes
|
train/wandb/run-20251113_165350-n56lk6p0/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-13T16:53:50.599141389+08:00","level":"INFO","msg":"stream: starting","core version":"0.22.3"}
|
| 2 |
+
{"time":"2025-11-13T16:53:51.898899791+08:00","level":"INFO","msg":"stream: created new stream","id":"n56lk6p0"}
|
| 3 |
+
{"time":"2025-11-13T16:53:51.899200939+08:00","level":"INFO","msg":"handler: started","stream_id":"n56lk6p0"}
|
| 4 |
+
{"time":"2025-11-13T16:53:51.900823773+08:00","level":"INFO","msg":"stream: started","id":"n56lk6p0"}
|
| 5 |
+
{"time":"2025-11-13T16:53:51.90140837+08:00","level":"INFO","msg":"writer: started","stream_id":"n56lk6p0"}
|
| 6 |
+
{"time":"2025-11-13T16:53:51.901489184+08:00","level":"INFO","msg":"sender: started","stream_id":"n56lk6p0"}
|
train/wandb/run-20251113_171624-kgxigylp/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
|
| 3 |
+
"python": "CPython 3.10.19",
|
| 4 |
+
"startedAt": "2025-11-13T09:16:24.807116Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--model_name",
|
| 7 |
+
"Qwen/Qwen2.5-Math-1.5B",
|
| 8 |
+
"--dataset_name",
|
| 9 |
+
"openr1",
|
| 10 |
+
"--output_dir",
|
| 11 |
+
"./model_sft_save/Qwen2.5-Math-1.5B-Full-solution",
|
| 12 |
+
"--batch_size",
|
| 13 |
+
"2",
|
| 14 |
+
"--grad_accum",
|
| 15 |
+
"4",
|
| 16 |
+
"--learning_rate",
|
| 17 |
+
"5e-6",
|
| 18 |
+
"--epochs",
|
| 19 |
+
"1",
|
| 20 |
+
"--use_rope_scaling",
|
| 21 |
+
"--use_deepspeed",
|
| 22 |
+
"--deepspeed_config",
|
| 23 |
+
"deepspeed/dp_stage2.json",
|
| 24 |
+
"--use_wandb",
|
| 25 |
+
"--wandb_project",
|
| 26 |
+
"qwen-math-sft",
|
| 27 |
+
"--wandb_run_name",
|
| 28 |
+
"qwen2.5-1.5b-46k-fft-solution"
|
| 29 |
+
],
|
| 30 |
+
"program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k.py",
|
| 31 |
+
"codePath": "train_qwen_46k.py",
|
| 32 |
+
"codePathLocal": "train_qwen_46k.py",
|
| 33 |
+
"email": "yaning1001@gmail.com",
|
| 34 |
+
"root": "/public/home/lshi/yoAI/projects/Online_CL/train",
|
| 35 |
+
"host": "gpu-h100-07",
|
| 36 |
+
"executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
|
| 37 |
+
"cpu_count": 96,
|
| 38 |
+
"cpu_count_logical": 96,
|
| 39 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 40 |
+
"gpu_count": 8,
|
| 41 |
+
"disk": {
|
| 42 |
+
"/": {
|
| 43 |
+
"total": "469407801344",
|
| 44 |
+
"used": "289841229824"
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"memory": {
|
| 48 |
+
"total": "2164142350336"
|
| 49 |
+
},
|
| 50 |
+
"gpu_nvidia": [
|
| 51 |
+
{
|
| 52 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 53 |
+
"memoryTotal": "85520809984",
|
| 54 |
+
"cudaCores": 16896,
|
| 55 |
+
"architecture": "Hopper",
|
| 56 |
+
"uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 60 |
+
"memoryTotal": "85520809984",
|
| 61 |
+
"cudaCores": 16896,
|
| 62 |
+
"architecture": "Hopper",
|
| 63 |
+
"uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 67 |
+
"memoryTotal": "85520809984",
|
| 68 |
+
"cudaCores": 16896,
|
| 69 |
+
"architecture": "Hopper",
|
| 70 |
+
"uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 74 |
+
"memoryTotal": "85520809984",
|
| 75 |
+
"cudaCores": 16896,
|
| 76 |
+
"architecture": "Hopper",
|
| 77 |
+
"uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 81 |
+
"memoryTotal": "85520809984",
|
| 82 |
+
"cudaCores": 16896,
|
| 83 |
+
"architecture": "Hopper",
|
| 84 |
+
"uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 88 |
+
"memoryTotal": "85520809984",
|
| 89 |
+
"cudaCores": 16896,
|
| 90 |
+
"architecture": "Hopper",
|
| 91 |
+
"uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 95 |
+
"memoryTotal": "85520809984",
|
| 96 |
+
"cudaCores": 16896,
|
| 97 |
+
"architecture": "Hopper",
|
| 98 |
+
"uuid": "GPU-23628f74-fede-6431-ae15-2764fce29130"
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 102 |
+
"memoryTotal": "85520809984",
|
| 103 |
+
"cudaCores": 16896,
|
| 104 |
+
"architecture": "Hopper",
|
| 105 |
+
"uuid": "GPU-d18d570f-dd0f-0ff6-3401-561c9e799136"
|
| 106 |
+
}
|
| 107 |
+
],
|
| 108 |
+
"cudaVersion": "12.4",
|
| 109 |
+
"slurm": {
|
| 110 |
+
"home": "/opt/gridview/slurm",
|
| 111 |
+
"pmix_direct_conn": "true",
|
| 112 |
+
"pmix_direct_conn_early": "false",
|
| 113 |
+
"pmix_direct_conn_ucx": "false",
|
| 114 |
+
"pmix_timeout": "3000"
|
| 115 |
+
},
|
| 116 |
+
"writerId": "tp3ukl31hffsi4h6hmw62zh3mmvd4ck1"
|
| 117 |
+
}
|
train/wandb/run-20251114_040305-jb702f8e/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":16984},"_runtime":16984}
|
train/wandb/run-20251114_083110-mocjk23v/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":1157},"_runtime":1157}
|
train/wandb/run-20251114_085634-l1whc2fu/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-14T08:56:34.99307817+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmps5yzjy9e/port-683755.txt","pid":683755,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-11-14T08:56:34.993470483+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-683755-684391-740002659/socket","Net":"unix"}}
|
| 3 |
+
{"time":"2025-11-14T08:56:34.993683243+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":683755}
|
| 4 |
+
{"time":"2025-11-14T08:56:35.174882292+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-11-14T08:56:35.188631623+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"l1whc2fu","id":"1(@)"}
|
| 6 |
+
{"time":"2025-11-14T08:56:36.5336818+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"l1whc2fu","id":"1(@)"}
|
| 7 |
+
{"time":"2025-11-14T08:57:28.613569258+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-11-14T08:57:28.613821906+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2025-11-14T08:57:28.613882852+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 10 |
+
{"time":"2025-11-14T08:57:28.613904195+08:00","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2025-11-14T08:57:28.613981514+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-683755-684391-740002659/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-11-14T08:57:31.434572086+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-11-14T08:57:31.43461066+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-11-14T08:57:31.43463779+08:00","level":"INFO","msg":"server is closed"}
|
train/wandb/run-20251114_093516-syhj5u87/files/config.yaml
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.22.3
|
| 4 |
+
e:
|
| 5 |
+
8wbmn2ipfnapymj9hqf701ll90rovymt:
|
| 6 |
+
args:
|
| 7 |
+
- --model_name
|
| 8 |
+
- Qwen/Qwen2.5-1.5B
|
| 9 |
+
- --dataset_path
|
| 10 |
+
- ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 11 |
+
- --output_dir
|
| 12 |
+
- ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 13 |
+
- --batch_size
|
| 14 |
+
- "2"
|
| 15 |
+
- --grad_accum
|
| 16 |
+
- "4"
|
| 17 |
+
- --learning_rate
|
| 18 |
+
- "5e-6"
|
| 19 |
+
- --epochs
|
| 20 |
+
- "1"
|
| 21 |
+
- --use_entropy_weighting
|
| 22 |
+
- --teacher_model_path
|
| 23 |
+
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 24 |
+
- --entropy_weight_alpha
|
| 25 |
+
- "2.0"
|
| 26 |
+
- --entropy_weight_beta
|
| 27 |
+
- "0.3"
|
| 28 |
+
- --teacher_dtype
|
| 29 |
+
- bfloat16
|
| 30 |
+
- --use_deepspeed
|
| 31 |
+
- --deepspeed_config
|
| 32 |
+
- deepspeed/dp_stage2.json
|
| 33 |
+
- --use_wandb
|
| 34 |
+
- --wandb_project
|
| 35 |
+
- qwen-math-entropy-sft
|
| 36 |
+
- --wandb_run_name
|
| 37 |
+
- qwen2.5-1.5b-46k-entropy-solution
|
| 38 |
+
codePath: train_qwen_46k_weight.py
|
| 39 |
+
codePathLocal: train_qwen_46k_weight.py
|
| 40 |
+
cpu_count: 96
|
| 41 |
+
cpu_count_logical: 96
|
| 42 |
+
cudaVersion: "12.4"
|
| 43 |
+
disk:
|
| 44 |
+
/:
|
| 45 |
+
total: "469407801344"
|
| 46 |
+
used: "288164167680"
|
| 47 |
+
email: yaning1001@gmail.com
|
| 48 |
+
executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
|
| 49 |
+
gpu: NVIDIA H100 80GB HBM3
|
| 50 |
+
gpu_count: 6
|
| 51 |
+
gpu_nvidia:
|
| 52 |
+
- architecture: Hopper
|
| 53 |
+
cudaCores: 16896
|
| 54 |
+
memoryTotal: "85520809984"
|
| 55 |
+
name: NVIDIA H100 80GB HBM3
|
| 56 |
+
uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
|
| 57 |
+
- architecture: Hopper
|
| 58 |
+
cudaCores: 16896
|
| 59 |
+
memoryTotal: "85520809984"
|
| 60 |
+
name: NVIDIA H100 80GB HBM3
|
| 61 |
+
uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
|
| 62 |
+
- architecture: Hopper
|
| 63 |
+
cudaCores: 16896
|
| 64 |
+
memoryTotal: "85520809984"
|
| 65 |
+
name: NVIDIA H100 80GB HBM3
|
| 66 |
+
uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
|
| 67 |
+
- architecture: Hopper
|
| 68 |
+
cudaCores: 16896
|
| 69 |
+
memoryTotal: "85520809984"
|
| 70 |
+
name: NVIDIA H100 80GB HBM3
|
| 71 |
+
uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
|
| 72 |
+
- architecture: Hopper
|
| 73 |
+
cudaCores: 16896
|
| 74 |
+
memoryTotal: "85520809984"
|
| 75 |
+
name: NVIDIA H100 80GB HBM3
|
| 76 |
+
uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
|
| 77 |
+
- architecture: Hopper
|
| 78 |
+
cudaCores: 16896
|
| 79 |
+
memoryTotal: "85520809984"
|
| 80 |
+
name: NVIDIA H100 80GB HBM3
|
| 81 |
+
uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
|
| 82 |
+
host: gpu-h100-07
|
| 83 |
+
memory:
|
| 84 |
+
total: "2164142350336"
|
| 85 |
+
os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
|
| 86 |
+
program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
|
| 87 |
+
python: CPython 3.10.19
|
| 88 |
+
root: /public/home/lshi/yoAI/projects/Online_CL/train
|
| 89 |
+
slurm:
|
| 90 |
+
cluster_name: cluster_admin1
|
| 91 |
+
conf: /opt/gridview/slurm/etc/slurm.conf
|
| 92 |
+
cpu_bind: quiet,mask_cpu:0x000000000000000000000001
|
| 93 |
+
cpu_bind_list: "0x000000000000000000000001"
|
| 94 |
+
cpu_bind_type: 'mask_cpu:'
|
| 95 |
+
cpu_bind_verbose: quiet
|
| 96 |
+
cpus_on_node: "1"
|
| 97 |
+
distribution: cyclic
|
| 98 |
+
gtids: "0"
|
| 99 |
+
home: /opt/gridview/slurm
|
| 100 |
+
job_account: seu_qli
|
| 101 |
+
job_cpus_per_node: "1"
|
| 102 |
+
job_gid: "2026"
|
| 103 |
+
job_id: "8428"
|
| 104 |
+
job_name: bash
|
| 105 |
+
job_nodelist: gpu-h100-07
|
| 106 |
+
job_num_nodes: "1"
|
| 107 |
+
job_partition: H100
|
| 108 |
+
job_qos: normal
|
| 109 |
+
job_uid: "2019"
|
| 110 |
+
job_user: lshi
|
| 111 |
+
jobid: "8428"
|
| 112 |
+
launch_node_ipaddr: 172.16.254.194
|
| 113 |
+
localid: "0"
|
| 114 |
+
nnodes: "1"
|
| 115 |
+
nodeid: "0"
|
| 116 |
+
nodelist: gpu-h100-07
|
| 117 |
+
nprocs: "1"
|
| 118 |
+
ntasks: "1"
|
| 119 |
+
pmix_direct_conn: "true"
|
| 120 |
+
pmix_direct_conn_early: "false"
|
| 121 |
+
pmix_direct_conn_ucx: "false"
|
| 122 |
+
pmix_timeout: "3000"
|
| 123 |
+
prio_process: "0"
|
| 124 |
+
procid: "0"
|
| 125 |
+
pty_port: "43139"
|
| 126 |
+
pty_win_col: "146"
|
| 127 |
+
pty_win_row: "21"
|
| 128 |
+
srun_comm_host: 172.16.254.194
|
| 129 |
+
srun_comm_port: "34989"
|
| 130 |
+
step_gpus: 0,1,2,3,4,5
|
| 131 |
+
step_id: "0"
|
| 132 |
+
step_launcher_port: "34989"
|
| 133 |
+
step_nodelist: gpu-h100-07
|
| 134 |
+
step_num_nodes: "1"
|
| 135 |
+
step_num_tasks: "1"
|
| 136 |
+
step_tasks_per_node: "1"
|
| 137 |
+
stepid: "0"
|
| 138 |
+
submit_dir: /public/home/lshi/yoAI/projects
|
| 139 |
+
submit_host: admin1
|
| 140 |
+
task_pid: "649671"
|
| 141 |
+
tasks_per_node: "1"
|
| 142 |
+
topology_addr: gpu-h100-07
|
| 143 |
+
topology_addr_pattern: node
|
| 144 |
+
umask: "0022"
|
| 145 |
+
working_cluster: cluster_admin1:172.16.254.194:6817:9216:101
|
| 146 |
+
startedAt: "2025-11-14T01:35:16.982120Z"
|
| 147 |
+
writerId: 8wbmn2ipfnapymj9hqf701ll90rovymt
|
| 148 |
+
m: []
|
| 149 |
+
python_version: 3.10.19
|
| 150 |
+
t:
|
| 151 |
+
"1":
|
| 152 |
+
- 1
|
| 153 |
+
- 11
|
| 154 |
+
- 41
|
| 155 |
+
- 49
|
| 156 |
+
- 51
|
| 157 |
+
- 71
|
| 158 |
+
- 84
|
| 159 |
+
- 98
|
| 160 |
+
- 105
|
| 161 |
+
"2":
|
| 162 |
+
- 1
|
| 163 |
+
- 11
|
| 164 |
+
- 41
|
| 165 |
+
- 49
|
| 166 |
+
- 51
|
| 167 |
+
- 71
|
| 168 |
+
- 84
|
| 169 |
+
- 98
|
| 170 |
+
- 105
|
| 171 |
+
"3":
|
| 172 |
+
- 13
|
| 173 |
+
"4": 3.10.19
|
| 174 |
+
"5": 0.22.3
|
| 175 |
+
"6": 4.57.1
|
| 176 |
+
"10":
|
| 177 |
+
- 20
|
| 178 |
+
"12": 0.22.3
|
| 179 |
+
"13": linux-x86_64
|
train/wandb/run-20251114_093516-syhj5u87/logs/debug-core.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-14T09:35:17.03441142+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpw3beozyg/port-732696.txt","pid":732696,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-11-14T09:35:17.034758287+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-732696-733205-3319755749/socket","Net":"unix"}}
|
| 3 |
+
{"time":"2025-11-14T09:35:17.034951673+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":732696}
|
| 4 |
+
{"time":"2025-11-14T09:35:17.211537499+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-11-14T09:35:17.220353582+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"syhj5u87","id":"1(@)"}
|
| 6 |
+
{"time":"2025-11-14T09:35:18.22989479+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"syhj5u87","id":"1(@)"}
|
| 7 |
+
{"time":"2025-11-14T09:36:02.014981461+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-11-14T09:36:02.015245162+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2025-11-14T09:36:02.015313878+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 10 |
+
{"time":"2025-11-14T09:36:02.015339332+08:00","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2025-11-14T09:36:02.015416344+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-732696-733205-3319755749/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-11-14T09:36:06.194928818+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
train/wandb/run-20251114_103643-cvm4116u/files/config.yaml
ADDED
|
@@ -0,0 +1,698 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: Qwen/Qwen2.5-1.5B
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.22.3
|
| 6 |
+
e:
|
| 7 |
+
7c7ug24k5x5ruwahnf33q520l30fph0h:
|
| 8 |
+
args:
|
| 9 |
+
- --model_name
|
| 10 |
+
- Qwen/Qwen2.5-1.5B
|
| 11 |
+
- --dataset_path
|
| 12 |
+
- ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 13 |
+
- --output_dir
|
| 14 |
+
- ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 15 |
+
- --batch_size
|
| 16 |
+
- "2"
|
| 17 |
+
- --grad_accum
|
| 18 |
+
- "4"
|
| 19 |
+
- --learning_rate
|
| 20 |
+
- "5e-6"
|
| 21 |
+
- --epochs
|
| 22 |
+
- "1"
|
| 23 |
+
- --use_entropy_weighting
|
| 24 |
+
- --teacher_model_path
|
| 25 |
+
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 26 |
+
- --entropy_weight_alpha
|
| 27 |
+
- "2.0"
|
| 28 |
+
- --entropy_weight_beta
|
| 29 |
+
- "0.3"
|
| 30 |
+
- --teacher_dtype
|
| 31 |
+
- bfloat16
|
| 32 |
+
- --use_deepspeed
|
| 33 |
+
- --deepspeed_config
|
| 34 |
+
- deepspeed/dp_stage2.json
|
| 35 |
+
- --use_wandb
|
| 36 |
+
- --wandb_project
|
| 37 |
+
- qwen-math-entropy-sft
|
| 38 |
+
- --wandb_run_name
|
| 39 |
+
- qwen2.5-1.5b-46k-entropy-solution
|
| 40 |
+
codePath: train_qwen_46k_weight.py
|
| 41 |
+
codePathLocal: train_qwen_46k_weight.py
|
| 42 |
+
cpu_count: 96
|
| 43 |
+
cpu_count_logical: 96
|
| 44 |
+
cudaVersion: "12.4"
|
| 45 |
+
disk:
|
| 46 |
+
/:
|
| 47 |
+
total: "469407801344"
|
| 48 |
+
used: "288221097984"
|
| 49 |
+
email: yaning1001@gmail.com
|
| 50 |
+
executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
|
| 51 |
+
gpu: NVIDIA H100 80GB HBM3
|
| 52 |
+
gpu_count: 6
|
| 53 |
+
gpu_nvidia:
|
| 54 |
+
- architecture: Hopper
|
| 55 |
+
cudaCores: 16896
|
| 56 |
+
memoryTotal: "85520809984"
|
| 57 |
+
name: NVIDIA H100 80GB HBM3
|
| 58 |
+
uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
|
| 59 |
+
- architecture: Hopper
|
| 60 |
+
cudaCores: 16896
|
| 61 |
+
memoryTotal: "85520809984"
|
| 62 |
+
name: NVIDIA H100 80GB HBM3
|
| 63 |
+
uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
|
| 64 |
+
- architecture: Hopper
|
| 65 |
+
cudaCores: 16896
|
| 66 |
+
memoryTotal: "85520809984"
|
| 67 |
+
name: NVIDIA H100 80GB HBM3
|
| 68 |
+
uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
|
| 69 |
+
- architecture: Hopper
|
| 70 |
+
cudaCores: 16896
|
| 71 |
+
memoryTotal: "85520809984"
|
| 72 |
+
name: NVIDIA H100 80GB HBM3
|
| 73 |
+
uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
|
| 74 |
+
- architecture: Hopper
|
| 75 |
+
cudaCores: 16896
|
| 76 |
+
memoryTotal: "85520809984"
|
| 77 |
+
name: NVIDIA H100 80GB HBM3
|
| 78 |
+
uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
|
| 79 |
+
- architecture: Hopper
|
| 80 |
+
cudaCores: 16896
|
| 81 |
+
memoryTotal: "85520809984"
|
| 82 |
+
name: NVIDIA H100 80GB HBM3
|
| 83 |
+
uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
|
| 84 |
+
host: gpu-h100-07
|
| 85 |
+
memory:
|
| 86 |
+
total: "2164142350336"
|
| 87 |
+
os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
|
| 88 |
+
program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
|
| 89 |
+
python: CPython 3.10.19
|
| 90 |
+
root: /public/home/lshi/yoAI/projects/Online_CL/train
|
| 91 |
+
slurm:
|
| 92 |
+
cluster_name: cluster_admin1
|
| 93 |
+
conf: /opt/gridview/slurm/etc/slurm.conf
|
| 94 |
+
cpu_bind: quiet,mask_cpu:0x000000000000000000000001
|
| 95 |
+
cpu_bind_list: "0x000000000000000000000001"
|
| 96 |
+
cpu_bind_type: 'mask_cpu:'
|
| 97 |
+
cpu_bind_verbose: quiet
|
| 98 |
+
cpus_on_node: "1"
|
| 99 |
+
distribution: cyclic
|
| 100 |
+
gtids: "0"
|
| 101 |
+
home: /opt/gridview/slurm
|
| 102 |
+
job_account: seu_qli
|
| 103 |
+
job_cpus_per_node: "1"
|
| 104 |
+
job_gid: "2026"
|
| 105 |
+
job_id: "8428"
|
| 106 |
+
job_name: bash
|
| 107 |
+
job_nodelist: gpu-h100-07
|
| 108 |
+
job_num_nodes: "1"
|
| 109 |
+
job_partition: H100
|
| 110 |
+
job_qos: normal
|
| 111 |
+
job_uid: "2019"
|
| 112 |
+
job_user: lshi
|
| 113 |
+
jobid: "8428"
|
| 114 |
+
launch_node_ipaddr: 172.16.254.194
|
| 115 |
+
localid: "0"
|
| 116 |
+
nnodes: "1"
|
| 117 |
+
nodeid: "0"
|
| 118 |
+
nodelist: gpu-h100-07
|
| 119 |
+
nprocs: "1"
|
| 120 |
+
ntasks: "1"
|
| 121 |
+
pmix_direct_conn: "true"
|
| 122 |
+
pmix_direct_conn_early: "false"
|
| 123 |
+
pmix_direct_conn_ucx: "false"
|
| 124 |
+
pmix_timeout: "3000"
|
| 125 |
+
prio_process: "0"
|
| 126 |
+
procid: "0"
|
| 127 |
+
pty_port: "43139"
|
| 128 |
+
pty_win_col: "146"
|
| 129 |
+
pty_win_row: "21"
|
| 130 |
+
srun_comm_host: 172.16.254.194
|
| 131 |
+
srun_comm_port: "34989"
|
| 132 |
+
step_gpus: 0,1,2,3,4,5
|
| 133 |
+
step_id: "0"
|
| 134 |
+
step_launcher_port: "34989"
|
| 135 |
+
step_nodelist: gpu-h100-07
|
| 136 |
+
step_num_nodes: "1"
|
| 137 |
+
step_num_tasks: "1"
|
| 138 |
+
step_tasks_per_node: "1"
|
| 139 |
+
stepid: "0"
|
| 140 |
+
submit_dir: /public/home/lshi/yoAI/projects
|
| 141 |
+
submit_host: admin1
|
| 142 |
+
task_pid: "649671"
|
| 143 |
+
tasks_per_node: "1"
|
| 144 |
+
topology_addr: gpu-h100-07
|
| 145 |
+
topology_addr_pattern: node
|
| 146 |
+
umask: "0022"
|
| 147 |
+
working_cluster: cluster_admin1:172.16.254.194:6817:9216:101
|
| 148 |
+
startedAt: "2025-11-14T02:36:43.740743Z"
|
| 149 |
+
writerId: 7c7ug24k5x5ruwahnf33q520l30fph0h
|
| 150 |
+
m:
|
| 151 |
+
- "1": train/global_step
|
| 152 |
+
"6":
|
| 153 |
+
- 3
|
| 154 |
+
"7": []
|
| 155 |
+
- "2": '*'
|
| 156 |
+
"5": 1
|
| 157 |
+
"6":
|
| 158 |
+
- 1
|
| 159 |
+
"7": []
|
| 160 |
+
python_version: 3.10.19
|
| 161 |
+
t:
|
| 162 |
+
"1":
|
| 163 |
+
- 1
|
| 164 |
+
- 11
|
| 165 |
+
- 41
|
| 166 |
+
- 49
|
| 167 |
+
- 51
|
| 168 |
+
- 71
|
| 169 |
+
- 84
|
| 170 |
+
- 98
|
| 171 |
+
- 105
|
| 172 |
+
"2":
|
| 173 |
+
- 1
|
| 174 |
+
- 11
|
| 175 |
+
- 41
|
| 176 |
+
- 49
|
| 177 |
+
- 51
|
| 178 |
+
- 71
|
| 179 |
+
- 84
|
| 180 |
+
- 98
|
| 181 |
+
- 105
|
| 182 |
+
"3":
|
| 183 |
+
- 7
|
| 184 |
+
- 13
|
| 185 |
+
- 19
|
| 186 |
+
- 66
|
| 187 |
+
"4": 3.10.19
|
| 188 |
+
"5": 0.22.3
|
| 189 |
+
"6": 4.57.1
|
| 190 |
+
"9":
|
| 191 |
+
"1": transformers_trainer
|
| 192 |
+
"10":
|
| 193 |
+
- 20
|
| 194 |
+
"12": 0.22.3
|
| 195 |
+
"13": linux-x86_64
|
| 196 |
+
accelerator_config:
|
| 197 |
+
value:
|
| 198 |
+
dispatch_batches: null
|
| 199 |
+
even_batches: true
|
| 200 |
+
gradient_accumulation_kwargs: null
|
| 201 |
+
non_blocking: false
|
| 202 |
+
split_batches: false
|
| 203 |
+
use_seedable_sampler: true
|
| 204 |
+
activation_offloading:
|
| 205 |
+
value: false
|
| 206 |
+
adafactor:
|
| 207 |
+
value: false
|
| 208 |
+
adam_beta1:
|
| 209 |
+
value: 0.9
|
| 210 |
+
adam_beta2:
|
| 211 |
+
value: 0.999
|
| 212 |
+
adam_epsilon:
|
| 213 |
+
value: 1e-08
|
| 214 |
+
add_cross_attention:
|
| 215 |
+
value: false
|
| 216 |
+
architectures:
|
| 217 |
+
value:
|
| 218 |
+
- Qwen2ForCausalLM
|
| 219 |
+
assistant_only_loss:
|
| 220 |
+
value: false
|
| 221 |
+
attention_dropout:
|
| 222 |
+
value: 0
|
| 223 |
+
auto_find_batch_size:
|
| 224 |
+
value: false
|
| 225 |
+
average_tokens_across_devices:
|
| 226 |
+
value: true
|
| 227 |
+
bad_words_ids:
|
| 228 |
+
value: null
|
| 229 |
+
batch_eval_metrics:
|
| 230 |
+
value: false
|
| 231 |
+
begin_suppress_tokens:
|
| 232 |
+
value: null
|
| 233 |
+
bf16:
|
| 234 |
+
value: true
|
| 235 |
+
bf16_full_eval:
|
| 236 |
+
value: false
|
| 237 |
+
bos_token_id:
|
| 238 |
+
value: null
|
| 239 |
+
chat_template_path:
|
| 240 |
+
value: null
|
| 241 |
+
chunk_size_feed_forward:
|
| 242 |
+
value: 0
|
| 243 |
+
completion_only_loss:
|
| 244 |
+
value: null
|
| 245 |
+
cross_attention_hidden_size:
|
| 246 |
+
value: null
|
| 247 |
+
data_seed:
|
| 248 |
+
value: null
|
| 249 |
+
dataloader_drop_last:
|
| 250 |
+
value: false
|
| 251 |
+
dataloader_num_workers:
|
| 252 |
+
value: 0
|
| 253 |
+
dataloader_persistent_workers:
|
| 254 |
+
value: false
|
| 255 |
+
dataloader_pin_memory:
|
| 256 |
+
value: true
|
| 257 |
+
dataloader_prefetch_factor:
|
| 258 |
+
value: null
|
| 259 |
+
dataset_kwargs:
|
| 260 |
+
value: null
|
| 261 |
+
dataset_num_proc:
|
| 262 |
+
value: null
|
| 263 |
+
dataset_text_field:
|
| 264 |
+
value: null
|
| 265 |
+
ddp_backend:
|
| 266 |
+
value: null
|
| 267 |
+
ddp_broadcast_buffers:
|
| 268 |
+
value: null
|
| 269 |
+
ddp_bucket_cap_mb:
|
| 270 |
+
value: null
|
| 271 |
+
ddp_find_unused_parameters:
|
| 272 |
+
value: null
|
| 273 |
+
ddp_timeout:
|
| 274 |
+
value: 1800
|
| 275 |
+
debug:
|
| 276 |
+
value: []
|
| 277 |
+
decoder_start_token_id:
|
| 278 |
+
value: null
|
| 279 |
+
deepspeed:
|
| 280 |
+
value: deepspeed/dp_stage2.json
|
| 281 |
+
disable_tqdm:
|
| 282 |
+
value: false
|
| 283 |
+
diversity_penalty:
|
| 284 |
+
value: 0
|
| 285 |
+
do_eval:
|
| 286 |
+
value: true
|
| 287 |
+
do_predict:
|
| 288 |
+
value: false
|
| 289 |
+
do_sample:
|
| 290 |
+
value: false
|
| 291 |
+
do_train:
|
| 292 |
+
value: false
|
| 293 |
+
dtype:
|
| 294 |
+
value: bfloat16
|
| 295 |
+
early_stopping:
|
| 296 |
+
value: false
|
| 297 |
+
encoder_no_repeat_ngram_size:
|
| 298 |
+
value: 0
|
| 299 |
+
entropy_top_k:
|
| 300 |
+
value: 64
|
| 301 |
+
entropy_weight_alpha:
|
| 302 |
+
value: 2
|
| 303 |
+
entropy_weight_beta:
|
| 304 |
+
value: 0.3
|
| 305 |
+
eos_token:
|
| 306 |
+
value: <EOS_TOKEN>
|
| 307 |
+
eos_token_id:
|
| 308 |
+
value: 151643
|
| 309 |
+
eval_accumulation_steps:
|
| 310 |
+
value: null
|
| 311 |
+
eval_delay:
|
| 312 |
+
value: 0
|
| 313 |
+
eval_do_concat_batches:
|
| 314 |
+
value: true
|
| 315 |
+
eval_on_start:
|
| 316 |
+
value: false
|
| 317 |
+
eval_packing:
|
| 318 |
+
value: null
|
| 319 |
+
eval_steps:
|
| 320 |
+
value: 25
|
| 321 |
+
eval_strategy:
|
| 322 |
+
value: steps
|
| 323 |
+
eval_use_gather_object:
|
| 324 |
+
value: false
|
| 325 |
+
exponential_decay_length_penalty:
|
| 326 |
+
value: null
|
| 327 |
+
finetuning_task:
|
| 328 |
+
value: null
|
| 329 |
+
forced_bos_token_id:
|
| 330 |
+
value: null
|
| 331 |
+
forced_eos_token_id:
|
| 332 |
+
value: null
|
| 333 |
+
fp16:
|
| 334 |
+
value: false
|
| 335 |
+
fp16_backend:
|
| 336 |
+
value: auto
|
| 337 |
+
fp16_full_eval:
|
| 338 |
+
value: false
|
| 339 |
+
fp16_opt_level:
|
| 340 |
+
value: O1
|
| 341 |
+
fsdp:
|
| 342 |
+
value: []
|
| 343 |
+
fsdp_config:
|
| 344 |
+
value:
|
| 345 |
+
min_num_params: 0
|
| 346 |
+
xla: false
|
| 347 |
+
xla_fsdp_grad_ckpt: false
|
| 348 |
+
xla_fsdp_v2: false
|
| 349 |
+
fsdp_min_num_params:
|
| 350 |
+
value: 0
|
| 351 |
+
fsdp_transformer_layer_cls_to_wrap:
|
| 352 |
+
value: null
|
| 353 |
+
full_determinism:
|
| 354 |
+
value: false
|
| 355 |
+
gradient_accumulation_steps:
|
| 356 |
+
value: 4
|
| 357 |
+
gradient_checkpointing:
|
| 358 |
+
value: true
|
| 359 |
+
gradient_checkpointing_kwargs:
|
| 360 |
+
value:
|
| 361 |
+
use_reentrant: false
|
| 362 |
+
greater_is_better:
|
| 363 |
+
value: false
|
| 364 |
+
group_by_length:
|
| 365 |
+
value: false
|
| 366 |
+
half_precision_backend:
|
| 367 |
+
value: auto
|
| 368 |
+
hidden_act:
|
| 369 |
+
value: silu
|
| 370 |
+
hidden_size:
|
| 371 |
+
value: 1536
|
| 372 |
+
hub_always_push:
|
| 373 |
+
value: false
|
| 374 |
+
hub_model_id:
|
| 375 |
+
value: null
|
| 376 |
+
hub_private_repo:
|
| 377 |
+
value: null
|
| 378 |
+
hub_revision:
|
| 379 |
+
value: null
|
| 380 |
+
hub_strategy:
|
| 381 |
+
value: every_save
|
| 382 |
+
hub_token:
|
| 383 |
+
value: <HUB_TOKEN>
|
| 384 |
+
id2label:
|
| 385 |
+
value:
|
| 386 |
+
"0": LABEL_0
|
| 387 |
+
"1": LABEL_1
|
| 388 |
+
ignore_data_skip:
|
| 389 |
+
value: false
|
| 390 |
+
include_for_metrics:
|
| 391 |
+
value: []
|
| 392 |
+
include_inputs_for_metrics:
|
| 393 |
+
value: false
|
| 394 |
+
include_num_input_tokens_seen:
|
| 395 |
+
value: "no"
|
| 396 |
+
include_tokens_per_second:
|
| 397 |
+
value: false
|
| 398 |
+
initializer_range:
|
| 399 |
+
value: 0.02
|
| 400 |
+
intermediate_size:
|
| 401 |
+
value: 8960
|
| 402 |
+
is_decoder:
|
| 403 |
+
value: false
|
| 404 |
+
is_encoder_decoder:
|
| 405 |
+
value: false
|
| 406 |
+
jit_mode_eval:
|
| 407 |
+
value: false
|
| 408 |
+
label_names:
|
| 409 |
+
value: null
|
| 410 |
+
label_smoothing_factor:
|
| 411 |
+
value: 0
|
| 412 |
+
label2id:
|
| 413 |
+
value:
|
| 414 |
+
LABEL_0: 0
|
| 415 |
+
LABEL_1: 1
|
| 416 |
+
layer_types:
|
| 417 |
+
value:
|
| 418 |
+
- full_attention
|
| 419 |
+
- full_attention
|
| 420 |
+
- full_attention
|
| 421 |
+
- full_attention
|
| 422 |
+
- full_attention
|
| 423 |
+
- full_attention
|
| 424 |
+
- full_attention
|
| 425 |
+
- full_attention
|
| 426 |
+
- full_attention
|
| 427 |
+
- full_attention
|
| 428 |
+
- full_attention
|
| 429 |
+
- full_attention
|
| 430 |
+
- full_attention
|
| 431 |
+
- full_attention
|
| 432 |
+
- full_attention
|
| 433 |
+
- full_attention
|
| 434 |
+
- full_attention
|
| 435 |
+
- full_attention
|
| 436 |
+
- full_attention
|
| 437 |
+
- full_attention
|
| 438 |
+
- full_attention
|
| 439 |
+
- full_attention
|
| 440 |
+
- full_attention
|
| 441 |
+
- full_attention
|
| 442 |
+
- full_attention
|
| 443 |
+
- full_attention
|
| 444 |
+
- full_attention
|
| 445 |
+
- full_attention
|
| 446 |
+
learning_rate:
|
| 447 |
+
value: 5e-06
|
| 448 |
+
length_column_name:
|
| 449 |
+
value: length
|
| 450 |
+
length_penalty:
|
| 451 |
+
value: 1
|
| 452 |
+
liger_kernel_config:
|
| 453 |
+
value: null
|
| 454 |
+
load_best_model_at_end:
|
| 455 |
+
value: true
|
| 456 |
+
local_rank:
|
| 457 |
+
value: 0
|
| 458 |
+
log_level:
|
| 459 |
+
value: passive
|
| 460 |
+
log_level_replica:
|
| 461 |
+
value: warning
|
| 462 |
+
log_on_each_node:
|
| 463 |
+
value: true
|
| 464 |
+
logging_dir:
|
| 465 |
+
value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_10-36-48_gpu-h100-07
|
| 466 |
+
logging_first_step:
|
| 467 |
+
value: false
|
| 468 |
+
logging_nan_inf_filter:
|
| 469 |
+
value: true
|
| 470 |
+
logging_steps:
|
| 471 |
+
value: 5
|
| 472 |
+
logging_strategy:
|
| 473 |
+
value: steps
|
| 474 |
+
loss_type:
|
| 475 |
+
value: nll
|
| 476 |
+
lr_scheduler_type:
|
| 477 |
+
value: cosine
|
| 478 |
+
max_grad_norm:
|
| 479 |
+
value: 1
|
| 480 |
+
max_length:
|
| 481 |
+
value: 8192
|
| 482 |
+
max_position_embeddings:
|
| 483 |
+
value: 131072
|
| 484 |
+
max_steps:
|
| 485 |
+
value: -1
|
| 486 |
+
max_window_layers:
|
| 487 |
+
value: 28
|
| 488 |
+
metric_for_best_model:
|
| 489 |
+
value: eval_loss
|
| 490 |
+
min_length:
|
| 491 |
+
value: 0
|
| 492 |
+
model/num_parameters:
|
| 493 |
+
value: 1543298048
|
| 494 |
+
model_init_kwargs:
|
| 495 |
+
value:
|
| 496 |
+
attn_implementation: sdpa
|
| 497 |
+
rope_scaling: null
|
| 498 |
+
torch_dtype: torch.bfloat16
|
| 499 |
+
trust_remote_code: true
|
| 500 |
+
model_type:
|
| 501 |
+
value: qwen2
|
| 502 |
+
mp_parameters:
|
| 503 |
+
value: ""
|
| 504 |
+
neftune_noise_alpha:
|
| 505 |
+
value: null
|
| 506 |
+
no_cuda:
|
| 507 |
+
value: false
|
| 508 |
+
no_repeat_ngram_size:
|
| 509 |
+
value: 0
|
| 510 |
+
num_attention_heads:
|
| 511 |
+
value: 12
|
| 512 |
+
num_beam_groups:
|
| 513 |
+
value: 1
|
| 514 |
+
num_beams:
|
| 515 |
+
value: 1
|
| 516 |
+
num_hidden_layers:
|
| 517 |
+
value: 28
|
| 518 |
+
num_key_value_heads:
|
| 519 |
+
value: 2
|
| 520 |
+
num_return_sequences:
|
| 521 |
+
value: 1
|
| 522 |
+
num_train_epochs:
|
| 523 |
+
value: 1
|
| 524 |
+
optim:
|
| 525 |
+
value: adamw_torch
|
| 526 |
+
optim_args:
|
| 527 |
+
value: null
|
| 528 |
+
optim_target_modules:
|
| 529 |
+
value: null
|
| 530 |
+
output_attentions:
|
| 531 |
+
value: false
|
| 532 |
+
output_dir:
|
| 533 |
+
value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 534 |
+
output_hidden_states:
|
| 535 |
+
value: false
|
| 536 |
+
output_scores:
|
| 537 |
+
value: false
|
| 538 |
+
overwrite_output_dir:
|
| 539 |
+
value: false
|
| 540 |
+
packing:
|
| 541 |
+
value: false
|
| 542 |
+
packing_strategy:
|
| 543 |
+
value: bfd
|
| 544 |
+
pad_to_multiple_of:
|
| 545 |
+
value: null
|
| 546 |
+
pad_token:
|
| 547 |
+
value: <PAD_TOKEN>
|
| 548 |
+
pad_token_id:
|
| 549 |
+
value: 151643
|
| 550 |
+
padding_free:
|
| 551 |
+
value: false
|
| 552 |
+
parallelism_config:
|
| 553 |
+
value: null
|
| 554 |
+
past_index:
|
| 555 |
+
value: -1
|
| 556 |
+
per_device_eval_batch_size:
|
| 557 |
+
value: 2
|
| 558 |
+
per_device_train_batch_size:
|
| 559 |
+
value: 2
|
| 560 |
+
per_gpu_eval_batch_size:
|
| 561 |
+
value: null
|
| 562 |
+
per_gpu_train_batch_size:
|
| 563 |
+
value: null
|
| 564 |
+
prediction_loss_only:
|
| 565 |
+
value: false
|
| 566 |
+
prefix:
|
| 567 |
+
value: null
|
| 568 |
+
problem_type:
|
| 569 |
+
value: null
|
| 570 |
+
project:
|
| 571 |
+
value: huggingface
|
| 572 |
+
push_to_hub:
|
| 573 |
+
value: false
|
| 574 |
+
push_to_hub_model_id:
|
| 575 |
+
value: null
|
| 576 |
+
push_to_hub_organization:
|
| 577 |
+
value: null
|
| 578 |
+
push_to_hub_token:
|
| 579 |
+
value: <PUSH_TO_HUB_TOKEN>
|
| 580 |
+
ray_scope:
|
| 581 |
+
value: last
|
| 582 |
+
remove_invalid_values:
|
| 583 |
+
value: false
|
| 584 |
+
remove_unused_columns:
|
| 585 |
+
value: true
|
| 586 |
+
repetition_penalty:
|
| 587 |
+
value: 1
|
| 588 |
+
report_to:
|
| 589 |
+
value:
|
| 590 |
+
- wandb
|
| 591 |
+
restore_callback_states_from_checkpoint:
|
| 592 |
+
value: false
|
| 593 |
+
resume_from_checkpoint:
|
| 594 |
+
value: null
|
| 595 |
+
return_dict:
|
| 596 |
+
value: true
|
| 597 |
+
return_dict_in_generate:
|
| 598 |
+
value: false
|
| 599 |
+
rms_norm_eps:
|
| 600 |
+
value: 1e-06
|
| 601 |
+
rope_scaling:
|
| 602 |
+
value: null
|
| 603 |
+
rope_theta:
|
| 604 |
+
value: 1e+06
|
| 605 |
+
run_name:
|
| 606 |
+
value: qwen2.5-1.5b-46k-entropy-solution
|
| 607 |
+
save_on_each_node:
|
| 608 |
+
value: false
|
| 609 |
+
save_only_model:
|
| 610 |
+
value: false
|
| 611 |
+
save_safetensors:
|
| 612 |
+
value: true
|
| 613 |
+
save_steps:
|
| 614 |
+
value: 50
|
| 615 |
+
save_strategy:
|
| 616 |
+
value: steps
|
| 617 |
+
save_total_limit:
|
| 618 |
+
value: 2
|
| 619 |
+
seed:
|
| 620 |
+
value: 42
|
| 621 |
+
sep_token_id:
|
| 622 |
+
value: null
|
| 623 |
+
skip_memory_metrics:
|
| 624 |
+
value: true
|
| 625 |
+
sliding_window:
|
| 626 |
+
value: null
|
| 627 |
+
suppress_tokens:
|
| 628 |
+
value: null
|
| 629 |
+
task_specific_params:
|
| 630 |
+
value: null
|
| 631 |
+
teacher_dtype:
|
| 632 |
+
value: bfloat16
|
| 633 |
+
teacher_model_path:
|
| 634 |
+
value: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 635 |
+
temperature:
|
| 636 |
+
value: 1
|
| 637 |
+
tf_legacy_loss:
|
| 638 |
+
value: false
|
| 639 |
+
tf32:
|
| 640 |
+
value: null
|
| 641 |
+
tie_encoder_decoder:
|
| 642 |
+
value: false
|
| 643 |
+
tie_word_embeddings:
|
| 644 |
+
value: true
|
| 645 |
+
tokenizer_class:
|
| 646 |
+
value: null
|
| 647 |
+
top_k:
|
| 648 |
+
value: 50
|
| 649 |
+
top_p:
|
| 650 |
+
value: 1
|
| 651 |
+
torch_compile:
|
| 652 |
+
value: false
|
| 653 |
+
torch_compile_backend:
|
| 654 |
+
value: null
|
| 655 |
+
torch_compile_mode:
|
| 656 |
+
value: null
|
| 657 |
+
torch_empty_cache_steps:
|
| 658 |
+
value: null
|
| 659 |
+
torchdynamo:
|
| 660 |
+
value: null
|
| 661 |
+
torchscript:
|
| 662 |
+
value: false
|
| 663 |
+
tpu_metrics_debug:
|
| 664 |
+
value: false
|
| 665 |
+
tpu_num_cores:
|
| 666 |
+
value: null
|
| 667 |
+
trackio_space_id:
|
| 668 |
+
value: trackio
|
| 669 |
+
transformers_version:
|
| 670 |
+
value: 4.57.1
|
| 671 |
+
typical_p:
|
| 672 |
+
value: 1
|
| 673 |
+
use_bfloat16:
|
| 674 |
+
value: false
|
| 675 |
+
use_cache:
|
| 676 |
+
value: true
|
| 677 |
+
use_cpu:
|
| 678 |
+
value: false
|
| 679 |
+
use_entropy_weighting:
|
| 680 |
+
value: true
|
| 681 |
+
use_legacy_prediction_loop:
|
| 682 |
+
value: false
|
| 683 |
+
use_liger_kernel:
|
| 684 |
+
value: false
|
| 685 |
+
use_mps_device:
|
| 686 |
+
value: false
|
| 687 |
+
use_mrope:
|
| 688 |
+
value: false
|
| 689 |
+
use_sliding_window:
|
| 690 |
+
value: false
|
| 691 |
+
vocab_size:
|
| 692 |
+
value: 151665
|
| 693 |
+
warmup_ratio:
|
| 694 |
+
value: 0.03
|
| 695 |
+
warmup_steps:
|
| 696 |
+
value: 0
|
| 697 |
+
weight_decay:
|
| 698 |
+
value: 0.01
|
train/wandb/run-20251114_103643-cvm4116u/files/output.log
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
📝 Loading tokenizer...
|
| 2 |
+
📦 Loading local dataset from: ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 3 |
+
📊 Train: 45334 | Eval: 458
|
| 4 |
+
🔄 Formatting dataset...
|
| 5 |
+
|
| 6 |
+
🔍 MASKED PART (prompt/question):
|
| 7 |
+
<|im_start|>system
|
| 8 |
+
Think step by step and solve the problem.<|im_end|>
|
| 9 |
+
<|im_start|>user
|
| 10 |
+
## Problem Statement
|
| 11 |
+
|
| 12 |
+
Calculate the definite integral:
|
| 13 |
+
|
| 14 |
+
$$
|
| 15 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6-\operatorname{tg} x) \sin 2 x}
|
| 16 |
+
$$<|im_end|>
|
| 17 |
+
<|im_start|>assistant
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
✅ TRAINED PART (solution):
|
| 21 |
+
<think>
|
| 22 |
+
Okay, so I need to compute the definite integral from π/4 to arccos(1/√26) of dx divided by (6 - tan x) times sin 2x. Hmm, let me start by recalling some integration techniques. The integral has tan x and sin 2x in the denominator. Maybe I can simplify the expression first or use substitution.
|
| 23 |
+
|
| 24 |
+
First, I know that sin 2x is equal to 2 sin x cos x. So, maybe rewriting sin 2x as 2 sin x cos x could help. Let me try that:
|
| 25 |
+
|
| 26 |
+
∫ [1 / ( (6 - tan x) * 2 sin x cos x ) ] dx
|
| 27 |
+
|
| 28 |
+
Simplifying the denominator, that becomes:
|
| 29 |
+
|
| 30 |
+
1 / [2 sin x cos x (6 - tan x)] dx
|
| 31 |
+
|
| 32 |
+
But tan x is sin x / cos x, so substituting that in:
|
| 33 |
+
|
| 34 |
+
Denominator: 2 sin x cos x (6 - sin x / cos x )
|
| 35 |
+
|
| 36 |
+
Let me combine the terms in the parenthesis:
|
| 37 |
+
|
| 38 |
+
6 - sin x / cos x = (6 cos x - sin x)/cos x
|
| 39 |
+
|
| 40 |
+
Therefore, the denominator becomes:
|
| 41 |
+
|
| 42 |
+
2 sin x cos x * (6 cos x - sin x)/cos x = 2 sin x (6 cos x - sin x)
|
| 43 |
+
|
| 44 |
+
So now, the integral simplifies to:
|
| 45 |
+
|
| 46 |
+
∫ [1 / (2 sin x (6 cos x - sin x)) ] dx
|
| 47 |
+
|
| 48 |
+
So the integral is now 1/(2 sin x (6 cos x - sin x)) dx. Maybe this is easier to integrate. Let me factor out the 1/2:
|
| 49 |
+
|
| 50 |
+
(1/2) �� [1 / (sin x (6 cos x - sin x)) ] dx
|
| 51 |
+
|
| 52 |
+
Hmm, this seems a bit complicated. Maybe I can use substitution here. Let me think about substitution. Let's let u = 6 cos x - sin x. Then, du/dx = -6 sin x - cos x. Hmm, not sure if that helps. Let's check:
|
| 53 |
+
|
| 54 |
+
If u = 6 cos x - sin x, then du = (-6 sin x - cos x) dx. Hmm, the integral has 1/(sin x u). So, maybe if I can express the integral in terms of du and u. But du has terms -6 sin x - cos x, which isn't directly present in the integral. Maybe this isn't the right substitution.
|
| 55 |
+
|
| 56 |
+
Alternatively, maybe split the fraction into partial fractions. Let me see. The denominator is sin x (6 cos x - sin x). Let me denote t = cos x or t = sin x. Maybe substitution t = sin x or t = cos x.
|
| 57 |
+
|
| 58 |
+
Alternatively, use substitution t = tan x. Let's try that. Let t = tan x. Then, dt/dx = sec²x = 1 + tan²x. So, dx = dt / (1 + t²). Also, sin 2x = 2 tan x / (1 + tan²x) = 2t / (1 + t²). Let me rewrite the integral in terms of t.
|
| 59 |
+
|
| 60 |
+
Original integral:
|
| 61 |
+
|
| 62 |
+
∫ [1 / ( (6 - t) * (2t / (1 + t²)) ) ] * [dt / (1 + t²)]
|
| 63 |
+
|
| 64 |
+
Wait, let's check. If t = tan x, then when x goes from π/4 to arccos(1/√26), t will go from tan(π/4) = 1 to tan(arccos(1/√26)). Let me compute tan(arccos(1/√26)). Let �� = arccos(1/√26). So cos �� = 1/√26, so sin �� = sqrt(1 - 1/26) = sqrt(25/26) = 5/√26. Therefore, tan �� = sin �� / cos �� = 5. So the upper limit is 5. So substitution t = tan x changes the limits from 1 to 5.
|
| 65 |
+
|
| 66 |
+
Now, let's rewrite the integral. The integrand is 1 / [ (6 - tan x) sin 2x ] dx.
|
| 67 |
+
|
| 68 |
+
Expressing in terms of t:
|
| 69 |
+
|
| 70 |
+
1 / [ (6 - t) * (2t / (1 + t²)) ] * (dt / (1 + t²))
|
| 71 |
+
|
| 72 |
+
Let me compute that step by step. First, sin 2x is 2t / (1 + t²). So, the denominator becomes (6 - t) * 2t / (1 + t²). Then, dx is dt / (1 + t²). So, multiplying all together:
|
| 73 |
+
|
| 74 |
+
Integral becomes �� [ (1 + t²) / (2t (6 - t)) ] * [ 1 / (1 + t²) ] dt from t=1 to t=5.
|
| 75 |
+
|
| 76 |
+
Simplify: The (1 + t²) cancels out. So, we have �� [1 / (2t (6 - t)) ] dt from 1 to 5.
|
| 77 |
+
|
| 78 |
+
So, the integral simplifies to (1/2) �� [1 / (t (6 - t)) ] dt from 1 to 5.
|
| 79 |
+
|
| 80 |
+
That's a much simpler integral. Now, let's compute �� [1 / (t (6 - t)) ] dt. We can use partial fractions here.
|
| 81 |
+
|
| 82 |
+
Express 1 / [ t (6 - t) ] as A/t + B/(6 - t). Let's find A and B.
|
| 83 |
+
|
| 84 |
+
1 = A(6 - t) + B t
|
| 85 |
+
|
| 86 |
+
Let t = 0: 1 = 6A => A = 1/6
|
| 87 |
+
|
| 88 |
+
Let t = 6: 1 = 6B => B = 1/6
|
| 89 |
+
|
| 90 |
+
Therefore, 1 / [ t (6 - t) ] = (1/6)/t + (1/6)/(6 - t)
|
| 91 |
+
|
| 92 |
+
Therefore, the integral becomes:
|
| 93 |
+
|
| 94 |
+
(1/2) �� [ (1/6)/t + (1/6)/(6 - t) ] dt from 1 to 5
|
| 95 |
+
|
| 96 |
+
Factor out 1/6:
|
| 97 |
+
|
| 98 |
+
(1/2)(1/6) �� [1/t + 1/(6 - t) ] dt = (1/12) �� [1/t + 1/(6 - t) ] dt
|
| 99 |
+
|
| 100 |
+
Integrate term by term:
|
| 101 |
+
|
| 102 |
+
∫1/t dt = ln |t| + C
|
| 103 |
+
|
| 104 |
+
∫1/(6 - t) dt = -ln |6 - t| + C
|
| 105 |
+
|
| 106 |
+
Therefore, the integral becomes:
|
| 107 |
+
|
| 108 |
+
(1/12) [ ln |t| - ln |6 - t| ] evaluated from 1 to 5.
|
| 109 |
+
|
| 110 |
+
So, evaluating from 1 to 5:
|
| 111 |
+
|
| 112 |
+
(1/12) [ (ln 5 - ln (6 - 5)) - (ln 1 - ln (6 - 1)) ]
|
| 113 |
+
|
| 114 |
+
Simplify:
|
| 115 |
+
|
| 116 |
+
First, at upper limit 5: ln 5 - ln 1 = ln 5 (since 6 - 5 = 1, ln 1 = 0)
|
| 117 |
+
|
| 118 |
+
At lower limit 1: ln 1 - ln 5 = 0 - ln 5 = -ln 5
|
| 119 |
+
|
| 120 |
+
Therefore, the expression becomes:
|
| 121 |
+
|
| 122 |
+
(1/12) [ (ln 5 - 0) - (0 - ln 5) ] = (1/12)(ln 5 + ln 5) = (1/12)(2 ln 5) = (1/6) ln 5
|
| 123 |
+
|
| 124 |
+
Therefore, the value of the integral is (1/6) ln 5. Let me check the steps again to ensure I didn't make any mistakes.
|
| 125 |
+
|
| 126 |
+
First substitution: t = tan x. That changed variables correctly. The limits from π/4 to arccos(1/√26) became 1 to 5. Then, substitution led to integral in terms of t, simplified to 1/(2t(6 - t)) dt. Partial fractions correctly done, coefficients A and B were both 1/6. Integrated to (1/12)(ln t - ln(6 - t)). Evaluated from 1 to 5, giving (1/12)[ln5 - 0 - (0 - ln5)] = (1/12)(2 ln5) = (1/6) ln5. Seems correct.
|
| 127 |
+
|
| 128 |
+
Alternatively, let me check with another substitution. Suppose instead of t = tan x, maybe another substitution. Let me see.
|
| 129 |
+
|
| 130 |
+
Original integral after simplifying sin2x:
|
| 131 |
+
|
| 132 |
+
(1/2) �� [1/(sinx (6 cosx - sinx))] dx. Let me try substitution u = 6 cosx - sinx.
|
| 133 |
+
|
| 134 |
+
Compute du/dx = -6 sinx - cosx. Hmm, not directly present in the integrand. But the integrand is 1/(sinx * u). Let me see:
|
| 135 |
+
|
| 136 |
+
If I write the integrand as 1/(sinx * u). If I have du = (-6 sinx - cosx) dx. Let me rearrange terms:
|
| 137 |
+
|
| 138 |
+
du = - sinx (6 + cotx) dx. Hmm, not sure. Alternatively, express the integrand in terms of u and du. Let me see:
|
| 139 |
+
|
| 140 |
+
But this seems more complicated. The previous substitution t = tan x worked smoothly, leading to a standard partial fraction integral. So I think the answer is (1/6) ln5. Therefore, the definite integral is (ln5)/6.
|
| 141 |
+
|
| 142 |
+
**Final Answer**
|
| 143 |
+
\boxed{\dfrac{\ln 5}{6}}
|
| 144 |
+
</think>
|
| 145 |
+
|
| 146 |
+
To calculate the definite integral:
|
| 147 |
+
|
| 148 |
+
$$
|
| 149 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6 - \tan x) \sin 2 x}
|
| 150 |
+
$$
|
| 151 |
+
|
| 152 |
+
we start by rewriting \(\sin 2x\) as \(2 \sin x \cos x\). This transforms the integral into:
|
| 153 |
+
|
| 154 |
+
$$
|
| 155 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{1}{(6 - \tan x) \cdot 2 \sin x \cos x} \, dx
|
| 156 |
+
$$
|
| 157 |
+
|
| 158 |
+
Simplifying the denominator using \(\tan x = \frac{\sin x}{\cos x}\), we get:
|
| 159 |
+
|
| 160 |
+
$$
|
| 161 |
+
\frac{1}{2 \sin x \cos x \left(6 - \frac{\sin x}{\cos x}\right)} = \frac{1}{2 \sin x (6 \cos x - \sin x)}
|
| 162 |
+
$$
|
| 163 |
+
|
| 164 |
+
Next, we use the substitution \(t = \tan x\), which gives \(dt = \sec^2 x \, dx\) or \(dx = \frac{dt}{1 + t^2}\). The limits of integration change from \(x = \pi/4\) (where \(t = 1\)) to \(x = \arccos(1/\sqrt{26})\) (where \(t = 5\)). Rewriting \(\sin 2x\) as \(\frac{2t}{1 + t^2}\), the integral becomes:
|
| 165 |
+
|
| 166 |
+
$$
|
| 167 |
+
\int_{1}^{5} \frac{1}{(6 - t) \cdot \frac{2t}{1 + t^2}} \cdot \frac{dt}{1 + t^2}
|
| 168 |
+
$$
|
| 169 |
+
|
| 170 |
+
Simplifying, we get:
|
| 171 |
+
|
| 172 |
+
$$
|
| 173 |
+
\frac{1}{2} \int_{1}^{5} \frac{1}{t(6 - t)} \, dt
|
| 174 |
+
$$
|
| 175 |
+
|
| 176 |
+
Using partial fractions, we decompose \(\frac{1}{t(6 - t)}\) into \(\frac{1}{6t} + \frac{1}{6(6 - t)}\). The integral then becomes:
|
| 177 |
+
|
| 178 |
+
$$
|
| 179 |
+
\frac{1}{12} \int_{1}^{5} \left(\frac{1}{t} + \frac{1}{6 - t}\right) \, dt
|
| 180 |
+
$$
|
| 181 |
+
|
| 182 |
+
Integrating term by term, we get:
|
| 183 |
+
|
| 184 |
+
$$
|
| 185 |
+
\frac{1}{12} \left[ \ln |t| - \ln |6 - t| \right]_{1}^{5}
|
| 186 |
+
$$
|
| 187 |
+
|
| 188 |
+
Evaluating this from 1 to 5:
|
| 189 |
+
|
| 190 |
+
$$
|
| 191 |
+
\frac{1}{12} \left[ (\ln 5 - \ln 1) - (\ln 1 - \ln 5) \right] = \frac{1}{12} (2 \ln 5) = \frac{1}{6} \ln 5
|
| 192 |
+
$$
|
| 193 |
+
|
| 194 |
+
Thus, the value of the integral is:
|
| 195 |
+
|
| 196 |
+
$$
|
| 197 |
+
\boxed{\dfrac{\ln 5}{6}}
|
| 198 |
+
```<|endoftext|>
|
| 199 |
+
|
| 200 |
+
📊 Stats: 80 masked, 2671 trained
|
| 201 |
+
[2025-11-14 10:36:48,937] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 202 |
+
[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
|
| 203 |
+
[93m [WARNING] [0m async_io: please install the libaio-devel package with yum
|
| 204 |
+
[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
| 205 |
+
[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
|
| 206 |
+
[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3
|
| 207 |
+
[93m [WARNING] [0m using untested triton version (2.3.0), only 1.0.0 is known to be compatible
|
| 208 |
+
[2025-11-14 10:36:49,925] [INFO] [comm.py:637:init_distributed] cdb=None
|
| 209 |
+
[2025-11-14 10:36:49,926] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 210 |
+
|
| 211 |
+
============================================================
|
| 212 |
+
🎓 ENTROPY WEIGHTING ENABLED
|
| 213 |
+
============================================================
|
| 214 |
+
Teacher Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 215 |
+
Alpha (α): 2.0
|
| 216 |
+
Beta (β): 0.3
|
| 217 |
+
Teacher dtype: bfloat16
|
| 218 |
+
Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
|
| 219 |
+
============================================================
|
| 220 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 221 |
+
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
| 222 |
+
|
| 223 |
+
🎓 Loading teacher model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 224 |
+
📊 Entropy weighting params: α=2.0, β=0.3
|
| 225 |
+
💾 Entropy computation: top-k=64 (memory-efficient mode)
|
| 226 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 227 |
+
Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.15s/it]
|
| 228 |
+
✅ Teacher model loaded and frozen
|
| 229 |
+
|
| 230 |
+
🔍 Checking teacher tokenizer/model alignment...
|
| 231 |
+
📌 Teacher tokenizer vocab: 151665
|
| 232 |
+
📌 Teacher model vocab: 152064
|
| 233 |
+
⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...
|
| 234 |
+
✅ Teacher embeddings resized to 151665
|
| 235 |
+
|
| 236 |
+
📊 Student model vocab size: 151936
|
| 237 |
+
📊 Teacher model vocab size (after alignment): 151665
|
| 238 |
+
|
| 239 |
+
⚠️ Student/Teacher vocab size mismatch detected!
|
| 240 |
+
Teacher: 151665
|
| 241 |
+
Student: 151936
|
| 242 |
+
🔧 Resizing student embeddings to match teacher...
|
| 243 |
+
✅ Student embeddings resized to 151665
|
| 244 |
+
New student vocab size: 151665
|
| 245 |
+
|
| 246 |
+
============================================================
|
| 247 |
+
🎯 Final Vocab Alignment Complete
|
| 248 |
+
📊 Teacher vocab size: 151665
|
| 249 |
+
📊 Student vocab size: 151665
|
| 250 |
+
============================================================
|
| 251 |
+
|
| 252 |
+
🏋️ Starting training...
|
| 253 |
+
📊 Total training steps: 5666
|
| 254 |
+
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
|
| 255 |
+
Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
|
| 256 |
+
1%| | 25/2834 [04:20<8:16:48, 10.61s/it]
|
| 257 |
+
{'loss': 2.5676, 'grad_norm': 10.754595756530762, 'learning_rate': 2.3255813953488374e-07, 'avg_weight': 0.3920933440327644, 'epoch': 0.0}
|
| 258 |
+
{'loss': 2.4953, 'grad_norm': 10.065147399902344, 'learning_rate': 5.232558139534884e-07, 'avg_weight': 0.389410637319088, 'epoch': 0.0}
|
| 259 |
+
{'loss': 2.5079, 'grad_norm': 8.899886131286621, 'learning_rate': 8.139534883720931e-07, 'avg_weight': 0.38864233046770097, 'epoch': 0.01}
|
| 260 |
+
{'loss': 2.4271, 'grad_norm': 8.81152629852295, 'learning_rate': 1.1046511627906977e-06, 'avg_weight': 0.38897048830986025, 'epoch': 0.01}
|
| 261 |
+
{'loss': 2.4171, 'grad_norm': 7.3060808181762695, 'learning_rate': 1.3953488372093025e-06, 'avg_weight': 0.380949105322361, 'epoch': 0.01}
|
| 262 |
+
33%|███▎ | 38/115 [01:04<02:29, 1.95s/it]
|
train/wandb/run-20251114_103643-cvm4116u/run-cvm4116u.wandb
ADDED
|
Binary file (98.3 kB). View file
|
|
|
train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
|
| 3 |
+
"python": "CPython 3.10.19",
|
| 4 |
+
"startedAt": "2025-11-14T02:36:44.347486Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--model_name",
|
| 7 |
+
"Qwen/Qwen2.5-1.5B",
|
| 8 |
+
"--dataset_path",
|
| 9 |
+
"./datasets/openr1/Openr1-Math-46k-8192.jsonl",
|
| 10 |
+
"--output_dir",
|
| 11 |
+
"./model_sft_save/Qwen2.5-1.5B-Entropy-solution",
|
| 12 |
+
"--batch_size",
|
| 13 |
+
"2",
|
| 14 |
+
"--grad_accum",
|
| 15 |
+
"4",
|
| 16 |
+
"--learning_rate",
|
| 17 |
+
"5e-6",
|
| 18 |
+
"--epochs",
|
| 19 |
+
"1",
|
| 20 |
+
"--use_entropy_weighting",
|
| 21 |
+
"--teacher_model_path",
|
| 22 |
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 23 |
+
"--entropy_weight_alpha",
|
| 24 |
+
"2.0",
|
| 25 |
+
"--entropy_weight_beta",
|
| 26 |
+
"0.3",
|
| 27 |
+
"--teacher_dtype",
|
| 28 |
+
"bfloat16",
|
| 29 |
+
"--use_deepspeed",
|
| 30 |
+
"--deepspeed_config",
|
| 31 |
+
"deepspeed/dp_stage2.json",
|
| 32 |
+
"--use_wandb",
|
| 33 |
+
"--wandb_project",
|
| 34 |
+
"qwen-math-entropy-sft",
|
| 35 |
+
"--wandb_run_name",
|
| 36 |
+
"qwen2.5-1.5b-46k-entropy-solution"
|
| 37 |
+
],
|
| 38 |
+
"program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py",
|
| 39 |
+
"codePath": "train_qwen_46k_weight.py",
|
| 40 |
+
"codePathLocal": "train_qwen_46k_weight.py",
|
| 41 |
+
"email": "yaning1001@gmail.com",
|
| 42 |
+
"root": "/public/home/lshi/yoAI/projects/Online_CL/train",
|
| 43 |
+
"host": "gpu-h100-07",
|
| 44 |
+
"executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
|
| 45 |
+
"cpu_count": 96,
|
| 46 |
+
"cpu_count_logical": 96,
|
| 47 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 48 |
+
"gpu_count": 6,
|
| 49 |
+
"disk": {
|
| 50 |
+
"/": {
|
| 51 |
+
"total": "469407801344",
|
| 52 |
+
"used": "288221097984"
|
| 53 |
+
}
|
| 54 |
+
},
|
| 55 |
+
"memory": {
|
| 56 |
+
"total": "2164142350336"
|
| 57 |
+
},
|
| 58 |
+
"gpu_nvidia": [
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 61 |
+
"memoryTotal": "85520809984",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper",
|
| 64 |
+
"uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 68 |
+
"memoryTotal": "85520809984",
|
| 69 |
+
"cudaCores": 16896,
|
| 70 |
+
"architecture": "Hopper",
|
| 71 |
+
"uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 75 |
+
"memoryTotal": "85520809984",
|
| 76 |
+
"cudaCores": 16896,
|
| 77 |
+
"architecture": "Hopper",
|
| 78 |
+
"uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 82 |
+
"memoryTotal": "85520809984",
|
| 83 |
+
"cudaCores": 16896,
|
| 84 |
+
"architecture": "Hopper",
|
| 85 |
+
"uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 89 |
+
"memoryTotal": "85520809984",
|
| 90 |
+
"cudaCores": 16896,
|
| 91 |
+
"architecture": "Hopper",
|
| 92 |
+
"uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 96 |
+
"memoryTotal": "85520809984",
|
| 97 |
+
"cudaCores": 16896,
|
| 98 |
+
"architecture": "Hopper",
|
| 99 |
+
"uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"cudaVersion": "12.4",
|
| 103 |
+
"slurm": {
|
| 104 |
+
"cluster_name": "cluster_admin1",
|
| 105 |
+
"conf": "/opt/gridview/slurm/etc/slurm.conf",
|
| 106 |
+
"cpu_bind": "quiet,mask_cpu:0x000000000000000000000001",
|
| 107 |
+
"cpu_bind_list": "0x000000000000000000000001",
|
| 108 |
+
"cpu_bind_type": "mask_cpu:",
|
| 109 |
+
"cpu_bind_verbose": "quiet",
|
| 110 |
+
"cpus_on_node": "1",
|
| 111 |
+
"distribution": "cyclic",
|
| 112 |
+
"gtids": "0",
|
| 113 |
+
"home": "/opt/gridview/slurm",
|
| 114 |
+
"job_account": "seu_qli",
|
| 115 |
+
"job_cpus_per_node": "1",
|
| 116 |
+
"job_gid": "2026",
|
| 117 |
+
"job_id": "8428",
|
| 118 |
+
"job_name": "bash",
|
| 119 |
+
"job_nodelist": "gpu-h100-07",
|
| 120 |
+
"job_num_nodes": "1",
|
| 121 |
+
"job_partition": "H100",
|
| 122 |
+
"job_qos": "normal",
|
| 123 |
+
"job_uid": "2019",
|
| 124 |
+
"job_user": "lshi",
|
| 125 |
+
"jobid": "8428",
|
| 126 |
+
"launch_node_ipaddr": "172.16.254.194",
|
| 127 |
+
"localid": "0",
|
| 128 |
+
"nnodes": "1",
|
| 129 |
+
"nodeid": "0",
|
| 130 |
+
"nodelist": "gpu-h100-07",
|
| 131 |
+
"nprocs": "1",
|
| 132 |
+
"ntasks": "1",
|
| 133 |
+
"pmix_direct_conn": "true",
|
| 134 |
+
"pmix_direct_conn_early": "false",
|
| 135 |
+
"pmix_direct_conn_ucx": "false",
|
| 136 |
+
"pmix_timeout": "3000",
|
| 137 |
+
"prio_process": "0",
|
| 138 |
+
"procid": "0",
|
| 139 |
+
"pty_port": "43139",
|
| 140 |
+
"pty_win_col": "146",
|
| 141 |
+
"pty_win_row": "21",
|
| 142 |
+
"srun_comm_host": "172.16.254.194",
|
| 143 |
+
"srun_comm_port": "34989",
|
| 144 |
+
"step_gpus": "0,1,2,3,4,5",
|
| 145 |
+
"step_id": "0",
|
| 146 |
+
"step_launcher_port": "34989",
|
| 147 |
+
"step_nodelist": "gpu-h100-07",
|
| 148 |
+
"step_num_nodes": "1",
|
| 149 |
+
"step_num_tasks": "1",
|
| 150 |
+
"step_tasks_per_node": "1",
|
| 151 |
+
"stepid": "0",
|
| 152 |
+
"submit_dir": "/public/home/lshi/yoAI/projects",
|
| 153 |
+
"submit_host": "admin1",
|
| 154 |
+
"task_pid": "649671",
|
| 155 |
+
"tasks_per_node": "1",
|
| 156 |
+
"topology_addr": "gpu-h100-07",
|
| 157 |
+
"topology_addr_pattern": "node",
|
| 158 |
+
"umask": "0022",
|
| 159 |
+
"working_cluster": "cluster_admin1:172.16.254.194:6817:9216:101"
|
| 160 |
+
},
|
| 161 |
+
"writerId": "by9i30haenoy72kbuz6wpiu3fbxqgmd9"
|
| 162 |
+
}
|
train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":382},"_runtime":382}
|
train/wandb/run-20251114_145219-w9xre5r3/files/config.yaml
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_name_or_path:
|
| 2 |
+
value: Qwen/Qwen2.5-1.5B
|
| 3 |
+
_wandb:
|
| 4 |
+
value:
|
| 5 |
+
cli_version: 0.22.3
|
| 6 |
+
e:
|
| 7 |
+
4ns8zmo5ar2v4v1bdwhe5waa6417g92a:
|
| 8 |
+
args:
|
| 9 |
+
- --model_name
|
| 10 |
+
- Qwen/Qwen2.5-1.5B
|
| 11 |
+
- --dataset_path
|
| 12 |
+
- ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 13 |
+
- --output_dir
|
| 14 |
+
- ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 15 |
+
- --batch_size
|
| 16 |
+
- "2"
|
| 17 |
+
- --grad_accum
|
| 18 |
+
- "4"
|
| 19 |
+
- --learning_rate
|
| 20 |
+
- "5e-6"
|
| 21 |
+
- --epochs
|
| 22 |
+
- "1"
|
| 23 |
+
- --use_entropy_weighting
|
| 24 |
+
- --teacher_model_path
|
| 25 |
+
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 26 |
+
- --entropy_weight_alpha
|
| 27 |
+
- "2.0"
|
| 28 |
+
- --entropy_weight_beta
|
| 29 |
+
- "0.3"
|
| 30 |
+
- --teacher_dtype
|
| 31 |
+
- bfloat16
|
| 32 |
+
- --entropy_top_k
|
| 33 |
+
- "48"
|
| 34 |
+
- --teacher_device_ids
|
| 35 |
+
- "2"
|
| 36 |
+
- --use_deepspeed
|
| 37 |
+
- --deepspeed_config
|
| 38 |
+
- deepspeed/dp_stage2.json
|
| 39 |
+
- --use_wandb
|
| 40 |
+
- --wandb_project
|
| 41 |
+
- qwen-math-entropy-sft
|
| 42 |
+
- --wandb_run_name
|
| 43 |
+
- qwen2.5-1.5b-46k-entropy-solution
|
| 44 |
+
codePath: train_qwen_46k_weight.py
|
| 45 |
+
codePathLocal: train_qwen_46k_weight.py
|
| 46 |
+
cpu_count: 96
|
| 47 |
+
cpu_count_logical: 96
|
| 48 |
+
cudaVersion: "12.4"
|
| 49 |
+
disk:
|
| 50 |
+
/:
|
| 51 |
+
total: "469407801344"
|
| 52 |
+
used: "288248733696"
|
| 53 |
+
email: yaning1001@gmail.com
|
| 54 |
+
executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
|
| 55 |
+
gpu: NVIDIA H100 80GB HBM3
|
| 56 |
+
gpu_count: 8
|
| 57 |
+
gpu_nvidia:
|
| 58 |
+
- architecture: Hopper
|
| 59 |
+
cudaCores: 16896
|
| 60 |
+
memoryTotal: "85520809984"
|
| 61 |
+
name: NVIDIA H100 80GB HBM3
|
| 62 |
+
uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
|
| 63 |
+
- architecture: Hopper
|
| 64 |
+
cudaCores: 16896
|
| 65 |
+
memoryTotal: "85520809984"
|
| 66 |
+
name: NVIDIA H100 80GB HBM3
|
| 67 |
+
uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
|
| 68 |
+
- architecture: Hopper
|
| 69 |
+
cudaCores: 16896
|
| 70 |
+
memoryTotal: "85520809984"
|
| 71 |
+
name: NVIDIA H100 80GB HBM3
|
| 72 |
+
uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
|
| 73 |
+
- architecture: Hopper
|
| 74 |
+
cudaCores: 16896
|
| 75 |
+
memoryTotal: "85520809984"
|
| 76 |
+
name: NVIDIA H100 80GB HBM3
|
| 77 |
+
uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
|
| 78 |
+
- architecture: Hopper
|
| 79 |
+
cudaCores: 16896
|
| 80 |
+
memoryTotal: "85520809984"
|
| 81 |
+
name: NVIDIA H100 80GB HBM3
|
| 82 |
+
uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
|
| 83 |
+
- architecture: Hopper
|
| 84 |
+
cudaCores: 16896
|
| 85 |
+
memoryTotal: "85520809984"
|
| 86 |
+
name: NVIDIA H100 80GB HBM3
|
| 87 |
+
uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
|
| 88 |
+
- architecture: Hopper
|
| 89 |
+
cudaCores: 16896
|
| 90 |
+
memoryTotal: "85520809984"
|
| 91 |
+
name: NVIDIA H100 80GB HBM3
|
| 92 |
+
uuid: GPU-23628f74-fede-6431-ae15-2764fce29130
|
| 93 |
+
- architecture: Hopper
|
| 94 |
+
cudaCores: 16896
|
| 95 |
+
memoryTotal: "85520809984"
|
| 96 |
+
name: NVIDIA H100 80GB HBM3
|
| 97 |
+
uuid: GPU-d18d570f-dd0f-0ff6-3401-561c9e799136
|
| 98 |
+
host: gpu-h100-07
|
| 99 |
+
memory:
|
| 100 |
+
total: "2164142350336"
|
| 101 |
+
os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
|
| 102 |
+
program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
|
| 103 |
+
python: CPython 3.10.19
|
| 104 |
+
root: /public/home/lshi/yoAI/projects/Online_CL/train
|
| 105 |
+
slurm:
|
| 106 |
+
home: /opt/gridview/slurm
|
| 107 |
+
pmix_direct_conn: "true"
|
| 108 |
+
pmix_direct_conn_early: "false"
|
| 109 |
+
pmix_direct_conn_ucx: "false"
|
| 110 |
+
pmix_timeout: "3000"
|
| 111 |
+
startedAt: "2025-11-14T06:52:19.650156Z"
|
| 112 |
+
writerId: 4ns8zmo5ar2v4v1bdwhe5waa6417g92a
|
| 113 |
+
m:
|
| 114 |
+
- "1": train/global_step
|
| 115 |
+
"6":
|
| 116 |
+
- 3
|
| 117 |
+
"7": []
|
| 118 |
+
- "2": '*'
|
| 119 |
+
"5": 1
|
| 120 |
+
"6":
|
| 121 |
+
- 1
|
| 122 |
+
"7": []
|
| 123 |
+
python_version: 3.10.19
|
| 124 |
+
t:
|
| 125 |
+
"1":
|
| 126 |
+
- 1
|
| 127 |
+
- 11
|
| 128 |
+
- 41
|
| 129 |
+
- 49
|
| 130 |
+
- 51
|
| 131 |
+
- 71
|
| 132 |
+
- 84
|
| 133 |
+
- 98
|
| 134 |
+
- 105
|
| 135 |
+
"2":
|
| 136 |
+
- 1
|
| 137 |
+
- 11
|
| 138 |
+
- 41
|
| 139 |
+
- 49
|
| 140 |
+
- 51
|
| 141 |
+
- 71
|
| 142 |
+
- 84
|
| 143 |
+
- 98
|
| 144 |
+
- 105
|
| 145 |
+
"3":
|
| 146 |
+
- 7
|
| 147 |
+
- 13
|
| 148 |
+
- 19
|
| 149 |
+
- 66
|
| 150 |
+
"4": 3.10.19
|
| 151 |
+
"5": 0.22.3
|
| 152 |
+
"6": 4.57.1
|
| 153 |
+
"9":
|
| 154 |
+
"1": transformers_trainer
|
| 155 |
+
"10":
|
| 156 |
+
- 20
|
| 157 |
+
"12": 0.22.3
|
| 158 |
+
"13": linux-x86_64
|
| 159 |
+
accelerator_config:
|
| 160 |
+
value:
|
| 161 |
+
dispatch_batches: null
|
| 162 |
+
even_batches: true
|
| 163 |
+
gradient_accumulation_kwargs: null
|
| 164 |
+
non_blocking: false
|
| 165 |
+
split_batches: false
|
| 166 |
+
use_seedable_sampler: true
|
| 167 |
+
activation_offloading:
|
| 168 |
+
value: false
|
| 169 |
+
adafactor:
|
| 170 |
+
value: false
|
| 171 |
+
adam_beta1:
|
| 172 |
+
value: 0.9
|
| 173 |
+
adam_beta2:
|
| 174 |
+
value: 0.999
|
| 175 |
+
adam_epsilon:
|
| 176 |
+
value: 1e-08
|
| 177 |
+
add_cross_attention:
|
| 178 |
+
value: false
|
| 179 |
+
architectures:
|
| 180 |
+
value:
|
| 181 |
+
- Qwen2ForCausalLM
|
| 182 |
+
assistant_only_loss:
|
| 183 |
+
value: false
|
| 184 |
+
attention_dropout:
|
| 185 |
+
value: 0
|
| 186 |
+
auto_find_batch_size:
|
| 187 |
+
value: false
|
| 188 |
+
average_tokens_across_devices:
|
| 189 |
+
value: true
|
| 190 |
+
bad_words_ids:
|
| 191 |
+
value: null
|
| 192 |
+
batch_eval_metrics:
|
| 193 |
+
value: false
|
| 194 |
+
begin_suppress_tokens:
|
| 195 |
+
value: null
|
| 196 |
+
bf16:
|
| 197 |
+
value: true
|
| 198 |
+
bf16_full_eval:
|
| 199 |
+
value: false
|
| 200 |
+
bos_token_id:
|
| 201 |
+
value: null
|
| 202 |
+
chat_template_path:
|
| 203 |
+
value: null
|
| 204 |
+
chunk_size_feed_forward:
|
| 205 |
+
value: 0
|
| 206 |
+
completion_only_loss:
|
| 207 |
+
value: null
|
| 208 |
+
cross_attention_hidden_size:
|
| 209 |
+
value: null
|
| 210 |
+
data_seed:
|
| 211 |
+
value: null
|
| 212 |
+
dataloader_drop_last:
|
| 213 |
+
value: false
|
| 214 |
+
dataloader_num_workers:
|
| 215 |
+
value: 0
|
| 216 |
+
dataloader_persistent_workers:
|
| 217 |
+
value: false
|
| 218 |
+
dataloader_pin_memory:
|
| 219 |
+
value: true
|
| 220 |
+
dataloader_prefetch_factor:
|
| 221 |
+
value: null
|
| 222 |
+
dataset_kwargs:
|
| 223 |
+
value: null
|
| 224 |
+
dataset_num_proc:
|
| 225 |
+
value: null
|
| 226 |
+
dataset_text_field:
|
| 227 |
+
value: null
|
| 228 |
+
ddp_backend:
|
| 229 |
+
value: null
|
| 230 |
+
ddp_broadcast_buffers:
|
| 231 |
+
value: null
|
| 232 |
+
ddp_bucket_cap_mb:
|
| 233 |
+
value: null
|
| 234 |
+
ddp_find_unused_parameters:
|
| 235 |
+
value: null
|
| 236 |
+
ddp_timeout:
|
| 237 |
+
value: 1800
|
| 238 |
+
debug:
|
| 239 |
+
value: []
|
| 240 |
+
decoder_start_token_id:
|
| 241 |
+
value: null
|
| 242 |
+
deepspeed:
|
| 243 |
+
value: deepspeed/dp_stage2.json
|
| 244 |
+
disable_tqdm:
|
| 245 |
+
value: false
|
| 246 |
+
diversity_penalty:
|
| 247 |
+
value: 0
|
| 248 |
+
do_eval:
|
| 249 |
+
value: true
|
| 250 |
+
do_predict:
|
| 251 |
+
value: false
|
| 252 |
+
do_sample:
|
| 253 |
+
value: false
|
| 254 |
+
do_train:
|
| 255 |
+
value: false
|
| 256 |
+
dtype:
|
| 257 |
+
value: bfloat16
|
| 258 |
+
early_stopping:
|
| 259 |
+
value: false
|
| 260 |
+
encoder_no_repeat_ngram_size:
|
| 261 |
+
value: 0
|
| 262 |
+
entropy_top_k:
|
| 263 |
+
value: 48
|
| 264 |
+
entropy_weight_alpha:
|
| 265 |
+
value: 2
|
| 266 |
+
entropy_weight_beta:
|
| 267 |
+
value: 0.3
|
| 268 |
+
eos_token:
|
| 269 |
+
value: <EOS_TOKEN>
|
| 270 |
+
eos_token_id:
|
| 271 |
+
value: 151643
|
| 272 |
+
eval_accumulation_steps:
|
| 273 |
+
value: null
|
| 274 |
+
eval_delay:
|
| 275 |
+
value: 0
|
| 276 |
+
eval_do_concat_batches:
|
| 277 |
+
value: true
|
| 278 |
+
eval_on_start:
|
| 279 |
+
value: false
|
| 280 |
+
eval_packing:
|
| 281 |
+
value: null
|
| 282 |
+
eval_steps:
|
| 283 |
+
value: 25
|
| 284 |
+
eval_strategy:
|
| 285 |
+
value: steps
|
| 286 |
+
eval_use_gather_object:
|
| 287 |
+
value: false
|
| 288 |
+
exponential_decay_length_penalty:
|
| 289 |
+
value: null
|
| 290 |
+
finetuning_task:
|
| 291 |
+
value: null
|
| 292 |
+
forced_bos_token_id:
|
| 293 |
+
value: null
|
| 294 |
+
forced_eos_token_id:
|
| 295 |
+
value: null
|
| 296 |
+
fp16:
|
| 297 |
+
value: false
|
| 298 |
+
fp16_backend:
|
| 299 |
+
value: auto
|
| 300 |
+
fp16_full_eval:
|
| 301 |
+
value: false
|
| 302 |
+
fp16_opt_level:
|
| 303 |
+
value: O1
|
| 304 |
+
fsdp:
|
| 305 |
+
value: []
|
| 306 |
+
fsdp_config:
|
| 307 |
+
value:
|
| 308 |
+
min_num_params: 0
|
| 309 |
+
xla: false
|
| 310 |
+
xla_fsdp_grad_ckpt: false
|
| 311 |
+
xla_fsdp_v2: false
|
| 312 |
+
fsdp_min_num_params:
|
| 313 |
+
value: 0
|
| 314 |
+
fsdp_transformer_layer_cls_to_wrap:
|
| 315 |
+
value: null
|
| 316 |
+
full_determinism:
|
| 317 |
+
value: false
|
| 318 |
+
gradient_accumulation_steps:
|
| 319 |
+
value: 4
|
| 320 |
+
gradient_checkpointing:
|
| 321 |
+
value: true
|
| 322 |
+
gradient_checkpointing_kwargs:
|
| 323 |
+
value:
|
| 324 |
+
use_reentrant: false
|
| 325 |
+
greater_is_better:
|
| 326 |
+
value: false
|
| 327 |
+
group_by_length:
|
| 328 |
+
value: false
|
| 329 |
+
half_precision_backend:
|
| 330 |
+
value: auto
|
| 331 |
+
hidden_act:
|
| 332 |
+
value: silu
|
| 333 |
+
hidden_size:
|
| 334 |
+
value: 1536
|
| 335 |
+
hub_always_push:
|
| 336 |
+
value: false
|
| 337 |
+
hub_model_id:
|
| 338 |
+
value: null
|
| 339 |
+
hub_private_repo:
|
| 340 |
+
value: null
|
| 341 |
+
hub_revision:
|
| 342 |
+
value: null
|
| 343 |
+
hub_strategy:
|
| 344 |
+
value: every_save
|
| 345 |
+
hub_token:
|
| 346 |
+
value: <HUB_TOKEN>
|
| 347 |
+
id2label:
|
| 348 |
+
value:
|
| 349 |
+
"0": LABEL_0
|
| 350 |
+
"1": LABEL_1
|
| 351 |
+
ignore_data_skip:
|
| 352 |
+
value: false
|
| 353 |
+
include_for_metrics:
|
| 354 |
+
value: []
|
| 355 |
+
include_inputs_for_metrics:
|
| 356 |
+
value: false
|
| 357 |
+
include_num_input_tokens_seen:
|
| 358 |
+
value: "no"
|
| 359 |
+
include_tokens_per_second:
|
| 360 |
+
value: false
|
| 361 |
+
initializer_range:
|
| 362 |
+
value: 0.02
|
| 363 |
+
intermediate_size:
|
| 364 |
+
value: 8960
|
| 365 |
+
is_decoder:
|
| 366 |
+
value: false
|
| 367 |
+
is_encoder_decoder:
|
| 368 |
+
value: false
|
| 369 |
+
jit_mode_eval:
|
| 370 |
+
value: false
|
| 371 |
+
label_names:
|
| 372 |
+
value: null
|
| 373 |
+
label_smoothing_factor:
|
| 374 |
+
value: 0
|
| 375 |
+
label2id:
|
| 376 |
+
value:
|
| 377 |
+
LABEL_0: 0
|
| 378 |
+
LABEL_1: 1
|
| 379 |
+
layer_types:
|
| 380 |
+
value:
|
| 381 |
+
- full_attention
|
| 382 |
+
- full_attention
|
| 383 |
+
- full_attention
|
| 384 |
+
- full_attention
|
| 385 |
+
- full_attention
|
| 386 |
+
- full_attention
|
| 387 |
+
- full_attention
|
| 388 |
+
- full_attention
|
| 389 |
+
- full_attention
|
| 390 |
+
- full_attention
|
| 391 |
+
- full_attention
|
| 392 |
+
- full_attention
|
| 393 |
+
- full_attention
|
| 394 |
+
- full_attention
|
| 395 |
+
- full_attention
|
| 396 |
+
- full_attention
|
| 397 |
+
- full_attention
|
| 398 |
+
- full_attention
|
| 399 |
+
- full_attention
|
| 400 |
+
- full_attention
|
| 401 |
+
- full_attention
|
| 402 |
+
- full_attention
|
| 403 |
+
- full_attention
|
| 404 |
+
- full_attention
|
| 405 |
+
- full_attention
|
| 406 |
+
- full_attention
|
| 407 |
+
- full_attention
|
| 408 |
+
- full_attention
|
| 409 |
+
learning_rate:
|
| 410 |
+
value: 5e-06
|
| 411 |
+
length_column_name:
|
| 412 |
+
value: length
|
| 413 |
+
length_penalty:
|
| 414 |
+
value: 1
|
| 415 |
+
liger_kernel_config:
|
| 416 |
+
value: null
|
| 417 |
+
load_best_model_at_end:
|
| 418 |
+
value: true
|
| 419 |
+
local_rank:
|
| 420 |
+
value: 0
|
| 421 |
+
log_level:
|
| 422 |
+
value: passive
|
| 423 |
+
log_level_replica:
|
| 424 |
+
value: warning
|
| 425 |
+
log_on_each_node:
|
| 426 |
+
value: true
|
| 427 |
+
logging_dir:
|
| 428 |
+
value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_14-52-26_gpu-h100-07
|
| 429 |
+
logging_first_step:
|
| 430 |
+
value: false
|
| 431 |
+
logging_nan_inf_filter:
|
| 432 |
+
value: true
|
| 433 |
+
logging_steps:
|
| 434 |
+
value: 5
|
| 435 |
+
logging_strategy:
|
| 436 |
+
value: steps
|
| 437 |
+
loss_type:
|
| 438 |
+
value: nll
|
| 439 |
+
lr_scheduler_type:
|
| 440 |
+
value: cosine
|
| 441 |
+
max_grad_norm:
|
| 442 |
+
value: 1
|
| 443 |
+
max_length:
|
| 444 |
+
value: 8192
|
| 445 |
+
max_position_embeddings:
|
| 446 |
+
value: 131072
|
| 447 |
+
max_steps:
|
| 448 |
+
value: -1
|
| 449 |
+
max_window_layers:
|
| 450 |
+
value: 28
|
| 451 |
+
metric_for_best_model:
|
| 452 |
+
value: eval_loss
|
| 453 |
+
min_length:
|
| 454 |
+
value: 0
|
| 455 |
+
model/num_parameters:
|
| 456 |
+
value: 1543298048
|
| 457 |
+
model_init_kwargs:
|
| 458 |
+
value: null
|
| 459 |
+
model_type:
|
| 460 |
+
value: qwen2
|
| 461 |
+
mp_parameters:
|
| 462 |
+
value: ""
|
| 463 |
+
neftune_noise_alpha:
|
| 464 |
+
value: null
|
| 465 |
+
no_cuda:
|
| 466 |
+
value: false
|
| 467 |
+
no_repeat_ngram_size:
|
| 468 |
+
value: 0
|
| 469 |
+
num_attention_heads:
|
| 470 |
+
value: 12
|
| 471 |
+
num_beam_groups:
|
| 472 |
+
value: 1
|
| 473 |
+
num_beams:
|
| 474 |
+
value: 1
|
| 475 |
+
num_hidden_layers:
|
| 476 |
+
value: 28
|
| 477 |
+
num_key_value_heads:
|
| 478 |
+
value: 2
|
| 479 |
+
num_return_sequences:
|
| 480 |
+
value: 1
|
| 481 |
+
num_train_epochs:
|
| 482 |
+
value: 1
|
| 483 |
+
optim:
|
| 484 |
+
value: adamw_torch
|
| 485 |
+
optim_args:
|
| 486 |
+
value: null
|
| 487 |
+
optim_target_modules:
|
| 488 |
+
value: null
|
| 489 |
+
output_attentions:
|
| 490 |
+
value: false
|
| 491 |
+
output_dir:
|
| 492 |
+
value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 493 |
+
output_hidden_states:
|
| 494 |
+
value: false
|
| 495 |
+
output_scores:
|
| 496 |
+
value: false
|
| 497 |
+
overwrite_output_dir:
|
| 498 |
+
value: false
|
| 499 |
+
packing:
|
| 500 |
+
value: false
|
| 501 |
+
packing_strategy:
|
| 502 |
+
value: bfd
|
| 503 |
+
pad_to_multiple_of:
|
| 504 |
+
value: null
|
| 505 |
+
pad_token:
|
| 506 |
+
value: <PAD_TOKEN>
|
| 507 |
+
pad_token_id:
|
| 508 |
+
value: 151643
|
| 509 |
+
padding_free:
|
| 510 |
+
value: false
|
| 511 |
+
parallelism_config:
|
| 512 |
+
value: null
|
| 513 |
+
past_index:
|
| 514 |
+
value: -1
|
| 515 |
+
per_device_eval_batch_size:
|
| 516 |
+
value: 2
|
| 517 |
+
per_device_train_batch_size:
|
| 518 |
+
value: 2
|
| 519 |
+
per_gpu_eval_batch_size:
|
| 520 |
+
value: null
|
| 521 |
+
per_gpu_train_batch_size:
|
| 522 |
+
value: null
|
| 523 |
+
prediction_loss_only:
|
| 524 |
+
value: false
|
| 525 |
+
prefix:
|
| 526 |
+
value: null
|
| 527 |
+
problem_type:
|
| 528 |
+
value: null
|
| 529 |
+
project:
|
| 530 |
+
value: huggingface
|
| 531 |
+
push_to_hub:
|
| 532 |
+
value: false
|
| 533 |
+
push_to_hub_model_id:
|
| 534 |
+
value: null
|
| 535 |
+
push_to_hub_organization:
|
| 536 |
+
value: null
|
| 537 |
+
push_to_hub_token:
|
| 538 |
+
value: <PUSH_TO_HUB_TOKEN>
|
| 539 |
+
ray_scope:
|
| 540 |
+
value: last
|
| 541 |
+
remove_invalid_values:
|
| 542 |
+
value: false
|
| 543 |
+
remove_unused_columns:
|
| 544 |
+
value: true
|
| 545 |
+
repetition_penalty:
|
| 546 |
+
value: 1
|
| 547 |
+
report_to:
|
| 548 |
+
value:
|
| 549 |
+
- wandb
|
| 550 |
+
restore_callback_states_from_checkpoint:
|
| 551 |
+
value: false
|
| 552 |
+
resume_from_checkpoint:
|
| 553 |
+
value: null
|
| 554 |
+
return_dict:
|
| 555 |
+
value: true
|
| 556 |
+
return_dict_in_generate:
|
| 557 |
+
value: false
|
| 558 |
+
rms_norm_eps:
|
| 559 |
+
value: 1e-06
|
| 560 |
+
rope_scaling:
|
| 561 |
+
value: null
|
| 562 |
+
rope_theta:
|
| 563 |
+
value: 1e+06
|
| 564 |
+
run_name:
|
| 565 |
+
value: qwen2.5-1.5b-46k-entropy-solution
|
| 566 |
+
save_on_each_node:
|
| 567 |
+
value: false
|
| 568 |
+
save_only_model:
|
| 569 |
+
value: false
|
| 570 |
+
save_safetensors:
|
| 571 |
+
value: true
|
| 572 |
+
save_steps:
|
| 573 |
+
value: 50
|
| 574 |
+
save_strategy:
|
| 575 |
+
value: steps
|
| 576 |
+
save_total_limit:
|
| 577 |
+
value: 2
|
| 578 |
+
seed:
|
| 579 |
+
value: 42
|
| 580 |
+
sep_token_id:
|
| 581 |
+
value: null
|
| 582 |
+
skip_memory_metrics:
|
| 583 |
+
value: true
|
| 584 |
+
sliding_window:
|
| 585 |
+
value: null
|
| 586 |
+
suppress_tokens:
|
| 587 |
+
value: null
|
| 588 |
+
task_specific_params:
|
| 589 |
+
value: null
|
| 590 |
+
teacher_device_ids:
|
| 591 |
+
value: "2"
|
| 592 |
+
teacher_dtype:
|
| 593 |
+
value: bfloat16
|
| 594 |
+
teacher_model_path:
|
| 595 |
+
value: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 596 |
+
temperature:
|
| 597 |
+
value: 1
|
| 598 |
+
tf_legacy_loss:
|
| 599 |
+
value: false
|
| 600 |
+
tf32:
|
| 601 |
+
value: null
|
| 602 |
+
tie_encoder_decoder:
|
| 603 |
+
value: false
|
| 604 |
+
tie_word_embeddings:
|
| 605 |
+
value: true
|
| 606 |
+
tokenizer_class:
|
| 607 |
+
value: null
|
| 608 |
+
top_k:
|
| 609 |
+
value: 50
|
| 610 |
+
top_p:
|
| 611 |
+
value: 1
|
| 612 |
+
torch_compile:
|
| 613 |
+
value: false
|
| 614 |
+
torch_compile_backend:
|
| 615 |
+
value: null
|
| 616 |
+
torch_compile_mode:
|
| 617 |
+
value: null
|
| 618 |
+
torch_empty_cache_steps:
|
| 619 |
+
value: null
|
| 620 |
+
torchdynamo:
|
| 621 |
+
value: null
|
| 622 |
+
torchscript:
|
| 623 |
+
value: false
|
| 624 |
+
tpu_metrics_debug:
|
| 625 |
+
value: false
|
| 626 |
+
tpu_num_cores:
|
| 627 |
+
value: null
|
| 628 |
+
trackio_space_id:
|
| 629 |
+
value: trackio
|
| 630 |
+
transformers_version:
|
| 631 |
+
value: 4.57.1
|
| 632 |
+
typical_p:
|
| 633 |
+
value: 1
|
| 634 |
+
use_bfloat16:
|
| 635 |
+
value: false
|
| 636 |
+
use_cache:
|
| 637 |
+
value: true
|
| 638 |
+
use_cpu:
|
| 639 |
+
value: false
|
| 640 |
+
use_entropy_weighting:
|
| 641 |
+
value: true
|
| 642 |
+
use_legacy_prediction_loop:
|
| 643 |
+
value: false
|
| 644 |
+
use_liger_kernel:
|
| 645 |
+
value: false
|
| 646 |
+
use_mps_device:
|
| 647 |
+
value: false
|
| 648 |
+
use_mrope:
|
| 649 |
+
value: false
|
| 650 |
+
use_sliding_window:
|
| 651 |
+
value: false
|
| 652 |
+
vocab_size:
|
| 653 |
+
value: 151665
|
| 654 |
+
warmup_ratio:
|
| 655 |
+
value: 0.03
|
| 656 |
+
warmup_steps:
|
| 657 |
+
value: 0
|
| 658 |
+
weight_decay:
|
| 659 |
+
value: 0.01
|
train/wandb/run-20251114_145219-w9xre5r3/files/output.log
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
📝 Loading tokenizer...
|
| 2 |
+
📦 Loading local dataset from: ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 3 |
+
📊 Train: 45334 | Eval: 458
|
| 4 |
+
🔄 Formatting dataset...
|
| 5 |
+
|
| 6 |
+
🔍 MASKED PART (prompt/question):
|
| 7 |
+
<|im_start|>system
|
| 8 |
+
Think step by step and solve the problem.<|im_end|>
|
| 9 |
+
<|im_start|>user
|
| 10 |
+
## Problem Statement
|
| 11 |
+
|
| 12 |
+
Calculate the definite integral:
|
| 13 |
+
|
| 14 |
+
$$
|
| 15 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6-\operatorname{tg} x) \sin 2 x}
|
| 16 |
+
$$<|im_end|>
|
| 17 |
+
<|im_start|>assistant
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
✅ TRAINED PART (solution):
|
| 21 |
+
<think>
|
| 22 |
+
Okay, so I need to compute the definite integral from π/4 to arccos(1/√26) of dx divided by (6 - tan x) times sin 2x. Hmm, let me start by recalling some integration techniques. The integral has tan x and sin 2x in the denominator. Maybe I can simplify the expression first or use substitution.
|
| 23 |
+
|
| 24 |
+
First, I know that sin 2x is equal to 2 sin x cos x. So, maybe rewriting sin 2x as 2 sin x cos x could help. Let me try that:
|
| 25 |
+
|
| 26 |
+
∫ [1 / ( (6 - tan x) * 2 sin x cos x ) ] dx
|
| 27 |
+
|
| 28 |
+
Simplifying the denominator, that becomes:
|
| 29 |
+
|
| 30 |
+
1 / [2 sin x cos x (6 - tan x)] dx
|
| 31 |
+
|
| 32 |
+
But tan x is sin x / cos x, so substituting that in:
|
| 33 |
+
|
| 34 |
+
Denominator: 2 sin x cos x (6 - sin x / cos x )
|
| 35 |
+
|
| 36 |
+
Let me combine the terms in the parenthesis:
|
| 37 |
+
|
| 38 |
+
6 - sin x / cos x = (6 cos x - sin x)/cos x
|
| 39 |
+
|
| 40 |
+
Therefore, the denominator becomes:
|
| 41 |
+
|
| 42 |
+
2 sin x cos x * (6 cos x - sin x)/cos x = 2 sin x (6 cos x - sin x)
|
| 43 |
+
|
| 44 |
+
So now, the integral simplifies to:
|
| 45 |
+
|
| 46 |
+
∫ [1 / (2 sin x (6 cos x - sin x)) ] dx
|
| 47 |
+
|
| 48 |
+
So the integral is now 1/(2 sin x (6 cos x - sin x)) dx. Maybe this is easier to integrate. Let me factor out the 1/2:
|
| 49 |
+
|
| 50 |
+
(1/2) �� [1 / (sin x (6 cos x - sin x)) ] dx
|
| 51 |
+
|
| 52 |
+
Hmm, this seems a bit complicated. Maybe I can use substitution here. Let me think about substitution. Let's let u = 6 cos x - sin x. Then, du/dx = -6 sin x - cos x. Hmm, not sure if that helps. Let's check:
|
| 53 |
+
|
| 54 |
+
If u = 6 cos x - sin x, then du = (-6 sin x - cos x) dx. Hmm, the integral has 1/(sin x u). So, maybe if I can express the integral in terms of du and u. But du has terms -6 sin x - cos x, which isn't directly present in the integral. Maybe this isn't the right substitution.
|
| 55 |
+
|
| 56 |
+
Alternatively, maybe split the fraction into partial fractions. Let me see. The denominator is sin x (6 cos x - sin x). Let me denote t = cos x or t = sin x. Maybe substitution t = sin x or t = cos x.
|
| 57 |
+
|
| 58 |
+
Alternatively, use substitution t = tan x. Let's try that. Let t = tan x. Then, dt/dx = sec²x = 1 + tan²x. So, dx = dt / (1 + t²). Also, sin 2x = 2 tan x / (1 + tan²x) = 2t / (1 + t²). Let me rewrite the integral in terms of t.
|
| 59 |
+
|
| 60 |
+
Original integral:
|
| 61 |
+
|
| 62 |
+
∫ [1 / ( (6 - t) * (2t / (1 + t²)) ) ] * [dt / (1 + t²)]
|
| 63 |
+
|
| 64 |
+
Wait, let's check. If t = tan x, then when x goes from π/4 to arccos(1/√26), t will go from tan(π/4) = 1 to tan(arccos(1/√26)). Let me compute tan(arccos(1/√26)). Let �� = arccos(1/√26). So cos �� = 1/√26, so sin �� = sqrt(1 - 1/26) = sqrt(25/26) = 5/√26. Therefore, tan �� = sin �� / cos �� = 5. So the upper limit is 5. So substitution t = tan x changes the limits from 1 to 5.
|
| 65 |
+
|
| 66 |
+
Now, let's rewrite the integral. The integrand is 1 / [ (6 - tan x) sin 2x ] dx.
|
| 67 |
+
|
| 68 |
+
Expressing in terms of t:
|
| 69 |
+
|
| 70 |
+
1 / [ (6 - t) * (2t / (1 + t²)) ] * (dt / (1 + t²))
|
| 71 |
+
|
| 72 |
+
Let me compute that step by step. First, sin 2x is 2t / (1 + t²). So, the denominator becomes (6 - t) * 2t / (1 + t²). Then, dx is dt / (1 + t²). So, multiplying all together:
|
| 73 |
+
|
| 74 |
+
Integral becomes �� [ (1 + t²) / (2t (6 - t)) ] * [ 1 / (1 + t²) ] dt from t=1 to t=5.
|
| 75 |
+
|
| 76 |
+
Simplify: The (1 + t²) cancels out. So, we have �� [1 / (2t (6 - t)) ] dt from 1 to 5.
|
| 77 |
+
|
| 78 |
+
So, the integral simplifies to (1/2) �� [1 / (t (6 - t)) ] dt from 1 to 5.
|
| 79 |
+
|
| 80 |
+
That's a much simpler integral. Now, let's compute �� [1 / (t (6 - t)) ] dt. We can use partial fractions here.
|
| 81 |
+
|
| 82 |
+
Express 1 / [ t (6 - t) ] as A/t + B/(6 - t). Let's find A and B.
|
| 83 |
+
|
| 84 |
+
1 = A(6 - t) + B t
|
| 85 |
+
|
| 86 |
+
Let t = 0: 1 = 6A => A = 1/6
|
| 87 |
+
|
| 88 |
+
Let t = 6: 1 = 6B => B = 1/6
|
| 89 |
+
|
| 90 |
+
Therefore, 1 / [ t (6 - t) ] = (1/6)/t + (1/6)/(6 - t)
|
| 91 |
+
|
| 92 |
+
Therefore, the integral becomes:
|
| 93 |
+
|
| 94 |
+
(1/2) �� [ (1/6)/t + (1/6)/(6 - t) ] dt from 1 to 5
|
| 95 |
+
|
| 96 |
+
Factor out 1/6:
|
| 97 |
+
|
| 98 |
+
(1/2)(1/6) �� [1/t + 1/(6 - t) ] dt = (1/12) �� [1/t + 1/(6 - t) ] dt
|
| 99 |
+
|
| 100 |
+
Integrate term by term:
|
| 101 |
+
|
| 102 |
+
∫1/t dt = ln |t| + C
|
| 103 |
+
|
| 104 |
+
∫1/(6 - t) dt = -ln |6 - t| + C
|
| 105 |
+
|
| 106 |
+
Therefore, the integral becomes:
|
| 107 |
+
|
| 108 |
+
(1/12) [ ln |t| - ln |6 - t| ] evaluated from 1 to 5.
|
| 109 |
+
|
| 110 |
+
So, evaluating from 1 to 5:
|
| 111 |
+
|
| 112 |
+
(1/12) [ (ln 5 - ln (6 - 5)) - (ln 1 - ln (6 - 1)) ]
|
| 113 |
+
|
| 114 |
+
Simplify:
|
| 115 |
+
|
| 116 |
+
First, at upper limit 5: ln 5 - ln 1 = ln 5 (since 6 - 5 = 1, ln 1 = 0)
|
| 117 |
+
|
| 118 |
+
At lower limit 1: ln 1 - ln 5 = 0 - ln 5 = -ln 5
|
| 119 |
+
|
| 120 |
+
Therefore, the expression becomes:
|
| 121 |
+
|
| 122 |
+
(1/12) [ (ln 5 - 0) - (0 - ln 5) ] = (1/12)(ln 5 + ln 5) = (1/12)(2 ln 5) = (1/6) ln 5
|
| 123 |
+
|
| 124 |
+
Therefore, the value of the integral is (1/6) ln 5. Let me check the steps again to ensure I didn't make any mistakes.
|
| 125 |
+
|
| 126 |
+
First substitution: t = tan x. That changed variables correctly. The limits from π/4 to arccos(1/√26) became 1 to 5. Then, substitution led to integral in terms of t, simplified to 1/(2t(6 - t)) dt. Partial fractions correctly done, coefficients A and B were both 1/6. Integrated to (1/12)(ln t - ln(6 - t)). Evaluated from 1 to 5, giving (1/12)[ln5 - 0 - (0 - ln5)] = (1/12)(2 ln5) = (1/6) ln5. Seems correct.
|
| 127 |
+
|
| 128 |
+
Alternatively, let me check with another substitution. Suppose instead of t = tan x, maybe another substitution. Let me see.
|
| 129 |
+
|
| 130 |
+
Original integral after simplifying sin2x:
|
| 131 |
+
|
| 132 |
+
(1/2) �� [1/(sinx (6 cosx - sinx))] dx. Let me try substitution u = 6 cosx - sinx.
|
| 133 |
+
|
| 134 |
+
Compute du/dx = -6 sinx - cosx. Hmm, not directly present in the integrand. But the integrand is 1/(sinx * u). Let me see:
|
| 135 |
+
|
| 136 |
+
If I write the integrand as 1/(sinx * u). If I have du = (-6 sinx - cosx) dx. Let me rearrange terms:
|
| 137 |
+
|
| 138 |
+
du = - sinx (6 + cotx) dx. Hmm, not sure. Alternatively, express the integrand in terms of u and du. Let me see:
|
| 139 |
+
|
| 140 |
+
But this seems more complicated. The previous substitution t = tan x worked smoothly, leading to a standard partial fraction integral. So I think the answer is (1/6) ln5. Therefore, the definite integral is (ln5)/6.
|
| 141 |
+
|
| 142 |
+
**Final Answer**
|
| 143 |
+
\boxed{\dfrac{\ln 5}{6}}
|
| 144 |
+
</think>
|
| 145 |
+
|
| 146 |
+
To calculate the definite integral:
|
| 147 |
+
|
| 148 |
+
$$
|
| 149 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6 - \tan x) \sin 2 x}
|
| 150 |
+
$$
|
| 151 |
+
|
| 152 |
+
we start by rewriting \(\sin 2x\) as \(2 \sin x \cos x\). This transforms the integral into:
|
| 153 |
+
|
| 154 |
+
$$
|
| 155 |
+
\int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{1}{(6 - \tan x) \cdot 2 \sin x \cos x} \, dx
|
| 156 |
+
$$
|
| 157 |
+
|
| 158 |
+
Simplifying the denominator using \(\tan x = \frac{\sin x}{\cos x}\), we get:
|
| 159 |
+
|
| 160 |
+
$$
|
| 161 |
+
\frac{1}{2 \sin x \cos x \left(6 - \frac{\sin x}{\cos x}\right)} = \frac{1}{2 \sin x (6 \cos x - \sin x)}
|
| 162 |
+
$$
|
| 163 |
+
|
| 164 |
+
Next, we use the substitution \(t = \tan x\), which gives \(dt = \sec^2 x \, dx\) or \(dx = \frac{dt}{1 + t^2}\). The limits of integration change from \(x = \pi/4\) (where \(t = 1\)) to \(x = \arccos(1/\sqrt{26})\) (where \(t = 5\)). Rewriting \(\sin 2x\) as \(\frac{2t}{1 + t^2}\), the integral becomes:
|
| 165 |
+
|
| 166 |
+
$$
|
| 167 |
+
\int_{1}^{5} \frac{1}{(6 - t) \cdot \frac{2t}{1 + t^2}} \cdot \frac{dt}{1 + t^2}
|
| 168 |
+
$$
|
| 169 |
+
|
| 170 |
+
Simplifying, we get:
|
| 171 |
+
|
| 172 |
+
$$
|
| 173 |
+
\frac{1}{2} \int_{1}^{5} \frac{1}{t(6 - t)} \, dt
|
| 174 |
+
$$
|
| 175 |
+
|
| 176 |
+
Using partial fractions, we decompose \(\frac{1}{t(6 - t)}\) into \(\frac{1}{6t} + \frac{1}{6(6 - t)}\). The integral then becomes:
|
| 177 |
+
|
| 178 |
+
$$
|
| 179 |
+
\frac{1}{12} \int_{1}^{5} \left(\frac{1}{t} + \frac{1}{6 - t}\right) \, dt
|
| 180 |
+
$$
|
| 181 |
+
|
| 182 |
+
Integrating term by term, we get:
|
| 183 |
+
|
| 184 |
+
$$
|
| 185 |
+
\frac{1}{12} \left[ \ln |t| - \ln |6 - t| \right]_{1}^{5}
|
| 186 |
+
$$
|
| 187 |
+
|
| 188 |
+
Evaluating this from 1 to 5:
|
| 189 |
+
|
| 190 |
+
$$
|
| 191 |
+
\frac{1}{12} \left[ (\ln 5 - \ln 1) - (\ln 1 - \ln 5) \right] = \frac{1}{12} (2 \ln 5) = \frac{1}{6} \ln 5
|
| 192 |
+
$$
|
| 193 |
+
|
| 194 |
+
Thus, the value of the integral is:
|
| 195 |
+
|
| 196 |
+
$$
|
| 197 |
+
\boxed{\dfrac{\ln 5}{6}}
|
| 198 |
+
```<|endoftext|>
|
| 199 |
+
|
| 200 |
+
📊 Stats: 80 masked, 2671 trained
|
| 201 |
+
|
| 202 |
+
📥 Loading student model: Qwen/Qwen2.5-1.5B
|
| 203 |
+
`torch_dtype` is deprecated! Use `dtype` instead!
|
| 204 |
+
✅ Student model loaded: 151936 vocab size
|
| 205 |
+
[2025-11-14 14:52:27,291] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 206 |
+
[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
|
| 207 |
+
[93m [WARNING] [0m async_io: please install the libaio-devel package with yum
|
| 208 |
+
[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
| 209 |
+
[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
|
| 210 |
+
[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3
|
| 211 |
+
[93m [WARNING] [0m using untested triton version (2.3.0), only 1.0.0 is known to be compatible
|
| 212 |
+
[2025-11-14 14:52:28,034] [INFO] [comm.py:637:init_distributed] cdb=None
|
| 213 |
+
[2025-11-14 14:52:28,034] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 214 |
+
|
| 215 |
+
============================================================
|
| 216 |
+
🎓 ENTROPY WEIGHTING ENABLED
|
| 217 |
+
============================================================
|
| 218 |
+
Teacher Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 219 |
+
Teacher Device: 2
|
| 220 |
+
Alpha (α): 2.0
|
| 221 |
+
Beta (β): 0.3
|
| 222 |
+
Top-K: 48
|
| 223 |
+
Teacher dtype: bfloat16
|
| 224 |
+
Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
|
| 225 |
+
============================================================
|
| 226 |
+
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
| 227 |
+
|
| 228 |
+
🎓 Loading teacher model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 229 |
+
📊 Entropy weighting params: α=2.0, β=0.3
|
| 230 |
+
💾 Entropy computation: top-k=48 (memory-efficient mode)
|
| 231 |
+
🖥️ Teacher target device(s): cuda:2
|
| 232 |
+
📡 Loading teacher on single GPU: cuda:2
|
| 233 |
+
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 47.60it/s]
|
| 234 |
+
✅ Teacher model loaded and frozen
|
| 235 |
+
✅ Teacher model loaded and frozen
|
| 236 |
+
|
| 237 |
+
🔍 Checking teacher tokenizer/model alignment...
|
| 238 |
+
📌 Teacher tokenizer vocab: 151665
|
| 239 |
+
📌 Teacher model vocab: 152064
|
| 240 |
+
⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...
|
| 241 |
+
✅ Teacher embeddings resized to 151665
|
| 242 |
+
|
| 243 |
+
📊 Student model vocab size: 151936
|
| 244 |
+
📊 Teacher model vocab size (after alignment): 151665
|
| 245 |
+
|
| 246 |
+
⚠️ Student/Teacher vocab size mismatch detected!
|
| 247 |
+
Teacher: 151665
|
| 248 |
+
Student: 151936
|
| 249 |
+
🔧 Resizing student embeddings to match teacher...
|
| 250 |
+
✅ Student embeddings resized to 151665
|
| 251 |
+
New student vocab size: 151665
|
| 252 |
+
|
| 253 |
+
============================================================
|
| 254 |
+
🎯 Final Vocab Alignment Complete
|
| 255 |
+
📊 Teacher vocab size: 151665
|
| 256 |
+
📊 Student vocab size: 151665
|
| 257 |
+
============================================================
|
| 258 |
+
|
| 259 |
+
🏋️ Starting training...
|
| 260 |
+
📊 Total training steps: 5666
|
| 261 |
+
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
|
| 262 |
+
Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
|
| 263 |
+
|
| 264 |
+
{'loss': 2.5591, 'grad_norm': 10.723748207092285, 'learning_rate': 2.3255813953488374e-07, 'avg_weight': 0.3910901114344597, 'epoch': 0.0}
|
| 265 |
+
{'loss': 2.4874, 'grad_norm': 10.057125091552734, 'learning_rate': 5.232558139534884e-07, 'avg_weight': 0.38843219727277756, 'epoch': 0.0}
|
| 266 |
+
{'loss': 2.4994, 'grad_norm': 8.863786697387695, 'learning_rate': 8.139534883720931e-07, 'avg_weight': 0.38760635405778887, 'epoch': 0.01}
|
| 267 |
+
{'loss': 2.4195, 'grad_norm': 8.823384284973145, 'learning_rate': 1.1046511627906977e-06, 'avg_weight': 0.3880971476435661, 'epoch': 0.01}
|
| 268 |
+
{'loss': 2.4098, 'grad_norm': 7.296765327453613, 'learning_rate': 1.3953488372093025e-06, 'avg_weight': 0.3801196217536926, 'epoch': 0.01}
|
| 269 |
+
|
| 270 |
+
{'eval_loss': 0.5676697492599487, 'eval_runtime': 177.1245, 'eval_samples_per_second': 2.586, 'eval_steps_per_second': 0.649, 'epoch': 0.01}
|
| 271 |
+
{'loss': 2.2295, 'grad_norm': 5.418520450592041, 'learning_rate': 1.686046511627907e-06, 'avg_weight': 0.36370994746685026, 'epoch': 0.01}
|
| 272 |
+
{'loss': 2.1721, 'grad_norm': 4.612456798553467, 'learning_rate': 1.976744186046512e-06, 'avg_weight': 0.35689852982759473, 'epoch': 0.01}
|
| 273 |
+
{'loss': 2.0765, 'grad_norm': 4.605122089385986, 'learning_rate': 2.2674418604651163e-06, 'avg_weight': 0.36093987375497816, 'epoch': 0.01}
|
| 274 |
+
{'loss': 2.0677, 'grad_norm': 4.118704795837402, 'learning_rate': 2.558139534883721e-06, 'avg_weight': 0.3577569782733917, 'epoch': 0.02}
|
| 275 |
+
{'loss': 2.1147, 'grad_norm': 3.7384188175201416, 'learning_rate': 2.848837209302326e-06, 'avg_weight': 0.3483647421002388, 'epoch': 0.02}
|
| 276 |
+
{'eval_loss': 0.48641934990882874, 'eval_runtime': 176.9466, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.02}
|
| 277 |
+
{'loss': 1.9462, 'grad_norm': 4.451317310333252, 'learning_rate': 3.1395348837209307e-06, 'avg_weight': 0.3532564118504524, 'epoch': 0.02}
|
| 278 |
+
{'loss': 1.8641, 'grad_norm': 3.378995180130005, 'learning_rate': 3.430232558139535e-06, 'avg_weight': 0.3521373629570007, 'epoch': 0.02}
|
| 279 |
+
{'loss': 1.878, 'grad_norm': 3.9663772583007812, 'learning_rate': 3.72093023255814e-06, 'avg_weight': 0.3486033886671066, 'epoch': 0.02}
|
| 280 |
+
{'loss': 1.8243, 'grad_norm': 3.9792895317077637, 'learning_rate': 4.011627906976744e-06, 'avg_weight': 0.34984882175922394, 'epoch': 0.02}
|
| 281 |
+
{'loss': 1.8915, 'grad_norm': 3.450552463531494, 'learning_rate': 4.302325581395349e-06, 'avg_weight': 0.34732189774513245, 'epoch': 0.03}
|
| 282 |
+
{'eval_loss': 0.44193005561828613, 'eval_runtime': 176.9698, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.03}
|
| 283 |
+
{'loss': 1.6694, 'grad_norm': 3.710488796234131, 'learning_rate': 4.593023255813954e-06, 'avg_weight': 0.3485285028815269, 'epoch': 0.03}
|
| 284 |
+
{'loss': 1.8175, 'grad_norm': 2.9457366466522217, 'learning_rate': 4.883720930232559e-06, 'avg_weight': 0.34327888786792754, 'epoch': 0.03}
|
| 285 |
+
{'loss': 1.799, 'grad_norm': 3.038442611694336, 'learning_rate': 4.999985296579241e-06, 'avg_weight': 0.3441050469875336, 'epoch': 0.03}
|
| 286 |
+
{'loss': 1.6772, 'grad_norm': 3.322579860687256, 'learning_rate': 4.999895442967599e-06, 'avg_weight': 0.34606429785490034, 'epoch': 0.03}
|
| 287 |
+
{'loss': 1.7592, 'grad_norm': 3.061025381088257, 'learning_rate': 4.9997239072437415e-06, 'avg_weight': 0.3396225184202194, 'epoch': 0.04}
|
| 288 |
+
{'eval_loss': 0.4209730625152588, 'eval_runtime': 176.9333, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.04}
|
| 289 |
+
{'loss': 1.663, 'grad_norm': 3.212209463119507, 'learning_rate': 4.999470695012462e-06, 'avg_weight': 0.3401555925607681, 'epoch': 0.04}
|
| 290 |
+
{'loss': 1.6741, 'grad_norm': 2.88710880279541, 'learning_rate': 4.999135814547269e-06, 'avg_weight': 0.3444334402680397, 'epoch': 0.04}
|
| 291 |
+
{'loss': 1.8002, 'grad_norm': 3.0338072776794434, 'learning_rate': 4.99871927679012e-06, 'avg_weight': 0.3414938300848007, 'epoch': 0.04}
|
| 292 |
+
{'loss': 1.6332, 'grad_norm': 2.6287529468536377, 'learning_rate': 4.998221095351058e-06, 'avg_weight': 0.3433367222547531, 'epoch': 0.04}
|
| 293 |
+
{'loss': 1.6309, 'grad_norm': 3.1718740463256836, 'learning_rate': 4.997641286507766e-06, 'avg_weight': 0.3451731190085411, 'epoch': 0.04}
|
| 294 |
+
{'eval_loss': 0.4092359244823456, 'eval_runtime': 176.9574, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.04}
|
| 295 |
+
{'loss': 1.6804, 'grad_norm': 3.1825239658355713, 'learning_rate': 4.996979869205043e-06, 'avg_weight': 0.3412734940648079, 'epoch': 0.05}
|
| 296 |
+
{'loss': 1.6437, 'grad_norm': 3.1620748043060303, 'learning_rate': 4.996236865054177e-06, 'avg_weight': 0.34043743312358854, 'epoch': 0.05}
|
| 297 |
+
{'loss': 1.6659, 'grad_norm': 3.254448175430298, 'learning_rate': 4.995412298332243e-06, 'avg_weight': 0.3406913295388222, 'epoch': 0.05}
|
| 298 |
+
{'loss': 1.6698, 'grad_norm': 3.2264506816864014, 'learning_rate': 4.994506195981309e-06, 'avg_weight': 0.34027899503707887, 'epoch': 0.05}
|
| 299 |
+
{'loss': 1.5531, 'grad_norm': 3.0454981327056885, 'learning_rate': 4.9935185876075525e-06, 'avg_weight': 0.3385377749800682, 'epoch': 0.05}
|
| 300 |
+
{'eval_loss': 0.4014018177986145, 'eval_runtime': 176.9922, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.05}
|
| 301 |
+
{'loss': 1.5782, 'grad_norm': 2.929764747619629, 'learning_rate': 4.992449505480301e-06, 'avg_weight': 0.34045338034629824, 'epoch': 0.05}
|
| 302 |
+
{'loss': 1.5912, 'grad_norm': 3.0769646167755127, 'learning_rate': 4.991298984530968e-06, 'avg_weight': 0.34139142483472823, 'epoch': 0.06}
|
| 303 |
+
{'loss': 1.6423, 'grad_norm': 2.820838689804077, 'learning_rate': 4.9900670623519185e-06, 'avg_weight': 0.3394546613097191, 'epoch': 0.06}
|
| 304 |
+
{'loss': 1.6039, 'grad_norm': 2.8948822021484375, 'learning_rate': 4.98875377919524e-06, 'avg_weight': 0.33894784599542616, 'epoch': 0.06}
|
| 305 |
+
{'loss': 1.5986, 'grad_norm': 2.98633074760437, 'learning_rate': 4.987359177971422e-06, 'avg_weight': 0.3411692440509796, 'epoch': 0.06}
|
| 306 |
+
{'eval_loss': 0.39528319239616394, 'eval_runtime': 176.9244, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.06}
|
| 307 |
+
{'loss': 1.5965, 'grad_norm': 2.802412986755371, 'learning_rate': 4.985883304247961e-06, 'avg_weight': 0.3383798971772194, 'epoch': 0.06}
|
| 308 |
+
{'loss': 1.5762, 'grad_norm': 3.027007818222046, 'learning_rate': 4.984326206247866e-06, 'avg_weight': 0.33840133994817734, 'epoch': 0.07}
|
| 309 |
+
{'loss': 1.5978, 'grad_norm': 2.9903464317321777, 'learning_rate': 4.982687934848086e-06, 'avg_weight': 0.34024504870176314, 'epoch': 0.07}
|
| 310 |
+
{'loss': 1.6444, 'grad_norm': 2.9462625980377197, 'learning_rate': 4.980968543577849e-06, 'avg_weight': 0.33813936412334444, 'epoch': 0.07}
|
| 311 |
+
{'loss': 1.5458, 'grad_norm': 2.9322121143341064, 'learning_rate': 4.979168088616907e-06, 'avg_weight': 0.3389531776309013, 'epoch': 0.07}
|
| 312 |
+
{'eval_loss': 0.39106011390686035, 'eval_runtime': 176.8862, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.07}
|
| 313 |
+
{'loss': 1.5588, 'grad_norm': 3.813068389892578, 'learning_rate': 4.977286628793707e-06, 'avg_weight': 0.3413571178913116, 'epoch': 0.07}
|
| 314 |
+
{'loss': 1.7041, 'grad_norm': 3.0992965698242188, 'learning_rate': 4.975324225583465e-06, 'avg_weight': 0.3345016598701477, 'epoch': 0.07}
|
| 315 |
+
{'loss': 1.5372, 'grad_norm': 2.912554979324341, 'learning_rate': 4.973280943106158e-06, 'avg_weight': 0.33904497176408765, 'epoch': 0.08}
|
| 316 |
+
{'loss': 1.6412, 'grad_norm': 3.341759443283081, 'learning_rate': 4.971156848124429e-06, 'avg_weight': 0.33656598031520846, 'epoch': 0.08}
|
| 317 |
+
{'loss': 1.6438, 'grad_norm': 2.90909481048584, 'learning_rate': 4.968952010041408e-06, 'avg_weight': 0.3322950556874275, 'epoch': 0.08}
|
| 318 |
+
{'eval_loss': 0.38709789514541626, 'eval_runtime': 176.9433, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.08}
|
| 319 |
+
{'loss': 1.5637, 'grad_norm': 3.0614888668060303, 'learning_rate': 4.96666650089844e-06, 'avg_weight': 0.33701644390821456, 'epoch': 0.08}
|
| 320 |
+
{'loss': 1.4937, 'grad_norm': 3.1825013160705566, 'learning_rate': 4.964300395372733e-06, 'avg_weight': 0.33940560817718507, 'epoch': 0.08}
|
| 321 |
+
{'loss': 1.6914, 'grad_norm': 3.442878007888794, 'learning_rate': 4.961853770774921e-06, 'avg_weight': 0.33430094122886655, 'epoch': 0.08}
|
| 322 |
+
{'loss': 1.5223, 'grad_norm': 3.1587722301483154, 'learning_rate': 4.959326707046532e-06, 'avg_weight': 0.33932786285877226, 'epoch': 0.09}
|
| 323 |
+
{'loss': 1.5314, 'grad_norm': 3.011597156524658, 'learning_rate': 4.956719286757381e-06, 'avg_weight': 0.3337151423096657, 'epoch': 0.09}
|
| 324 |
+
{'eval_loss': 0.38383862376213074, 'eval_runtime': 176.8929, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.09}
|
| 325 |
+
{'loss': 1.6525, 'grad_norm': 3.051734447479248, 'learning_rate': 4.9540315951028695e-06, 'avg_weight': 0.336119769513607, 'epoch': 0.09}
|
| 326 |
+
{'loss': 1.6499, 'grad_norm': 3.2344253063201904, 'learning_rate': 4.951263719901203e-06, 'avg_weight': 0.33603269308805467, 'epoch': 0.09}
|
| 327 |
+
{'loss': 1.6005, 'grad_norm': 2.8890116214752197, 'learning_rate': 4.948415751590521e-06, 'avg_weight': 0.3372333973646164, 'epoch': 0.09}
|
| 328 |
+
{'loss': 1.5957, 'grad_norm': 3.270925998687744, 'learning_rate': 4.945487783225942e-06, 'avg_weight': 0.3381495177745819, 'epoch': 0.1}
|
| 329 |
+
{'loss': 1.509, 'grad_norm': 3.0765645503997803, 'learning_rate': 4.9424799104765245e-06, 'avg_weight': 0.33853849917650225, 'epoch': 0.1}
|
| 330 |
+
{'eval_loss': 0.3811159133911133, 'eval_runtime': 176.9156, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.1}
|
| 331 |
+
{'loss': 1.5422, 'grad_norm': 2.94732928276062, 'learning_rate': 4.939392231622136e-06, 'avg_weight': 0.3360708475112915, 'epoch': 0.1}
|
| 332 |
+
{'loss': 1.6281, 'grad_norm': 3.1645963191986084, 'learning_rate': 4.9362248475502515e-06, 'avg_weight': 0.3375927582383156, 'epoch': 0.1}
|
| 333 |
+
{'loss': 1.5221, 'grad_norm': 3.1172220706939697, 'learning_rate': 4.932977861752646e-06, 'avg_weight': 0.3341860607266426, 'epoch': 0.1}
|
| 334 |
+
{'loss': 1.5929, 'grad_norm': 2.895547866821289, 'learning_rate': 4.929651380322019e-06, 'avg_weight': 0.3337728187441826, 'epoch': 0.1}
|
| 335 |
+
{'loss': 1.5591, 'grad_norm': 3.267468214035034, 'learning_rate': 4.9262455119485295e-06, 'avg_weight': 0.3368815451860428, 'epoch': 0.11}
|
| 336 |
+
{'eval_loss': 0.3784671425819397, 'eval_runtime': 176.9466, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.11}
|
train/wandb/run-20251114_145219-w9xre5r3/files/requirements.txt
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
mpmath==1.3.0
|
| 2 |
+
typing_extensions==4.15.0
|
| 3 |
+
Jinja2==3.1.6
|
| 4 |
+
multidict==6.7.0
|
| 5 |
+
dill==0.3.8
|
| 6 |
+
charset-normalizer==3.4.4
|
| 7 |
+
yarl==1.22.0
|
| 8 |
+
aiosignal==1.4.0
|
| 9 |
+
accelerate==1.11.0
|
| 10 |
+
omegaconf==2.3.0
|
| 11 |
+
msgpack==1.1.2
|
| 12 |
+
codetiming==1.4.0
|
| 13 |
+
click==8.2.1
|
| 14 |
+
referencing==0.37.0
|
| 15 |
+
opentelemetry-api==1.38.0
|
| 16 |
+
opencensus==0.11.4
|
| 17 |
+
sympy==1.14.0
|
| 18 |
+
nvidia-cuda-runtime-cu12==12.1.105
|
| 19 |
+
nvidia-cusparse-cu12==12.1.0.106
|
| 20 |
+
torch==2.3.0+cu121
|
| 21 |
+
PyYAML==6.0.3
|
| 22 |
+
pyarrow-hotfix==0.7
|
| 23 |
+
exceptiongroup==1.3.0
|
| 24 |
+
multiprocess==0.70.16
|
| 25 |
+
regex==2025.11.3
|
| 26 |
+
markdown-it-py==4.0.0
|
| 27 |
+
py-cpuinfo==9.0.0
|
| 28 |
+
smmap==5.0.2
|
| 29 |
+
sentry-sdk==2.43.0
|
| 30 |
+
rpds-py==0.28.0
|
| 31 |
+
tokenizers==0.22.1
|
| 32 |
+
antlr4-python3-runtime==4.9.3
|
| 33 |
+
pybind11==3.0.1
|
| 34 |
+
Markdown==3.10
|
| 35 |
+
cloudpickle==3.1.2
|
| 36 |
+
pyasn1_modules==0.4.2
|
| 37 |
+
wheel==0.45.1
|
| 38 |
+
urllib3==2.5.0
|
| 39 |
+
tzdata==2025.2
|
| 40 |
+
pyarrow==22.0.0
|
| 41 |
+
certifi==2025.10.5
|
| 42 |
+
typer-slim==0.20.0
|
| 43 |
+
huggingface-hub==0.36.0
|
| 44 |
+
nvidia-ml-py==13.580.82
|
| 45 |
+
pydantic==2.12.4
|
| 46 |
+
deepspeed==0.14.4
|
| 47 |
+
platformdirs==4.5.0
|
| 48 |
+
sentencepiece==0.2.1
|
| 49 |
+
trl==0.25.0
|
| 50 |
+
tensorboard-data-server==0.7.2
|
| 51 |
+
googleapis-common-protos==1.72.0
|
| 52 |
+
hydra-core==1.3.2
|
| 53 |
+
jsonschema-specifications==2025.9.1
|
| 54 |
+
tensordict==0.10.0
|
| 55 |
+
opentelemetry-semantic-conventions==0.59b0
|
| 56 |
+
opentelemetry-exporter-prometheus==0.59b0
|
| 57 |
+
nvidia-cublas-cu12==12.1.3.1
|
| 58 |
+
frozenlist==1.8.0
|
| 59 |
+
sniffio==1.3.1
|
| 60 |
+
packaging==25.0
|
| 61 |
+
h11==0.16.0
|
| 62 |
+
async-timeout==5.0.1
|
| 63 |
+
anyio==4.11.0
|
| 64 |
+
pandas==2.3.3
|
| 65 |
+
httpx==0.28.1
|
| 66 |
+
aiohttp==3.13.2
|
| 67 |
+
typeguard==4.4.4
|
| 68 |
+
Pygments==2.19.2
|
| 69 |
+
docstring_parser==0.17.0
|
| 70 |
+
hjson==3.1.0
|
| 71 |
+
pydantic_core==2.41.5
|
| 72 |
+
ninja==1.13.0
|
| 73 |
+
transformers==4.57.1
|
| 74 |
+
datasets==4.4.1
|
| 75 |
+
zipp==3.23.0
|
| 76 |
+
wrapt==2.0.1
|
| 77 |
+
Werkzeug==3.1.3
|
| 78 |
+
pyvers==0.1.0
|
| 79 |
+
prometheus_client==0.23.1
|
| 80 |
+
grpcio==1.76.0
|
| 81 |
+
cachetools==6.2.2
|
| 82 |
+
smart_open==7.5.0
|
| 83 |
+
rsa==4.9.1
|
| 84 |
+
aiohttp-cors==0.8.1
|
| 85 |
+
opentelemetry-sdk==1.38.0
|
| 86 |
+
nvidia-curand-cu12==10.3.2.106
|
| 87 |
+
fsspec==2024.5.0
|
| 88 |
+
requests==2.32.5
|
| 89 |
+
python-dateutil==2.9.0.post0
|
| 90 |
+
py-spy==0.4.1
|
| 91 |
+
safetensors==0.6.2
|
| 92 |
+
distlib==0.4.0
|
| 93 |
+
psutil==7.1.3
|
| 94 |
+
colorful==0.5.8
|
| 95 |
+
rich==14.2.0
|
| 96 |
+
tyro==0.9.35
|
| 97 |
+
protobuf==6.33.0
|
| 98 |
+
wandb==0.22.3
|
| 99 |
+
pyasn1==0.6.1
|
| 100 |
+
opentelemetry-proto==1.38.0
|
| 101 |
+
torchdata==0.11.0
|
| 102 |
+
pip==25.2
|
| 103 |
+
nvidia-cufft-cu12==11.0.2.54
|
| 104 |
+
nvidia-cuda-nvrtc-cu12==12.1.105
|
| 105 |
+
nvidia-cuda-cupti-cu12==12.1.105
|
| 106 |
+
numpy==1.26.4
|
| 107 |
+
nvidia-cudnn-cu12==8.9.2.26
|
| 108 |
+
nvidia-cusolver-cu12==11.4.5.107
|
| 109 |
+
torchvision==0.18.0+cu121
|
| 110 |
+
torchaudio==2.3.0+cu121
|
| 111 |
+
six==1.17.0
|
| 112 |
+
aiohappyeyeballs==2.6.1
|
| 113 |
+
httpcore==1.0.9
|
| 114 |
+
gitdb==4.0.12
|
| 115 |
+
virtualenv==20.35.4
|
| 116 |
+
orjson==3.11.4
|
| 117 |
+
tensorboard==2.20.0
|
| 118 |
+
google-auth==2.43.0
|
| 119 |
+
verl==0.6.0
|
| 120 |
+
setuptools==80.9.0
|
| 121 |
+
nvidia-nvjitlink-cu12==12.9.86
|
| 122 |
+
nvidia-nccl-cu12==2.20.5
|
| 123 |
+
networkx==3.3
|
| 124 |
+
xxhash==3.6.0
|
| 125 |
+
tqdm==4.67.1
|
| 126 |
+
shellingham==1.5.4
|
| 127 |
+
propcache==0.4.1
|
| 128 |
+
idna==3.11
|
| 129 |
+
hf-xet==1.2.0
|
| 130 |
+
attrs==25.4.0
|
| 131 |
+
shtab==1.7.2
|
| 132 |
+
mdurl==0.1.2
|
| 133 |
+
GitPython==3.1.45
|
| 134 |
+
pylatexenc==2.10
|
| 135 |
+
opencensus-context==0.1.3
|
| 136 |
+
absl-py==2.3.1
|
| 137 |
+
importlib_metadata==8.7.0
|
| 138 |
+
jsonschema==4.25.1
|
| 139 |
+
google-api-core==2.28.1
|
| 140 |
+
ray==2.51.1
|
| 141 |
+
pillow==11.3.0
|
| 142 |
+
nvidia-nvtx-cu12==12.1.105
|
| 143 |
+
MarkupSafe==2.1.5
|
| 144 |
+
filelock==3.19.1
|
| 145 |
+
triton==2.3.0
|
| 146 |
+
pytz==2025.2
|
| 147 |
+
peft==0.12.0
|
| 148 |
+
typing-inspection==0.4.2
|
| 149 |
+
annotated-types==0.7.0
|
| 150 |
+
proto-plus==1.26.1
|
train/wandb/run-20251114_145219-w9xre5r3/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
|
| 3 |
+
"python": "CPython 3.10.19",
|
| 4 |
+
"startedAt": "2025-11-14T06:52:19.650156Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--model_name",
|
| 7 |
+
"Qwen/Qwen2.5-1.5B",
|
| 8 |
+
"--dataset_path",
|
| 9 |
+
"./datasets/openr1/Openr1-Math-46k-8192.jsonl",
|
| 10 |
+
"--output_dir",
|
| 11 |
+
"./model_sft_save/Qwen2.5-1.5B-Entropy-solution",
|
| 12 |
+
"--batch_size",
|
| 13 |
+
"2",
|
| 14 |
+
"--grad_accum",
|
| 15 |
+
"4",
|
| 16 |
+
"--learning_rate",
|
| 17 |
+
"5e-6",
|
| 18 |
+
"--epochs",
|
| 19 |
+
"1",
|
| 20 |
+
"--use_entropy_weighting",
|
| 21 |
+
"--teacher_model_path",
|
| 22 |
+
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
|
| 23 |
+
"--entropy_weight_alpha",
|
| 24 |
+
"2.0",
|
| 25 |
+
"--entropy_weight_beta",
|
| 26 |
+
"0.3",
|
| 27 |
+
"--teacher_dtype",
|
| 28 |
+
"bfloat16",
|
| 29 |
+
"--entropy_top_k",
|
| 30 |
+
"48",
|
| 31 |
+
"--teacher_device_ids",
|
| 32 |
+
"2",
|
| 33 |
+
"--use_deepspeed",
|
| 34 |
+
"--deepspeed_config",
|
| 35 |
+
"deepspeed/dp_stage2.json",
|
| 36 |
+
"--use_wandb",
|
| 37 |
+
"--wandb_project",
|
| 38 |
+
"qwen-math-entropy-sft",
|
| 39 |
+
"--wandb_run_name",
|
| 40 |
+
"qwen2.5-1.5b-46k-entropy-solution"
|
| 41 |
+
],
|
| 42 |
+
"program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py",
|
| 43 |
+
"codePath": "train_qwen_46k_weight.py",
|
| 44 |
+
"codePathLocal": "train_qwen_46k_weight.py",
|
| 45 |
+
"email": "yaning1001@gmail.com",
|
| 46 |
+
"root": "/public/home/lshi/yoAI/projects/Online_CL/train",
|
| 47 |
+
"host": "gpu-h100-07",
|
| 48 |
+
"executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
|
| 49 |
+
"cpu_count": 96,
|
| 50 |
+
"cpu_count_logical": 96,
|
| 51 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 52 |
+
"gpu_count": 8,
|
| 53 |
+
"disk": {
|
| 54 |
+
"/": {
|
| 55 |
+
"total": "469407801344",
|
| 56 |
+
"used": "288248733696"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"memory": {
|
| 60 |
+
"total": "2164142350336"
|
| 61 |
+
},
|
| 62 |
+
"gpu_nvidia": [
|
| 63 |
+
{
|
| 64 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 65 |
+
"memoryTotal": "85520809984",
|
| 66 |
+
"cudaCores": 16896,
|
| 67 |
+
"architecture": "Hopper",
|
| 68 |
+
"uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 72 |
+
"memoryTotal": "85520809984",
|
| 73 |
+
"cudaCores": 16896,
|
| 74 |
+
"architecture": "Hopper",
|
| 75 |
+
"uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 79 |
+
"memoryTotal": "85520809984",
|
| 80 |
+
"cudaCores": 16896,
|
| 81 |
+
"architecture": "Hopper",
|
| 82 |
+
"uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 86 |
+
"memoryTotal": "85520809984",
|
| 87 |
+
"cudaCores": 16896,
|
| 88 |
+
"architecture": "Hopper",
|
| 89 |
+
"uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 93 |
+
"memoryTotal": "85520809984",
|
| 94 |
+
"cudaCores": 16896,
|
| 95 |
+
"architecture": "Hopper",
|
| 96 |
+
"uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 100 |
+
"memoryTotal": "85520809984",
|
| 101 |
+
"cudaCores": 16896,
|
| 102 |
+
"architecture": "Hopper",
|
| 103 |
+
"uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 107 |
+
"memoryTotal": "85520809984",
|
| 108 |
+
"cudaCores": 16896,
|
| 109 |
+
"architecture": "Hopper",
|
| 110 |
+
"uuid": "GPU-23628f74-fede-6431-ae15-2764fce29130"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 114 |
+
"memoryTotal": "85520809984",
|
| 115 |
+
"cudaCores": 16896,
|
| 116 |
+
"architecture": "Hopper",
|
| 117 |
+
"uuid": "GPU-d18d570f-dd0f-0ff6-3401-561c9e799136"
|
| 118 |
+
}
|
| 119 |
+
],
|
| 120 |
+
"cudaVersion": "12.4",
|
| 121 |
+
"slurm": {
|
| 122 |
+
"home": "/opt/gridview/slurm",
|
| 123 |
+
"pmix_direct_conn": "true",
|
| 124 |
+
"pmix_direct_conn_early": "false",
|
| 125 |
+
"pmix_direct_conn_ucx": "false",
|
| 126 |
+
"pmix_timeout": "3000"
|
| 127 |
+
},
|
| 128 |
+
"writerId": "4ns8zmo5ar2v4v1bdwhe5waa6417g92a"
|
| 129 |
+
}
|
train/wandb/run-20251114_145219-w9xre5r3/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"train/avg_weight":0.3368815451860428,"train/epoch":0.10587612493382742,"eval/samples_per_second":2.588,"eval/steps_per_second":0.65,"_wandb":{"runtime":5180},"train/global_step":300,"train/learning_rate":4.9262455119485295e-06,"train/grad_norm":3.267468214035034,"eval/runtime":176.9466,"_timestamp":1.7631082634970565e+09,"_step":71,"eval/loss":0.3784671425819397,"_runtime":5180,"train/loss":1.5591}
|
train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-11-14T14:52:19.890537747+08:00","level":"INFO","msg":"stream: starting","core version":"0.22.3"}
|
| 2 |
+
{"time":"2025-11-14T14:52:22.294264931+08:00","level":"INFO","msg":"stream: created new stream","id":"w9xre5r3"}
|
| 3 |
+
{"time":"2025-11-14T14:52:22.294429698+08:00","level":"INFO","msg":"handler: started","stream_id":"w9xre5r3"}
|
| 4 |
+
{"time":"2025-11-14T14:52:22.295381437+08:00","level":"INFO","msg":"stream: started","id":"w9xre5r3"}
|
| 5 |
+
{"time":"2025-11-14T14:52:22.295391392+08:00","level":"INFO","msg":"writer: started","stream_id":"w9xre5r3"}
|
| 6 |
+
{"time":"2025-11-14T14:52:22.295405705+08:00","level":"INFO","msg":"sender: started","stream_id":"w9xre5r3"}
|
| 7 |
+
{"time":"2025-11-14T15:49:09.252958086+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/yaning1001-dartmouth-college/qwen-math-entropy-sft/w9xre5r3/file_stream\": unexpected EOF"}
|
| 8 |
+
{"time":"2025-11-14T16:12:36.32995217+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/yaning1001-dartmouth-college/qwen-math-entropy-sft/w9xre5r3/file_stream\": unexpected EOF"}
|
| 9 |
+
{"time":"2025-11-14T16:18:43.711810258+08:00","level":"INFO","msg":"stream: closing","id":"w9xre5r3"}
|
train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Current SDK version is 0.22.3
|
| 2 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Configure stats pid to 1134562
|
| 3 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from /public/home/lshi/.config/wandb/settings
|
| 4 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from /public/home/lshi/yoAI/projects/Online_CL/train/wandb/settings
|
| 5 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 6 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:setup_run_log_directory():706] Logging user logs to /public/home/lshi/yoAI/projects/Online_CL/train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log
|
| 7 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:setup_run_log_directory():707] Logging internal logs to /public/home/lshi/yoAI/projects/Online_CL/train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log
|
| 8 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():833] calling init triggers
|
| 9 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():838] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():881] starting backend
|
| 12 |
+
2025-11-14 14:52:19,882 INFO MainThread:1134562 [wandb_init.py:init():884] sending inform_init request
|
| 13 |
+
2025-11-14 14:52:19,886 INFO MainThread:1134562 [wandb_init.py:init():892] backend started and connected
|
| 14 |
+
2025-11-14 14:52:19,887 INFO MainThread:1134562 [wandb_init.py:init():962] updated telemetry
|
| 15 |
+
2025-11-14 14:52:19,888 INFO MainThread:1134562 [wandb_init.py:init():986] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-11-14 14:52:22,838 INFO MainThread:1134562 [wandb_init.py:init():1033] starting run threads in backend
|
| 17 |
+
2025-11-14 14:52:22,926 INFO MainThread:1134562 [wandb_run.py:_console_start():2506] atexit reg
|
| 18 |
+
2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2354] redirect: wrap_raw
|
| 19 |
+
2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2423] Wrapping output streams.
|
| 20 |
+
2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2446] Redirects installed.
|
| 21 |
+
2025-11-14 14:52:22,929 INFO MainThread:1134562 [wandb_init.py:init():1073] run started, returning control to user process
|
| 22 |
+
2025-11-14 14:53:10,780 INFO MainThread:1134562 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 151665, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': True, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': 'bfloat16', 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 8192, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 
'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'Qwen/Qwen2.5-1.5B', 'transformers_version': '4.57.1', 'model_type': 'qwen2', 'use_mrope': False, 'tf_legacy_loss': False, 'use_bfloat16': False, 'output_attentions': False, 'output_dir': './model_sft_save/Qwen2.5-1.5B-Entropy-solution', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_14-52-26_gpu-h100-07', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 50, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 
'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'qwen2.5-1.5b-46k-entropy-solution', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': 'deepspeed/dp_stage2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': 
'', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'model_init_kwargs': None, 'chat_template_path': None, 'dataset_text_field': None, 'dataset_kwargs': None, 'dataset_num_proc': None, 'eos_token': '<EOS_TOKEN>', 'pad_token': '<PAD_TOKEN>', 'packing': False, 'packing_strategy': 'bfd', 'padding_free': False, 'pad_to_multiple_of': None, 'eval_packing': None, 'completion_only_loss': None, 'assistant_only_loss': False, 'loss_type': 'nll', 'activation_offloading': False, 'teacher_model_path': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 'entropy_weight_alpha': 2.0, 'entropy_weight_beta': 0.3, 'use_entropy_weighting': True, 'teacher_dtype': 'bfloat16', 'entropy_top_k': 48, 'teacher_device_ids': '2'}
|
| 23 |
+
2025-11-14 14:53:10,783 INFO MainThread:1134562 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1543298048 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x15291a429570>>
|
| 24 |
+
2025-11-14 14:53:10,784 INFO MainThread:1134562 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 1543298048 None
|
| 25 |
+
2025-11-14 16:18:43,711 INFO wandb-AsyncioManager-main:1134562 [service_client.py:_forward_responses():80] Reached EOF.
|
| 26 |
+
2025-11-14 16:18:43,711 INFO wandb-AsyncioManager-main:1134562 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
train/wandb/run-20251114_145222-i8zbx8vz/files/config.yaml
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.22.3
|
| 4 |
+
e:
|
| 5 |
+
2utx15k5rol3shdkavsb8ec17gbbtrpd:
|
| 6 |
+
args:
|
| 7 |
+
- --model_name
|
| 8 |
+
- Qwen/Qwen2.5-1.5B
|
| 9 |
+
- --dataset_path
|
| 10 |
+
- ./datasets/openr1/Openr1-Math-46k-8192.jsonl
|
| 11 |
+
- --output_dir
|
| 12 |
+
- ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
|
| 13 |
+
- --batch_size
|
| 14 |
+
- "2"
|
| 15 |
+
- --grad_accum
|
| 16 |
+
- "4"
|
| 17 |
+
- --learning_rate
|
| 18 |
+
- "5e-6"
|
| 19 |
+
- --epochs
|
| 20 |
+
- "1"
|
| 21 |
+
- --use_entropy_weighting
|
| 22 |
+
- --teacher_model_path
|
| 23 |
+
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
| 24 |
+
- --entropy_weight_alpha
|
| 25 |
+
- "2.0"
|
| 26 |
+
- --entropy_weight_beta
|
| 27 |
+
- "0.3"
|
| 28 |
+
- --teacher_dtype
|
| 29 |
+
- bfloat16
|
| 30 |
+
- --entropy_top_k
|
| 31 |
+
- "48"
|
| 32 |
+
- --teacher_device_ids
|
| 33 |
+
- "2"
|
| 34 |
+
- --use_deepspeed
|
| 35 |
+
- --deepspeed_config
|
| 36 |
+
- deepspeed/dp_stage2.json
|
| 37 |
+
- --use_wandb
|
| 38 |
+
- --wandb_project
|
| 39 |
+
- qwen-math-entropy-sft
|
| 40 |
+
- --wandb_run_name
|
| 41 |
+
- qwen2.5-1.5b-46k-entropy-solution
|
| 42 |
+
codePath: train_qwen_46k_weight.py
|
| 43 |
+
codePathLocal: train_qwen_46k_weight.py
|
| 44 |
+
cpu_count: 96
|
| 45 |
+
cpu_count_logical: 96
|
| 46 |
+
cudaVersion: "12.4"
|
| 47 |
+
disk:
|
| 48 |
+
/:
|
| 49 |
+
total: "469407801344"
|
| 50 |
+
used: "288248737792"
|
| 51 |
+
email: yaning1001@gmail.com
|
| 52 |
+
executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
|
| 53 |
+
gpu: NVIDIA H100 80GB HBM3
|
| 54 |
+
gpu_count: 8
|
| 55 |
+
gpu_nvidia:
|
| 56 |
+
- architecture: Hopper
|
| 57 |
+
cudaCores: 16896
|
| 58 |
+
memoryTotal: "85520809984"
|
| 59 |
+
name: NVIDIA H100 80GB HBM3
|
| 60 |
+
uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
|
| 61 |
+
- architecture: Hopper
|
| 62 |
+
cudaCores: 16896
|
| 63 |
+
memoryTotal: "85520809984"
|
| 64 |
+
name: NVIDIA H100 80GB HBM3
|
| 65 |
+
uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
|
| 66 |
+
- architecture: Hopper
|
| 67 |
+
cudaCores: 16896
|
| 68 |
+
memoryTotal: "85520809984"
|
| 69 |
+
name: NVIDIA H100 80GB HBM3
|
| 70 |
+
uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
|
| 71 |
+
- architecture: Hopper
|
| 72 |
+
cudaCores: 16896
|
| 73 |
+
memoryTotal: "85520809984"
|
| 74 |
+
name: NVIDIA H100 80GB HBM3
|
| 75 |
+
uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
|
| 76 |
+
- architecture: Hopper
|
| 77 |
+
cudaCores: 16896
|
| 78 |
+
memoryTotal: "85520809984"
|
| 79 |
+
name: NVIDIA H100 80GB HBM3
|
| 80 |
+
uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
|
| 81 |
+
- architecture: Hopper
|
| 82 |
+
cudaCores: 16896
|
| 83 |
+
memoryTotal: "85520809984"
|
| 84 |
+
name: NVIDIA H100 80GB HBM3
|
| 85 |
+
uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
|
| 86 |
+
- architecture: Hopper
|
| 87 |
+
cudaCores: 16896
|
| 88 |
+
memoryTotal: "85520809984"
|
| 89 |
+
name: NVIDIA H100 80GB HBM3
|
| 90 |
+
uuid: GPU-23628f74-fede-6431-ae15-2764fce29130
|
| 91 |
+
- architecture: Hopper
|
| 92 |
+
cudaCores: 16896
|
| 93 |
+
memoryTotal: "85520809984"
|
| 94 |
+
name: NVIDIA H100 80GB HBM3
|
| 95 |
+
uuid: GPU-d18d570f-dd0f-0ff6-3401-561c9e799136
|
| 96 |
+
host: gpu-h100-07
|
| 97 |
+
memory:
|
| 98 |
+
total: "2164142350336"
|
| 99 |
+
os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
|
| 100 |
+
program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
|
| 101 |
+
python: CPython 3.10.19
|
| 102 |
+
root: /public/home/lshi/yoAI/projects/Online_CL/train
|
| 103 |
+
slurm:
|
| 104 |
+
home: /opt/gridview/slurm
|
| 105 |
+
pmix_direct_conn: "true"
|
| 106 |
+
pmix_direct_conn_early: "false"
|
| 107 |
+
pmix_direct_conn_ucx: "false"
|
| 108 |
+
pmix_timeout: "3000"
|
| 109 |
+
startedAt: "2025-11-14T06:52:22.127959Z"
|
| 110 |
+
writerId: 2utx15k5rol3shdkavsb8ec17gbbtrpd
|
| 111 |
+
m: []
|
| 112 |
+
python_version: 3.10.19
|
| 113 |
+
t:
|
| 114 |
+
"1":
|
| 115 |
+
- 1
|
| 116 |
+
- 11
|
| 117 |
+
- 41
|
| 118 |
+
- 49
|
| 119 |
+
- 51
|
| 120 |
+
- 71
|
| 121 |
+
- 84
|
| 122 |
+
- 98
|
| 123 |
+
- 105
|
| 124 |
+
"2":
|
| 125 |
+
- 1
|
| 126 |
+
- 11
|
| 127 |
+
- 41
|
| 128 |
+
- 49
|
| 129 |
+
- 51
|
| 130 |
+
- 71
|
| 131 |
+
- 84
|
| 132 |
+
- 98
|
| 133 |
+
- 105
|
| 134 |
+
"3":
|
| 135 |
+
- 13
|
| 136 |
+
"4": 3.10.19
|
| 137 |
+
"5": 0.22.3
|
| 138 |
+
"6": 4.57.1
|
| 139 |
+
"10":
|
| 140 |
+
- 20
|
| 141 |
+
"12": 0.22.3
|
| 142 |
+
"13": linux-x86_64
|