Yaning1001 committed on
Commit
232ac88
·
verified ·
1 Parent(s): f192b46

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. math_evaluation/__pycache__/evaluate.cpython-310.pyc +0 -0
  2. math_evaluation/__pycache__/trajectory.cpython-310.pyc +0 -0
  3. math_evaluation/outputs/Qwen/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  4. math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
  5. math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  6. math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  7. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/LUFFY/data/save_model/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  8. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
  9. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  10. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1_qwen-boxed_metrics.json +13 -0
  11. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  12. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Full-solution/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  13. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2450/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json +13 -0
  14. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  15. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  16. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  17. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-test-1_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  18. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-2/checkpoint-850/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  19. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-100/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  20. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-500/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  21. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  22. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  23. math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full/checkpoint-2650/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl +0 -0
  24. math_evaluation/sh/eval_1.sh +111 -0
  25. train/ETrainer.py +599 -0
  26. train/test_1.py +662 -0
  27. train/test_on_math.py +71 -0
  28. train/test_ood_python_lora_rope.py +203 -0
  29. train/train_qwen_verl_46k.py +0 -0
  30. train/train_qwen_verl_46k.sh +0 -0
  31. train/wandb/run-20251113_165350-n56lk6p0/logs/debug-internal.log +6 -0
  32. train/wandb/run-20251113_171624-kgxigylp/files/wandb-metadata.json +117 -0
  33. train/wandb/run-20251114_040305-jb702f8e/files/wandb-summary.json +1 -0
  34. train/wandb/run-20251114_083110-mocjk23v/files/wandb-summary.json +1 -0
  35. train/wandb/run-20251114_085634-l1whc2fu/logs/debug-core.log +14 -0
  36. train/wandb/run-20251114_093516-syhj5u87/files/config.yaml +179 -0
  37. train/wandb/run-20251114_093516-syhj5u87/logs/debug-core.log +12 -0
  38. train/wandb/run-20251114_103643-cvm4116u/files/config.yaml +698 -0
  39. train/wandb/run-20251114_103643-cvm4116u/files/output.log +262 -0
  40. train/wandb/run-20251114_103643-cvm4116u/run-cvm4116u.wandb +0 -0
  41. train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-metadata.json +162 -0
  42. train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-summary.json +1 -0
  43. train/wandb/run-20251114_145219-w9xre5r3/files/config.yaml +659 -0
  44. train/wandb/run-20251114_145219-w9xre5r3/files/output.log +336 -0
  45. train/wandb/run-20251114_145219-w9xre5r3/files/requirements.txt +150 -0
  46. train/wandb/run-20251114_145219-w9xre5r3/files/wandb-metadata.json +129 -0
  47. train/wandb/run-20251114_145219-w9xre5r3/files/wandb-summary.json +1 -0
  48. train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log +9 -0
  49. train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log +26 -0
  50. train/wandb/run-20251114_145222-i8zbx8vz/files/config.yaml +142 -0
math_evaluation/__pycache__/evaluate.cpython-310.pyc ADDED
Binary file (4.15 kB). View file
 
math_evaluation/__pycache__/trajectory.cpython-310.pyc ADDED
Binary file (5.37 kB). View file
 
math_evaluation/outputs/Qwen/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 32,
5
+ "empty_samples": 1,
6
+ "acc": 59.6,
7
+ "all_acc": [
8
+ 59.6
9
+ ],
10
+ "mean_acc": 59.6,
11
+ "time_use_in_second": 40.11754846572876,
12
+ "time_use_in_minite": "0:40"
13
+ }
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/Qwen/Qwen2.5-Math-1.5B/math_eval/math_oai/test_qwen25-math-cot_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/LUFFY/data/save_model/Qwen2.5-1.5B/math_eval/math_oai/test_abel_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "all_acc": [
8
+ 6.7
9
+ ],
10
+ "mean_acc": 6.7,
11
+ "time_use_in_second": 8.051186084747314,
12
+ "time_use_in_minite": "0:08"
13
+ }
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution-1/checkpoint-2750/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1_qwen-boxed_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 54,
5
+ "empty_samples": 56,
6
+ "acc": 32.0,
7
+ "all_acc": [
8
+ 32.0
9
+ ],
10
+ "mean_acc": 32.0,
11
+ "time_use_in_second": 62.29504370689392,
12
+ "time_use_in_minite": "1:02"
13
+ }
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Entropy-solution/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-1.5B-Full-solution/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2450/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1_llama-base-boxed_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 72.6,
7
+ "all_acc": [
8
+ 72.6
9
+ ],
10
+ "mean_acc": 72.6,
11
+ "time_use_in_second": 18.515631198883057,
12
+ "time_use_in_minite": "0:18"
13
+ }
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/checkpoint-2800/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-1/math_eval/math_oai/test_qwen-test-1_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-2/checkpoint-850/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-100/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Entropy-solution-rope/checkpoint-500/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full-solution/checkpoint-2800/math_eval/math_oai/test_qwen-test_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/outputs/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Full/checkpoint-2650/math_eval/aime24/test_llama-base-boxed_-1_seed0_t0.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
math_evaluation/sh/eval_1.sh ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -ex
2
+
3
+ PROMPT_TYPE=$1
4
+ MODEL_NAME_OR_PATH=$2
5
+ OUTPUT_DIR=$3
6
+ n_sampling=$4
7
+ temperature=$5
8
+
9
+ SPLIT="test"
10
+ NUM_TEST_SAMPLE=-1
11
+
12
+ # English open datasets
13
+ DATA_NAME="math_oai,minerva_math,olympiadbench"
14
+ # DATA_NAME="gsm8k,math,svamp,asdiv,mawps,carp_en,tabmwp,minerva_math,gaokao2023en,olympiadbench,college_math"
15
+ TOKENIZERS_PARALLELISM=false \
16
+ python3 -u math_eval.py \
17
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
18
+ --data_name ${DATA_NAME} \
19
+ --output_dir ${OUTPUT_DIR} \
20
+ --split ${SPLIT} \
21
+ --prompt_type ${PROMPT_TYPE} \
22
+ --num_test_sample ${NUM_TEST_SAMPLE} \
23
+ --seed 0 \
24
+ --temperature ${temperature} \
25
+ --n_sampling ${n_sampling} \
26
+ --top_p 1 \
27
+ --start 0 \
28
+ --end -1 \
29
+ --use_vllm
30
+
31
+ # English multiple-choice datasets
32
+ # DATA_NAME="aqua,sat_math,mmlu_stem"
33
+ # TOKENIZERS_PARALLELISM=false \
34
+ # python3 -u math_eval.py \
35
+ # --model_name_or_path ${MODEL_NAME_OR_PATH} \
36
+ # --data_name ${DATA_NAME} \
37
+ # --output_dir ${OUTPUT_DIR} \
38
+ # --split ${SPLIT} \
39
+ # --prompt_type ${PROMPT_TYPE} \
40
+ # --num_test_sample ${NUM_TEST_SAMPLE} \
41
+ # --seed 0 \
42
+ # --temperature 0 \
43
+ # --n_sampling 1 \
44
+ # --top_p 1 \
45
+ # --start 0 \
46
+ # --end -1 \
47
+ # --use_vllm \
48
+ # --save_outputs \
49
+ # --overwrite \
50
+ # --num_shots 5
51
+
52
+ # Chinese gaokao collections
53
+ # DATA_NAME="gaokao2024_I,gaokao2024_II,gaokao2024_mix,gaokao_math_cloze,gaokao_math_qa"
54
+ # TOKENIZERS_PARALLELISM=false \
55
+ # python3 -u math_eval.py \
56
+ # --model_name_or_path ${MODEL_NAME_OR_PATH} \
57
+ # --data_name ${DATA_NAME} \
58
+ # --output_dir ${OUTPUT_DIR} \
59
+ # --split ${SPLIT} \
60
+ # --prompt_type ${PROMPT_TYPE} \
61
+ # --num_test_sample ${NUM_TEST_SAMPLE} \
62
+ # --seed 0 \
63
+ # --temperature 0 \
64
+ # --n_sampling 1 \
65
+ # --top_p 1 \
66
+ # --start 0 \
67
+ # --end -1 \
68
+ # --use_vllm \
69
+ # --save_outputs \
70
+ # --overwrite \
71
+ # --adapt_few_shot
72
+
73
+ # Chinese other datasets
74
+ # DATA_NAME="cmath,cn_middle_school"
75
+ # TOKENIZERS_PARALLELISM=false \
76
+ # python3 -u math_eval.py \
77
+ # --model_name_or_path ${MODEL_NAME_OR_PATH} \
78
+ # --data_name ${DATA_NAME} \
79
+ # --output_dir ${OUTPUT_DIR} \
80
+ # --split ${SPLIT} \
81
+ # --prompt_type ${PROMPT_TYPE} \
82
+ # --num_test_sample ${NUM_TEST_SAMPLE} \
83
+ # --seed 0 \
84
+ # --temperature 0 \
85
+ # --n_sampling 1 \
86
+ # --top_p 1 \
87
+ # --start 0 \
88
+ # --end -1 \
89
+ # --use_vllm \
90
+ # --save_outputs \
91
+ # --overwrite \
92
+ # --adapt_few_shot
93
+
94
+
95
+ # English competition datasets
96
+ DATA_NAME="aime24,amc23"
97
+ TOKENIZERS_PARALLELISM=false \
98
+ python3 -u math_eval.py \
99
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
100
+ --data_name ${DATA_NAME} \
101
+ --output_dir ${OUTPUT_DIR} \
102
+ --split ${SPLIT} \
103
+ --prompt_type ${PROMPT_TYPE} \
104
+ --num_test_sample ${NUM_TEST_SAMPLE} \
105
+ --seed 0 \
106
+ --temperature ${temperature} \
107
+ --n_sampling ${n_sampling} \
108
+ --top_p 1 \
109
+ --start 0 \
110
+ --end -1 \
111
+ --use_vllm
train/ETrainer.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020-2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Entropy-weighted Trainer (ETrainer) for knowledge distillation with dynamic loss weighting.
17
+ Based on TRL's SFTTrainer with modifications for teacher-student entropy-based weighting.
18
+ """
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ from typing import Any, Dict
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
+ from trl import SFTTrainer
25
+ from dataclasses import dataclass, field
26
+ from trl import SFTConfig
27
+ import transformers
28
+
29
+
30
+ @dataclass
31
+ class ETrainerConfig(SFTConfig):
32
+ """
33
+ Extended SFTConfig with teacher model parameters for entropy-weighted training.
34
+
35
+ Args:
36
+ teacher_model_path (`str`, *optional*):
37
+ Path to the teacher model (e.g., "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B").
38
+ If None, no entropy weighting is applied.
39
+ entropy_weight_alpha (`float`, defaults to 2.0):
40
+ Alpha parameter controlling the sensitivity of entropy difference weighting.
41
+ Higher values make the weighting more sensitive to entropy differences.
42
+ entropy_weight_beta (`float`, defaults to 0.3):
43
+ Beta parameter controlling the zero-point offset for entropy difference.
44
+ Determines the threshold for significant entropy differences.
45
+ use_entropy_weighting (`bool`, defaults to True):
46
+ Whether to apply entropy-based loss weighting. Set to False to use standard loss.
47
+ teacher_dtype (`str`, defaults to "bfloat16"):
48
+ Data type for teacher model ("float16", "bfloat16", or "float32").
49
+ entropy_top_k (`int`, defaults to 64):
50
+ Number of top tokens to use for entropy approximation. Set to None or vocab_size
51
+ to compute exact entropy. Smaller values save memory significantly.
52
+ """
53
+ teacher_model_path: str | None = field(
54
+ default=None,
55
+ metadata={"help": "Path to teacher model for entropy weighting"}
56
+ )
57
+ entropy_weight_alpha: float = field(
58
+ default=2.0,
59
+ metadata={"help": "Alpha parameter for entropy weighting sensitivity"}
60
+ )
61
+ entropy_weight_beta: float = field(
62
+ default=0.3,
63
+ metadata={"help": "Beta parameter for entropy weighting offset"}
64
+ )
65
+ use_entropy_weighting: bool = field(
66
+ default=True,
67
+ metadata={"help": "Whether to apply entropy-based loss weighting"}
68
+ )
69
+ teacher_dtype: str = field(
70
+ default="bfloat16",
71
+ metadata={"help": "Teacher model dtype (float16/bfloat16/float32)"}
72
+ )
73
+ entropy_top_k: int = field(
74
+ default=64,
75
+ metadata={"help": "Top-K tokens for entropy approximation (reduces memory)"}
76
+ )
77
+ teacher_device_ids: str | None = field(
78
+ default=None,
79
+ metadata={
80
+ "help": "GPU device IDs for teacher model (e.g., '0', '0,1', 'cuda:2'). "
81
+ "If None, uses same device as student. Separate GPUs enable parallel computation."
82
+ }
83
+ )
84
+
85
+
86
+ class ETrainer(SFTTrainer):
87
+ """
88
+ Entropy-weighted Trainer for knowledge distillation.
89
+
90
+ This trainer extends SFTTrainer by adding dynamic per-token loss weighting based on
91
+ teacher-student entropy differences. The weight formula is:
92
+
93
+ w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
94
+
95
+ Where:
96
+ - H_t(j): Teacher's entropy at position j (lower = more confident)
97
+ - H_s(j): Student's entropy at position j
98
+ - α: Sensitivity parameter (default 2.0)
99
+ - β: Offset parameter (default 0.3)
100
+
101
+ Key features:
102
+ - Higher weight when teacher is confident (low H_t)
103
+ - Higher weight when student differs significantly from teacher (large |ΔH|)
104
+ - Smooth weighting via sigmoid to avoid gradient explosion
105
+ """
106
+
107
+ def __init__(self, *args, **kwargs):
108
+ # Extract ETrainer-specific arguments before passing to parent
109
+ args_obj = kwargs.get('args', None)
110
+
111
+ # Initialize parent SFTTrainer
112
+ super().__init__(*args, **kwargs)
113
+
114
+ # Load teacher model if specified
115
+ self.teacher_model = None
116
+ self.use_entropy_weighting = False
117
+
118
+ # Initialize entropy-specific metrics
119
+ # Note: Only initialize metrics we actually populate to avoid ZeroDivisionError
120
+ if not hasattr(self, '_metrics'):
121
+ self._metrics = {'train': {}, 'eval': {}}
122
+
123
+ # Add entropy-specific metric keys
124
+ for mode in ['train', 'eval']:
125
+ if mode not in self._metrics:
126
+ self._metrics[mode] = {}
127
+ # Only initialize avg_weight since we're not logging entropy to save memory
128
+ self._metrics[mode].setdefault('avg_weight', [])
129
+
130
+ if args_obj and hasattr(args_obj, 'teacher_model_path') and args_obj.teacher_model_path:
131
+ self.use_entropy_weighting = args_obj.use_entropy_weighting
132
+ self.entropy_weight_alpha = args_obj.entropy_weight_alpha
133
+ self.entropy_weight_beta = args_obj.entropy_weight_beta
134
+ self.entropy_top_k = getattr(args_obj, 'entropy_top_k', 64)
135
+
136
+ if self.use_entropy_weighting:
137
+ print(f"🎓 Loading teacher model: {args_obj.teacher_model_path}")
138
+ print(f"📊 Entropy weighting params: α={self.entropy_weight_alpha}, β={self.entropy_weight_beta}")
139
+ print(f"💾 Entropy computation: top-k={self.entropy_top_k} (memory-efficient mode)")
140
+
141
+ # Load teacher model to specified device(s)
142
+ self.teacher_device = self._load_teacher_model(args_obj)
143
+
144
+ print("✅ Teacher model loaded and frozen")
145
+
146
+ # Freeze teacher model
147
+ for param in self.teacher_model.parameters():
148
+ param.requires_grad = False
149
+
150
+ print("✅ Teacher model loaded and frozen")
151
+
152
+ # --- Fix teacher tokenizer/model vocab mismatch ---
153
+ print("\n🔍 Checking teacher tokenizer/model alignment...")
154
+ teacher_tokenizer = AutoTokenizer.from_pretrained(
155
+ args_obj.teacher_model_path,
156
+ trust_remote_code=True
157
+ )
158
+
159
+ tokenizer_vocab = len(teacher_tokenizer)
160
+ model_vocab = self.teacher_model.config.vocab_size
161
+
162
+ print(f"📌 Teacher tokenizer vocab: {tokenizer_vocab}")
163
+ print(f"📌 Teacher model vocab: {model_vocab}")
164
+
165
+ if tokenizer_vocab != model_vocab:
166
+ print("⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...")
167
+ self.teacher_model.resize_token_embeddings(tokenizer_vocab)
168
+ self.teacher_model.config.vocab_size = tokenizer_vocab
169
+ print(f"✅ Teacher embeddings resized to {tokenizer_vocab}")
170
+ else:
171
+ print("✅ Teacher tokenizer and model vocab already match")
172
+
173
+ # --- Now align student with teacher ---
174
+ print(f"\n📊 Student model vocab size: {self.model.config.vocab_size}")
175
+ print(f"📊 Teacher model vocab size (after alignment): {self.teacher_model.config.vocab_size}")
176
+
177
+ # Handle vocab size mismatch by resizing student embeddings to match teacher
178
+ if self.teacher_model.config.vocab_size != self.model.config.vocab_size:
179
+ print(f"\n⚠️ Student/Teacher vocab size mismatch detected!")
180
+ print(f" Teacher: {self.teacher_model.config.vocab_size}")
181
+ print(f" Student: {self.model.config.vocab_size}")
182
+ print(f"🔧 Resizing student embeddings to match teacher...")
183
+
184
+ # Resize student model embeddings (new tokens initialized with mean of existing)
185
+ self.model.resize_token_embeddings(self.teacher_model.config.vocab_size)
186
+
187
+ print(f"✅ Student embeddings resized to {self.teacher_model.config.vocab_size}")
188
+ print(f" New student vocab size: {self.model.config.vocab_size}")
189
+ else:
190
+ print(f"✅ Student and teacher vocab sizes already match: {self.model.config.vocab_size}")
191
+
192
+ print("\n" + "="*60)
193
+ print(f"🎯 Final Vocab Alignment Complete")
194
+ print(f"📊 Teacher vocab size: {self.teacher_model.config.vocab_size}")
195
+ print(f"📊 Student vocab size: {self.model.config.vocab_size}")
196
+ print("="*60)
197
+
198
+ def _parse_teacher_devices(self, teacher_device_ids: str | None) -> str | list:
199
+ """
200
+ Parse teacher device IDs string into device specification.
201
+
202
+ Args:
203
+ teacher_device_ids: Device string like "0", "0,1", "cuda:2", or None
204
+
205
+ Returns:
206
+ Single device string (e.g., "cuda:0") or list of devices for multi-GPU
207
+ """
208
+ if teacher_device_ids is None:
209
+ # Default: use cuda:0 if available
210
+ return "cuda:0" if torch.cuda.is_available() else "cpu"
211
+
212
+ # Clean up the input
213
+ teacher_device_ids = teacher_device_ids.strip()
214
+
215
+ # Handle comma-separated multi-GPU case
216
+ if ',' in teacher_device_ids:
217
+ device_list = [f"cuda:{id.strip()}" if not id.strip().startswith('cuda:')
218
+ else id.strip()
219
+ for id in teacher_device_ids.split(',')]
220
+ return device_list
221
+
222
+ # Single device case
223
+ if not teacher_device_ids.startswith('cuda:'):
224
+ return f"cuda:{teacher_device_ids}"
225
+ return teacher_device_ids
226
+
227
+ def _load_teacher_model(self, args_obj):
228
+ """
229
+ Load teacher model to specified device(s).
230
+
231
+ This method handles both single-GPU and multi-GPU teacher configurations.
232
+ """
233
+ # Parse device IDs: "1" / "0,1" / "cuda:3"
234
+ teacher_dev = getattr(args_obj, "teacher_device_ids", None)
235
+ teacher_dev = self._parse_teacher_devices(teacher_dev)
236
+
237
+ # Determine teacher dtype
238
+ dtype_map = {
239
+ "float16": torch.float16,
240
+ "bfloat16": torch.bfloat16,
241
+ "float32": torch.float32,
242
+ }
243
+ teacher_dtype = dtype_map.get(args_obj.teacher_dtype, torch.bfloat16)
244
+
245
+ print(f"🖥️ Teacher target device(s): {teacher_dev}")
246
+
247
+ # Case A: multi-GPU teacher
248
+ if isinstance(teacher_dev, list):
249
+ print(f"📡 Using multi-GPU teacher (HF auto parallel): {teacher_dev}")
250
+
251
+ # Let HuggingFace accelerate handle device distribution automatically
252
+ self.teacher_model = AutoModelForCausalLM.from_pretrained(
253
+ args_obj.teacher_model_path,
254
+ torch_dtype=teacher_dtype,
255
+ trust_remote_code=True,
256
+ device_map="auto", # Let accelerate decide optimal sharding
257
+ )
258
+
259
+ # Case B: single GPU teacher
260
+ else:
261
+ device_str = teacher_dev
262
+ print(f"📡 Loading teacher on single GPU: {device_str}")
263
+
264
+ # Load to CPU first, then move to target GPU
265
+ self.teacher_model = AutoModelForCausalLM.from_pretrained(
266
+ args_obj.teacher_model_path,
267
+ torch_dtype=teacher_dtype,
268
+ trust_remote_code=True,
269
+ )
270
+
271
+ # Move whole model to target GPU
272
+ self.teacher_model.to(device_str)
273
+
274
+ # Freeze teacher model
275
+ self.teacher_model.eval()
276
+ for param in self.teacher_model.parameters():
277
+ param.requires_grad = False
278
+
279
+ return teacher_dev
280
+
281
+ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
282
+ mode = "train" if self.model.training else "eval"
283
+
284
+ # Only rank 0 logs
285
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
286
+ if torch.distributed.get_rank() != 0:
287
+ return
288
+
289
+ # Add avg_weight if exists
290
+ if self.use_entropy_weighting:
291
+ if 'avg_weight' in self._metrics[mode] and len(self._metrics[mode]['avg_weight']) > 0:
292
+ logs["avg_weight"] = sum(self._metrics[mode]['avg_weight']) / len(self._metrics[mode]['avg_weight'])
293
+ self._metrics[mode]['avg_weight'] = []
294
+
295
+ # VERY IMPORTANT:
296
+ # Call HF Trainer.log instead of TRL SFTTrainer.log
297
+ transformers.Trainer.log(self, logs)
298
+
299
+ def compute_entropy(self, logits: torch.Tensor, top_k: int = None) -> torch.Tensor:
300
+ """
301
+ Compute per-token entropy from logits (supports top-k approximation for memory efficiency).
302
+
303
+ Args:
304
+ logits: Model logits of shape [batch_size, seq_len, vocab_size]
305
+ top_k: If not None and < vocab_size, compute entropy using only top-k tokens.
306
+ This significantly reduces memory usage with minimal impact on weight quality.
307
+ Default uses self.entropy_top_k (set to 64 by default).
308
+
309
+ Returns:
310
+ entropy: Per-token entropy of shape [batch_size, seq_len]
311
+ """
312
+ # Use instance's top_k setting if not explicitly provided
313
+ if top_k is None:
314
+ top_k = getattr(self, 'entropy_top_k', 64)
315
+
316
+ with torch.no_grad():
317
+ # Use float32 for numerical stability
318
+ logits = logits.float()
319
+
320
+ vocab_size = logits.size(-1)
321
+
322
+ if top_k is not None and top_k < vocab_size:
323
+ # Top-K approximation: only compute entropy on top-k tokens
324
+ # This reduces memory from O(vocab_size) to O(k)
325
+ # Memory: ~151665 -> ~64, ~2400x reduction!
326
+ top_values, _ = torch.topk(logits, k=top_k, dim=-1) # [..., top_k]
327
+
328
+ log_probs = torch.log_softmax(top_values, dim=-1) # [..., top_k]
329
+ probs = torch.softmax(top_values, dim=-1) # [..., top_k]
330
+
331
+ entropy = -(probs * log_probs).sum(dim=-1) # [batch_size, seq_len]
332
+ else:
333
+ # Exact entropy computation (uses full vocab)
334
+ log_probs = torch.log_softmax(logits, dim=-1)
335
+ probs = torch.softmax(logits, dim=-1)
336
+ entropy = -(probs * log_probs).sum(dim=-1)
337
+
338
+ return entropy
339
+
340
+ def compute_entropy_topk(self, logits_topk: torch.Tensor) -> torch.Tensor:
341
+ """
342
+ Compute entropy from pre-extracted top-k logits (ultra memory-efficient).
343
+
344
+ Args:
345
+ logits_topk: Top-k logits of shape [batch_size, seq_len, k]
346
+
347
+ Returns:
348
+ entropy: Per-token entropy of shape [batch_size, seq_len]
349
+ """
350
+ with torch.no_grad():
351
+ # logits_topk is already top-k, just compute softmax on it
352
+ logits_topk = logits_topk.float()
353
+
354
+ log_probs = torch.log_softmax(logits_topk, dim=-1)
355
+ probs = torch.softmax(logits_topk, dim=-1)
356
+
357
+ entropy = -(probs * log_probs).sum(dim=-1)
358
+
359
+ return entropy
360
+
361
+ def compute_entropy_weights_topk(
362
+ self,
363
+ student_logits_topk: torch.Tensor,
364
+ teacher_logits_topk: torch.Tensor,
365
+ labels: torch.Tensor,
366
+ ) -> torch.Tensor:
367
+ """
368
+ Compute per-token weights using pre-extracted top-k logits (optimized version).
369
+
370
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
371
+
372
+ Args:
373
+ student_logits_topk: Student top-k logits [batch_size, seq_len, k]
374
+ teacher_logits_topk: Teacher top-k logits [batch_size, seq_len, k]
375
+ labels: Ground truth labels [batch_size, seq_len]
376
+
377
+ Returns:
378
+ weights: Per-token weights [batch_size, seq_len]
379
+ """
380
+ # Compute entropies from top-k logits (no additional memory allocation)
381
+ H_s = self.compute_entropy_topk(student_logits_topk) # [batch_size, seq_len]
382
+ H_t = self.compute_entropy_topk(teacher_logits_topk) # [batch_size, seq_len]
383
+
384
+ # Compute entropy difference
385
+ delta_H = torch.abs(H_s - H_t) # [batch_size, seq_len]
386
+
387
+ # Component 1: exp(-H_t) - emphasizes positions where teacher is confident
388
+ teacher_confidence = torch.exp(-H_t)
389
+
390
+ # Component 2: sigmoid(α * (|ΔH| - β)) - smooth weighting based on disagreement
391
+ disagreement_weight = torch.nn.functional.softplus(
392
+ self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
393
+ )
394
+
395
+ # Combined weight
396
+ weights = teacher_confidence * disagreement_weight
397
+
398
+ # Mask out positions where labels are -100 (padding or prompt tokens)
399
+ mask = (labels != -100).float()
400
+ weights = weights * mask
401
+
402
+ return weights
403
+
404
+ def compute_entropy_weights(
405
+ self,
406
+ student_logits: torch.Tensor,
407
+ teacher_logits: torch.Tensor,
408
+ labels: torch.Tensor,
409
+ ) -> torch.Tensor:
410
+ """
411
+ Compute per-token weights based on teacher-student entropy differences.
412
+
413
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
414
+
415
+ Args:
416
+ student_logits: Student model logits [batch_size, seq_len, vocab_size]
417
+ teacher_logits: Teacher model logits [batch_size, seq_len, vocab_size]
418
+ labels: Ground truth labels [batch_size, seq_len]
419
+
420
+ Returns:
421
+ weights: Per-token weights [batch_size, seq_len]
422
+ """
423
+ # Compute entropies
424
+ H_s = self.compute_entropy(student_logits) # [batch_size, seq_len]
425
+ H_t = self.compute_entropy(teacher_logits) # [batch_size, seq_len]
426
+
427
+ # Compute entropy difference
428
+ delta_H = torch.abs(H_s - H_t) # [batch_size, seq_len]
429
+
430
+ # Component 1: exp(-H_t) - emphasizes positions where teacher is confident
431
+ teacher_confidence = torch.exp(-H_t)
432
+
433
+ # Component 2: sigmoid(α * (|ΔH| - β)) - smooth weighting based on disagreement
434
+ disagreement_weight = torch.sigmoid(
435
+ self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
436
+ )
437
+
438
+ # Combined weight
439
+ weights = teacher_confidence * disagreement_weight
440
+
441
+ # Mask out positions where labels are -100 (padding or prompt tokens)
442
+ mask = (labels != -100).float()
443
+ weights = weights * mask
444
+
445
+ return weights
446
+
447
+ def compute_loss(
448
+ self,
449
+ model: nn.Module,
450
+ inputs: Dict[str, torch.Tensor | Any],
451
+ return_outputs: bool = False,
452
+ num_items_in_batch: torch.Tensor | None = None,
453
+ ):
454
+ """
455
+ Compute weighted loss with teacher-student entropy weighting.
456
+
457
+ This method:
458
+ 1. Computes student model forward pass
459
+ 2. If teacher available, computes teacher forward pass (no grad)
460
+ 3. Calculates per-token weights based on entropy differences
461
+ 4. Applies weights to the loss
462
+ """
463
+ # Standard loss computation from parent class (no entropy weighting)
464
+ if not self.use_entropy_weighting or self.teacher_model is None:
465
+ return super().compute_loss(model, inputs, return_outputs, num_items_in_batch)
466
+
467
+ # Get labels - SFTTrainer should have already set this up
468
+ if "labels" not in inputs:
469
+ raise ValueError(
470
+ "Expected 'labels' in inputs but not found. This usually means your data collator "
471
+ "is not properly configured. Please ensure you're using the correct collator."
472
+ )
473
+ labels = inputs["labels"]
474
+
475
+ # ===== Entropy-weighted loss computation =====
476
+
477
+ # 1. Student forward pass (with gradient)
478
+ inputs["use_cache"] = False
479
+ outputs = model(**inputs)
480
+ student_logits = outputs.logits # [batch_size, seq_len, vocab_size]
481
+
482
+ # 2. Teacher forward pass (no gradient)
483
+ with torch.no_grad():
484
+ # Move inputs to teacher device for parallel computation
485
+ teacher_input_ids = inputs["input_ids"]
486
+ teacher_attention_mask = inputs.get("attention_mask", None)
487
+
488
+ # Get teacher's primary device
489
+ if isinstance(self.teacher_device, list):
490
+ primary_teacher_device = self.teacher_device[0]
491
+ else:
492
+ primary_teacher_device = self.teacher_device
493
+
494
+ # Move to teacher device if different from student
495
+ if str(primary_teacher_device) != str(teacher_input_ids.device):
496
+ teacher_input_ids = teacher_input_ids.to(primary_teacher_device)
497
+ if teacher_attention_mask is not None:
498
+ teacher_attention_mask = teacher_attention_mask.to(primary_teacher_device)
499
+
500
+ # Prepare teacher inputs - explicitly pass only what's needed
501
+ teacher_outputs = self.teacher_model(
502
+ input_ids=teacher_input_ids,
503
+ attention_mask=teacher_attention_mask,
504
+ use_cache=False, # Explicitly disable cache for training
505
+ )
506
+ teacher_logits_full = teacher_outputs.logits # [batch_size, seq_len, vocab_size] on teacher device
507
+
508
+ # CRITICAL OPTIMIZATION: Shift FIRST, then top-k
509
+ # This saves 1/seq_len of memory compared to top-k then shift
510
+ shift_teacher_logits_full = teacher_logits_full[..., :-1, :].contiguous()
511
+ del teacher_logits_full # Free full logits immediately
512
+
513
+ # Now extract top-k from shifted teacher logits
514
+ # Reduces from [B, T-1, 151k] to [B, T-1, 64] - 2400x reduction!
515
+ top_k = getattr(self, 'entropy_top_k', 64)
516
+ shift_teacher_logits_topk, _ = torch.topk(
517
+ shift_teacher_logits_full, k=top_k, dim=-1
518
+ ) # [batch_size, seq_len-1, top_k] on teacher device
519
+
520
+ del shift_teacher_logits_full # Free shifted full logits
521
+
522
+ # Move teacher logits back to student device for weight computation
523
+ if str(primary_teacher_device) != str(student_logits.device):
524
+ shift_teacher_logits_topk = shift_teacher_logits_topk.to(student_logits.device)
525
+
526
+ # 3. Compute entropy-based weights
527
+ # Shift student logits and labels
528
+ shift_student_logits = student_logits[..., :-1, :].contiguous() # Full vocab for CE loss
529
+ shift_labels = labels[..., 1:].contiguous()
530
+
531
+ # OPTIMIZATION 2: Extract top-k from student logits for entropy computation only
532
+ # Student still needs full logits for CE loss, but we can use top-k for entropy
533
+ with torch.no_grad():
534
+ top_k = getattr(self, 'entropy_top_k', 64)
535
+ student_logits_topk, student_indices_topk = torch.topk(
536
+ shift_student_logits, k=top_k, dim=-1
537
+ ) # [batch_size, seq_len-1, top_k]
538
+
539
+ # Calculate weights using top-k versions (massive memory savings)
540
+ weights = self.compute_entropy_weights_topk(
541
+ student_logits_topk, # [B, T-1, k]
542
+ shift_teacher_logits_topk, # [B, T-1, k]
543
+ shift_labels,
544
+ ) # [batch_size, seq_len-1]
545
+
546
+ # 4. Compute weighted cross-entropy loss
547
+ # Flatten for loss computation
548
+ shift_student_logits_flat = shift_student_logits.view(-1, shift_student_logits.size(-1))
549
+ shift_labels_flat = shift_labels.view(-1)
550
+ weights_flat = weights.view(-1)
551
+
552
+ # Compute per-token cross-entropy (no reduction)
553
+ loss_fct = nn.CrossEntropyLoss(reduction='none')
554
+ per_token_loss = loss_fct(shift_student_logits_flat, shift_labels_flat)
555
+
556
+ # Apply weights
557
+ weighted_loss = per_token_loss * weights_flat
558
+
559
+ # Create proper mask considering both labels and attention_mask
560
+ if "attention_mask" in inputs:
561
+ # Shift attention mask to align with shifted labels
562
+ shift_attention_mask = inputs["attention_mask"][..., 1:].contiguous()
563
+ valid_mask = (shift_labels != -100) & (shift_attention_mask == 1)
564
+ else:
565
+ # Fallback if no attention mask (e.g., padding-free training)
566
+ valid_mask = (shift_labels != -100)
567
+
568
+ valid_mask_flat = valid_mask.view(-1).float()
569
+
570
+ # Compute final loss: normalize by sum of weights (not by token count)
571
+ # CRITICAL: Normalizing by weight_sum instead of token count ensures that
572
+ # the relative importance defined by weights is actually respected.
573
+ # If we normalized by token count, small-weight tokens would still contribute
574
+ # equally in the denominator, defeating the purpose of weighting.
575
+ weighted_loss_masked = weighted_loss * valid_mask_flat
576
+ weight_sum = (weights_flat * valid_mask_flat).sum()
577
+
578
+ loss = weighted_loss_masked.sum() / weight_sum.clamp(min=1e-8)
579
+
580
+ # 5. Handle auxiliary loss if present (e.g., MoE)
581
+ if self.aux_loss_enabled and hasattr(outputs, 'aux_loss'):
582
+ loss = loss + outputs.aux_loss
583
+
584
+ # Log statistics (for monitoring) - simplified to avoid redundant entropy computation
585
+ if self.model.training:
586
+ with torch.no_grad():
587
+ # Only log weight statistics to avoid redundant entropy computation
588
+ # (entropy was already computed in compute_entropy_weights above)
589
+ weight_mean = weights[shift_labels != -100].mean().item()
590
+ self._metrics['train']['avg_weight'].append(weight_mean)
591
+
592
+ # Optional: Can enable these if you need entropy logging
593
+ # but note this will recompute entropy and use extra memory
594
+ # H_s_mean = self.compute_entropy(shift_student_logits).mean().item()
595
+ # H_t_mean = self.compute_entropy(shift_teacher_logits).mean().item()
596
+ # self._metrics['train']['student_entropy'].append(H_s_mean)
597
+ # self._metrics['train']['teacher_entropy'].append(H_t_mean)
598
+
599
+ return (loss, outputs) if return_outputs else loss
train/test_1.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020-2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Entropy-weighted Trainer (ETrainer) for knowledge distillation with dynamic loss weighting.
17
+ Based on TRL's SFTTrainer with modifications for teacher-student entropy-based weighting.
18
+ """
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ from typing import Any, Dict
23
+ from transformers import AutoModelForCausalLM, AutoTokenizer
24
+ from trl import SFTTrainer
25
+ from dataclasses import dataclass, field
26
+ from trl import SFTConfig
27
+ import transformers
28
+
29
+
30
+ @dataclass
31
+ class ETrainerConfig(SFTConfig):
32
+ """
33
+ Extended SFTConfig with teacher model parameters for entropy-weighted training.
34
+
35
+ Args:
36
+ teacher_model_path (`str`, *optional*):
37
+ Path to the teacher model (e.g., "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B").
38
+ If None, no entropy weighting is applied.
39
+ entropy_weight_alpha (`float`, defaults to 2.0):
40
+ Alpha parameter controlling the sensitivity of entropy difference weighting.
41
+ Higher values make the weighting more sensitive to entropy differences.
42
+ entropy_weight_beta (`float`, defaults to 0.3):
43
+ Beta parameter controlling the zero-point offset for entropy difference.
44
+ Determines the threshold for significant entropy differences.
45
+ use_entropy_weighting (`bool`, defaults to True):
46
+ Whether to apply entropy-based loss weighting. Set to False to use standard loss.
47
+ teacher_dtype (`str`, defaults to "bfloat16"):
48
+ Data type for teacher model ("float16", "bfloat16", or "float32").
49
+ entropy_top_k (`int`, defaults to 64):
50
+ Number of top tokens to use for entropy approximation. Set to None or vocab_size
51
+ to compute exact entropy. Smaller values save memory significantly.
52
+ """
53
+ teacher_model_path: str | None = field(
54
+ default=None,
55
+ metadata={"help": "Path to teacher model for entropy weighting"}
56
+ )
57
+ entropy_weight_alpha: float = field(
58
+ default=2.0,
59
+ metadata={"help": "Alpha parameter for entropy weighting sensitivity"}
60
+ )
61
+ entropy_weight_beta: float = field(
62
+ default=0.3,
63
+ metadata={"help": "Beta parameter for entropy weighting offset"}
64
+ )
65
+ use_entropy_weighting: bool = field(
66
+ default=True,
67
+ metadata={"help": "Whether to apply entropy-based loss weighting"}
68
+ )
69
+ teacher_dtype: str = field(
70
+ default="bfloat16",
71
+ metadata={"help": "Teacher model dtype (float16/bfloat16/float32)"}
72
+ )
73
+ entropy_top_k: int = field(
74
+ default=64,
75
+ metadata={"help": "Top-K tokens for entropy approximation (reduces memory)"}
76
+ )
77
+ teacher_device_ids: str | None = field(
78
+ default=None,
79
+ metadata={
80
+ "help": "GPU device IDs for teacher model (e.g., '0', '0,1', 'cuda:2'). "
81
+ "If None, uses same device as student. Separate GPUs enable parallel computation."
82
+ }
83
+ )
84
+
85
+
86
+ class ETrainer(SFTTrainer):
87
+ """
88
+ Entropy-weighted Trainer for knowledge distillation.
89
+
90
+ This trainer extends SFTTrainer by adding dynamic per-token loss weighting based on
91
+ teacher-student entropy differences. The weight formula is:
92
+
93
+ w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
94
+
95
+ Where:
96
+ - H_t(j): Teacher's entropy at position j (lower = more confident)
97
+ - H_s(j): Student's entropy at position j
98
+ - α: Sensitivity parameter (default 2.0)
99
+ - β: Offset parameter (default 0.3)
100
+
101
+ Key features:
102
+ - Higher weight when teacher is confident (low H_t)
103
+ - Higher weight when student differs significantly from teacher (large |ΔH|)
104
+ - Smooth weighting via sigmoid to avoid gradient explosion
105
+ """
106
+
107
+ def __init__(self, *args, **kwargs):
108
+ # Extract ETrainer-specific arguments before passing to parent
109
+ args_obj = kwargs.get('args', None)
110
+
111
+ # Initialize parent SFTTrainer
112
+ super().__init__(*args, **kwargs)
113
+
114
+ # Load teacher model if specified
115
+ self.teacher_model = None
116
+ self.use_entropy_weighting = False
117
+
118
+ # Initialize entropy-specific metrics
119
+ # Note: Only initialize metrics we actually populate to avoid ZeroDivisionError
120
+ if not hasattr(self, '_metrics'):
121
+ self._metrics = {'train': {}, 'eval': {}}
122
+
123
+ # Add entropy-specific metric keys
124
+ for mode in ['train', 'eval']:
125
+ if mode not in self._metrics:
126
+ self._metrics[mode] = {}
127
+ # Only initialize avg_weight since we're not logging entropy to save memory
128
+ self._metrics[mode].setdefault('avg_weight', [])
129
+
130
+ if args_obj and hasattr(args_obj, 'teacher_model_path') and args_obj.teacher_model_path:
131
+ self.use_entropy_weighting = args_obj.use_entropy_weighting
132
+ self.entropy_weight_alpha = args_obj.entropy_weight_alpha
133
+ self.entropy_weight_beta = args_obj.entropy_weight_beta
134
+ self.entropy_top_k = getattr(args_obj, 'entropy_top_k', 64)
135
+
136
+ if self.use_entropy_weighting:
137
+ # Check if we're in distributed training
138
+ is_main_process = True
139
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
140
+ is_main_process = torch.distributed.get_rank() == 0
141
+
142
+ if is_main_process:
143
+ print(f"🎓 Loading teacher model: {args_obj.teacher_model_path}")
144
+ print(f"📊 Entropy weighting params: α={self.entropy_weight_alpha}, β={self.entropy_weight_beta}")
145
+ print(f"💾 Entropy computation: top-k={self.entropy_top_k} (memory-efficient mode)")
146
+
147
+ # Load teacher model to specified device(s)
148
+ self.teacher_device = self._load_teacher_model(args_obj)
149
+
150
+ if is_main_process:
151
+ print("✅ Teacher model loaded and frozen")
152
+
153
+ # Freeze teacher model
154
+ for param in self.teacher_model.parameters():
155
+ param.requires_grad = False
156
+
157
+ if is_main_process:
158
+ print("✅ Teacher model loaded and frozen")
159
+
160
+ # --- Fix teacher tokenizer/model vocab mismatch ---
161
+ print("\n🔍 Checking teacher tokenizer/model alignment...")
162
+
163
+ teacher_tokenizer = AutoTokenizer.from_pretrained(
164
+ args_obj.teacher_model_path,
165
+ trust_remote_code=True
166
+ )
167
+
168
+ tokenizer_vocab = len(teacher_tokenizer)
169
+ model_vocab = self.teacher_model.config.vocab_size
170
+
171
+ if is_main_process:
172
+ print(f"📌 Teacher tokenizer vocab: {tokenizer_vocab}")
173
+ print(f"📌 Teacher model vocab: {model_vocab}")
174
+
175
+ if tokenizer_vocab != model_vocab:
176
+ if is_main_process:
177
+ print("⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...")
178
+ self.teacher_model.resize_token_embeddings(tokenizer_vocab)
179
+ self.teacher_model.config.vocab_size = tokenizer_vocab
180
+ if is_main_process:
181
+ print(f"✅ Teacher embeddings resized to {tokenizer_vocab}")
182
+ else:
183
+ if is_main_process:
184
+ print("✅ Teacher tokenizer and model vocab already match")
185
+
186
+ # --- Now align student with teacher ---
187
+ if is_main_process:
188
+ print(f"\n📊 Student model vocab size: {self.model.config.vocab_size}")
189
+ print(f"📊 Teacher model vocab size (after alignment): {self.teacher_model.config.vocab_size}")
190
+
191
+ # Handle vocab size mismatch by resizing student embeddings to match teacher
192
+ if self.teacher_model.config.vocab_size != self.model.config.vocab_size:
193
+ if is_main_process:
194
+ print(f"\n⚠️ Student/Teacher vocab size mismatch detected!")
195
+ print(f" Teacher: {self.teacher_model.config.vocab_size}")
196
+ print(f" Student: {self.model.config.vocab_size}")
197
+ print(f"🔧 Resizing student embeddings to match teacher...")
198
+
199
+ # Resize student model embeddings (new tokens initialized with mean of existing)
200
+ self.model.resize_token_embeddings(self.teacher_model.config.vocab_size)
201
+
202
+ if is_main_process:
203
+ print(f"✅ Student embeddings resized to {self.teacher_model.config.vocab_size}")
204
+ print(f" New student vocab size: {self.model.config.vocab_size}")
205
+ else:
206
+ if is_main_process:
207
+ print(f"✅ Student and teacher vocab sizes already match: {self.model.config.vocab_size}")
208
+
209
+ if is_main_process:
210
+ print("\n" + "="*60)
211
+ print(f"🎯 Final Vocab Alignment Complete")
212
+ print(f"📊 Teacher vocab size: {self.teacher_model.config.vocab_size}")
213
+ print(f"📊 Student vocab size: {self.model.config.vocab_size}")
214
+ print("="*60)
215
+
216
+ def _parse_teacher_devices(self, teacher_device_ids: str | None) -> str | list:
217
+ """
218
+ Parse teacher device IDs string into device specification.
219
+
220
+ Args:
221
+ teacher_device_ids: Device string like "0", "0,1", "cuda:2", or None
222
+
223
+ Returns:
224
+ Single device string (e.g., "cuda:0") or list of devices for multi-GPU
225
+ """
226
+ if teacher_device_ids is None:
227
+ # Default: use cuda:0 if available
228
+ return "cuda:0" if torch.cuda.is_available() else "cpu"
229
+
230
+ # Clean up the input
231
+ teacher_device_ids = teacher_device_ids.strip()
232
+
233
+ # Handle comma-separated multi-GPU case
234
+ if ',' in teacher_device_ids:
235
+ device_list = [f"cuda:{id.strip()}" if not id.strip().startswith('cuda:')
236
+ else id.strip()
237
+ for id in teacher_device_ids.split(',')]
238
+ return device_list
239
+
240
+ # Single device case
241
+ if not teacher_device_ids.startswith('cuda:'):
242
+ return f"cuda:{teacher_device_ids}"
243
+ return teacher_device_ids
244
+
245
+ def _load_teacher_model(self, args_obj):
246
+ """
247
+ Load teacher model to specified device(s).
248
+
249
+ This method handles both single-GPU and multi-GPU teacher configurations.
250
+ """
251
+ # Check if we're in distributed training
252
+ is_main_process = True
253
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
254
+ is_main_process = torch.distributed.get_rank() == 0
255
+
256
+ # Parse device IDs: "1" / "0,1" / "cuda:3"
257
+ teacher_dev = getattr(args_obj, "teacher_device_ids", None)
258
+ teacher_dev = self._parse_teacher_devices(teacher_dev)
259
+
260
+ # Determine teacher dtype
261
+ dtype_map = {
262
+ "float16": torch.float16,
263
+ "bfloat16": torch.bfloat16,
264
+ "float32": torch.float32,
265
+ }
266
+ teacher_dtype = dtype_map.get(args_obj.teacher_dtype, torch.bfloat16)
267
+
268
+ if is_main_process:
269
+ print(f"🖥️ Teacher target device(s): {teacher_dev}")
270
+
271
+ # Case A: multi-GPU teacher
272
+ if isinstance(teacher_dev, list):
273
+ if is_main_process:
274
+ print(f"📡 Using multi-GPU teacher (HF auto parallel): {teacher_dev}")
275
+
276
+ # Let HuggingFace accelerate handle device distribution automatically
277
+ self.teacher_model = AutoModelForCausalLM.from_pretrained(
278
+ args_obj.teacher_model_path,
279
+ torch_dtype=teacher_dtype,
280
+ trust_remote_code=True,
281
+ device_map="auto", # Let accelerate decide optimal sharding
282
+ )
283
+
284
+ # Case B: single GPU teacher
285
+ else:
286
+ device_str = teacher_dev
287
+ if is_main_process:
288
+ print(f"📡 Loading teacher on single GPU: {device_str}")
289
+
290
+ # Load to CPU first, then move to target GPU
291
+ self.teacher_model = AutoModelForCausalLM.from_pretrained(
292
+ args_obj.teacher_model_path,
293
+ torch_dtype=teacher_dtype,
294
+ trust_remote_code=True,
295
+ )
296
+
297
+ # Move whole model to target GPU
298
+ self.teacher_model.to(device_str)
299
+
300
+ # Freeze teacher model
301
+ self.teacher_model.eval()
302
+ for param in self.teacher_model.parameters():
303
+ param.requires_grad = False
304
+
305
+ return teacher_dev
306
+
307
+ def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
308
+ mode = "train" if self.model.training else "eval"
309
+
310
+ # Only rank 0 logs
311
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
312
+ if torch.distributed.get_rank() != 0:
313
+ return
314
+
315
+ # Add avg_weight if exists
316
+ if self.use_entropy_weighting:
317
+ if 'avg_weight' in self._metrics[mode] and len(self._metrics[mode]['avg_weight']) > 0:
318
+ logs["avg_weight"] = sum(self._metrics[mode]['avg_weight']) / len(self._metrics[mode]['avg_weight'])
319
+ self._metrics[mode]['avg_weight'] = []
320
+
321
+ # VERY IMPORTANT:
322
+ # Call HF Trainer.log instead of TRL SFTTrainer.log
323
+ transformers.Trainer.log(self, logs)
324
+
325
+ def compute_entropy(self, logits: torch.Tensor, top_k: int = None) -> torch.Tensor:
326
+ """
327
+ Compute per-token entropy from logits (supports top-k approximation for memory efficiency).
328
+
329
+ Args:
330
+ logits: Model logits of shape [batch_size, seq_len, vocab_size]
331
+ top_k: If not None and < vocab_size, compute entropy using only top-k tokens.
332
+ This significantly reduces memory usage with minimal impact on weight quality.
333
+ Default uses self.entropy_top_k (set to 64 by default).
334
+
335
+ Returns:
336
+ entropy: Per-token entropy of shape [batch_size, seq_len]
337
+ """
338
+ # Use instance's top_k setting if not explicitly provided
339
+ if top_k is None:
340
+ top_k = getattr(self, 'entropy_top_k', 64)
341
+
342
+ with torch.no_grad():
343
+ # Use float32 for numerical stability
344
+ logits = logits.float()
345
+
346
+ vocab_size = logits.size(-1)
347
+
348
+ if top_k is not None and top_k < vocab_size:
349
+ # Top-K approximation: only compute entropy on top-k tokens
350
+ # This reduces memory from O(vocab_size) to O(k)
351
+ # Memory: ~151665 -> ~64, ~2400x reduction!
352
+ top_values, _ = torch.topk(logits, k=top_k, dim=-1) # [..., top_k]
353
+
354
+ log_probs = torch.log_softmax(top_values, dim=-1) # [..., top_k]
355
+ probs = torch.softmax(top_values, dim=-1) # [..., top_k]
356
+
357
+ entropy = -(probs * log_probs).sum(dim=-1) # [batch_size, seq_len]
358
+ else:
359
+ # Exact entropy computation (uses full vocab)
360
+ log_probs = torch.log_softmax(logits, dim=-1)
361
+ probs = torch.softmax(logits, dim=-1)
362
+ entropy = -(probs * log_probs).sum(dim=-1)
363
+
364
+ return entropy
365
+
366
+ def compute_entropy_topk(self, logits_topk: torch.Tensor) -> torch.Tensor:
367
+ """
368
+ Compute entropy from pre-extracted top-k logits (ultra memory-efficient).
369
+
370
+ Args:
371
+ logits_topk: Top-k logits of shape [batch_size, seq_len, k]
372
+
373
+ Returns:
374
+ entropy: Per-token entropy of shape [batch_size, seq_len]
375
+ """
376
+ with torch.no_grad():
377
+ # logits_topk is already top-k, just compute softmax on it
378
+ logits_topk = logits_topk.float()
379
+
380
+ log_probs = torch.log_softmax(logits_topk, dim=-1)
381
+ probs = torch.softmax(logits_topk, dim=-1)
382
+
383
+ entropy = -(probs * log_probs).sum(dim=-1)
384
+
385
+ return entropy
386
+
387
+ def compute_entropy_weights_topk(
388
+ self,
389
+ student_logits_topk: torch.Tensor,
390
+ teacher_logits_topk: torch.Tensor,
391
+ labels: torch.Tensor,
392
+ ) -> torch.Tensor:
393
+ """
394
+ Compute per-token weights using pre-extracted top-k logits (optimized version).
395
+
396
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
397
+
398
+ Args:
399
+ student_logits_topk: Student top-k logits [batch_size, seq_len, k]
400
+ teacher_logits_topk: Teacher top-k logits [batch_size, seq_len, k]
401
+ labels: Ground truth labels [batch_size, seq_len]
402
+
403
+ Returns:
404
+ weights: Per-token weights [batch_size, seq_len]
405
+ """
406
+ # Compute entropies from top-k logits (no additional memory allocation)
407
+ H_s = self.compute_entropy_topk(student_logits_topk) # [batch_size, seq_len]
408
+ H_t = self.compute_entropy_topk(teacher_logits_topk) # [batch_size, seq_len]
409
+
410
+ # Compute entropy difference
411
+ delta_H = torch.abs(H_s - H_t) # [batch_size, seq_len]
412
+
413
+ # Component 1: exp(-H_t) - emphasizes positions where teacher is confident
414
+ teacher_confidence = torch.exp(-H_t)
415
+
416
+ # Component 2: sigmoid(α * (|ΔH| - β)) - smooth weighting based on disagreement
417
+ disagreement_weight = torch.nn.functional.softplus(
418
+ self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
419
+ )
420
+
421
+ # Combined weight
422
+ weights = teacher_confidence * disagreement_weight
423
+
424
+ # Mask out positions where labels are -100 (padding or prompt tokens)
425
+ mask = (labels != -100).float()
426
+ weights = weights * mask
427
+
428
+ return weights
429
+
430
+ def compute_entropy_weights(
431
+ self,
432
+ student_logits: torch.Tensor,
433
+ teacher_logits: torch.Tensor,
434
+ labels: torch.Tensor,
435
+ ) -> torch.Tensor:
436
+ """
437
+ Compute per-token weights based on teacher-student entropy differences.
438
+
439
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
440
+
441
+ Args:
442
+ student_logits: Student model logits [batch_size, seq_len, vocab_size]
443
+ teacher_logits: Teacher model logits [batch_size, seq_len, vocab_size]
444
+ labels: Ground truth labels [batch_size, seq_len]
445
+
446
+ Returns:
447
+ weights: Per-token weights [batch_size, seq_len]
448
+ """
449
+ # Compute entropies
450
+ H_s = self.compute_entropy(student_logits) # [batch_size, seq_len]
451
+ H_t = self.compute_entropy(teacher_logits) # [batch_size, seq_len]
452
+
453
+ # Compute entropy difference
454
+ delta_H = torch.abs(H_s - H_t) # [batch_size, seq_len]
455
+
456
+ # Component 1: exp(-H_t) - emphasizes positions where teacher is confident
457
+ teacher_confidence = torch.exp(-H_t)
458
+
459
+ # Component 2: sigmoid(α * (|ΔH| - β)) - smooth weighting based on disagreement
460
+ disagreement_weight = torch.sigmoid(
461
+ self.entropy_weight_alpha * (delta_H - self.entropy_weight_beta)
462
+ )
463
+
464
+ # Combined weight
465
+ weights = teacher_confidence * disagreement_weight
466
+
467
+ # Mask out positions where labels are -100 (padding or prompt tokens)
468
+ mask = (labels != -100).float()
469
+ weights = weights * mask
470
+
471
+ return weights
472
+
473
+ def compute_loss(
474
+ self,
475
+ model: nn.Module,
476
+ inputs: Dict[str, torch.Tensor | Any],
477
+ return_outputs: bool = False,
478
+ num_items_in_batch: torch.Tensor | None = None,
479
+ ):
480
+ """
481
+ Compute weighted loss with teacher-student entropy weighting.
482
+
483
+ This method:
484
+ 1. Computes student model forward pass
485
+ 2. If teacher available, computes teacher forward pass (no grad)
486
+ 3. Calculates per-token weights based on entropy differences
487
+ 4. Applies weights to the loss
488
+ """
489
+ # Standard loss computation from parent class (no entropy weighting)
490
+ if not self.use_entropy_weighting or self.teacher_model is None:
491
+ return super().compute_loss(model, inputs, return_outputs, num_items_in_batch)
492
+
493
+ # Get labels - SFTTrainer should have already set this up
494
+ if "labels" not in inputs:
495
+ raise ValueError(
496
+ "Expected 'labels' in inputs but not found. This usually means your data collator "
497
+ "is not properly configured. Please ensure you're using the correct collator."
498
+ )
499
+ labels = inputs["labels"]
500
+
501
+ # ===== Entropy-weighted loss computation with PARALLEL execution =====
502
+
503
+ # Get teacher's primary device
504
+ if isinstance(self.teacher_device, list):
505
+ primary_teacher_device = self.teacher_device[0]
506
+ else:
507
+ primary_teacher_device = self.teacher_device
508
+
509
+ # Prepare teacher inputs (move to teacher device early)
510
+ teacher_input_ids = inputs["input_ids"].to(primary_teacher_device, non_blocking=True)
511
+ teacher_attention_mask = inputs.get("attention_mask", None)
512
+ if teacher_attention_mask is not None:
513
+ teacher_attention_mask = teacher_attention_mask.to(primary_teacher_device, non_blocking=True)
514
+
515
+ # Get student device
516
+ student_device = next(model.parameters()).device
517
+
518
+ # Create CUDA streams for parallel execution
519
+ student_stream = torch.cuda.Stream(device=student_device)
520
+ teacher_stream = torch.cuda.Stream(device=primary_teacher_device)
521
+
522
+ # Containers for outputs
523
+ student_outputs_container = [None]
524
+ teacher_logits_topk_container = [None]
525
+
526
+ # ===== PARALLEL EXECUTION BLOCK =====
527
+
528
+ # Containers for student top-k logits
529
+ student_logits_topk_container = [None]
530
+ shift_student_logits_container = [None]
531
+
532
+ # 1. Student forward pass (with gradient) on student stream
533
+ with torch.cuda.stream(student_stream):
534
+ inputs["use_cache"] = False
535
+ student_outputs_container[0] = model(**inputs)
536
+
537
+ # CRITICAL: Extract student logits and do shift + top-k in the same stream
538
+ # This avoids keeping full vocab logits in memory
539
+ student_logits_full = student_outputs_container[0].logits # [B, T, V]
540
+
541
+ # Shift student logits for loss computation (needs full vocab for CE)
542
+ shift_student_logits_container[0] = student_logits_full[..., :-1, :].contiguous()
543
+
544
+ # Extract top-k from shifted student logits for entropy (memory efficient)
545
+ with torch.no_grad():
546
+ top_k = getattr(self, 'entropy_top_k', 64)
547
+ student_logits_topk_container[0], _ = torch.topk(
548
+ shift_student_logits_container[0], k=top_k, dim=-1
549
+ ) # [B, T-1, k]
550
+
551
+ # 2. Teacher forward pass (no gradient) on teacher stream - RUNS IN PARALLEL!
552
+ with torch.cuda.stream(teacher_stream):
553
+ with torch.no_grad():
554
+ # Prepare teacher inputs - explicitly pass only what's needed
555
+ teacher_outputs = self.teacher_model(
556
+ input_ids=teacher_input_ids,
557
+ attention_mask=teacher_attention_mask,
558
+ use_cache=False, # Explicitly disable cache for training
559
+ )
560
+ teacher_logits_full = teacher_outputs.logits # [batch_size, seq_len, vocab_size] on teacher device
561
+
562
+ # CRITICAL MEMORY OPTIMIZATION: Extract top-k BEFORE shift to save memory
563
+ # This avoids creating a full [B, T, V] contiguous tensor
564
+ top_k = getattr(self, 'entropy_top_k', 64)
565
+
566
+ # Get top-k from full logits [B, T, V] -> [B, T, k]
567
+ teacher_logits_topk_full, _ = torch.topk(teacher_logits_full, k=top_k, dim=-1)
568
+
569
+ # Immediately delete full logits (free 4+ GB!)
570
+ del teacher_logits_full
571
+ del teacher_outputs # Also free the outputs object
572
+
573
+ # Now shift the much smaller top-k tensor [B, T, k] -> [B, T-1, k]
574
+ shift_teacher_logits_topk = teacher_logits_topk_full[..., :-1, :].contiguous()
575
+
576
+ # Free the unshifted top-k
577
+ del teacher_logits_topk_full
578
+
579
+ # Move to student device (asynchronously)
580
+ teacher_logits_topk_container[0] = shift_teacher_logits_topk.to(
581
+ student_device, non_blocking=True
582
+ )
583
+
584
+ # Explicitly delete to free teacher device memory immediately
585
+ del shift_teacher_logits_topk
586
+
587
+ # ===== SYNCHRONIZATION POINT =====
588
+ # Wait for both streams to complete before using their results
589
+ student_stream.synchronize()
590
+ teacher_stream.synchronize()
591
+
592
+ # Extract results from containers
593
+ outputs = student_outputs_container[0]
594
+ shift_teacher_logits_topk = teacher_logits_topk_container[0]
595
+ shift_student_logits = shift_student_logits_container[0]
596
+ student_logits_topk = student_logits_topk_container[0]
597
+
598
+ # 3. Compute entropy-based weights
599
+ # Shift labels
600
+ shift_labels = labels[..., 1:].contiguous()
601
+
602
+ # Calculate weights using top-k versions (massive memory savings)
603
+ weights = self.compute_entropy_weights_topk(
604
+ student_logits_topk, # [B, T-1, k]
605
+ shift_teacher_logits_topk, # [B, T-1, k]
606
+ shift_labels,
607
+ ) # [batch_size, seq_len-1]
608
+
609
+ # 4. Compute weighted cross-entropy loss
610
+ # Flatten for loss computation
611
+ shift_student_logits_flat = shift_student_logits.view(-1, shift_student_logits.size(-1))
612
+ shift_labels_flat = shift_labels.view(-1)
613
+ weights_flat = weights.view(-1)
614
+
615
+ # Compute per-token cross-entropy (no reduction)
616
+ loss_fct = nn.CrossEntropyLoss(reduction='none')
617
+ per_token_loss = loss_fct(shift_student_logits_flat, shift_labels_flat)
618
+
619
+ # Apply weights
620
+ weighted_loss = per_token_loss * weights_flat
621
+
622
+ # Create proper mask considering both labels and attention_mask
623
+ if "attention_mask" in inputs:
624
+ # Shift attention mask to align with shifted labels
625
+ shift_attention_mask = inputs["attention_mask"][..., 1:].contiguous()
626
+ valid_mask = (shift_labels != -100) & (shift_attention_mask == 1)
627
+ else:
628
+ # Fallback if no attention mask (e.g., padding-free training)
629
+ valid_mask = (shift_labels != -100)
630
+
631
+ valid_mask_flat = valid_mask.view(-1).float()
632
+
633
+ # Compute final loss: normalize by sum of weights (not by token count)
634
+ # CRITICAL: Normalizing by weight_sum instead of token count ensures that
635
+ # the relative importance defined by weights is actually respected.
636
+ # If we normalized by token count, small-weight tokens would still contribute
637
+ # equally in the denominator, defeating the purpose of weighting.
638
+ weighted_loss_masked = weighted_loss * valid_mask_flat
639
+ weight_sum = (weights_flat * valid_mask_flat).sum()
640
+
641
+ loss = weighted_loss_masked.sum() / weight_sum.clamp(min=1e-8)
642
+
643
+ # 5. Handle auxiliary loss if present (e.g., MoE)
644
+ if self.aux_loss_enabled and hasattr(outputs, 'aux_loss'):
645
+ loss = loss + outputs.aux_loss
646
+
647
+ # Log statistics (for monitoring) - simplified to avoid redundant entropy computation
648
+ if self.model.training:
649
+ with torch.no_grad():
650
+ # Only log weight statistics to avoid redundant entropy computation
651
+ # (entropy was already computed in compute_entropy_weights above)
652
+ weight_mean = weights[shift_labels != -100].mean().item()
653
+ self._metrics['train']['avg_weight'].append(weight_mean)
654
+
655
+ # Optional: Can enable these if you need entropy logging
656
+ # but note this will recompute entropy and use extra memory
657
+ # H_s_mean = self.compute_entropy(shift_student_logits).mean().item()
658
+ # H_t_mean = self.compute_entropy(shift_teacher_logits).mean().item()
659
+ # self._metrics['train']['student_entropy'].append(H_s_mean)
660
+ # self._metrics['train']['teacher_entropy'].append(H_t_mean)
661
+
662
+ return (loss, outputs) if return_outputs else loss
train/test_on_math.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ vLLM evaluation with correct parameter name: max_model_length
4
+ """
5
+ import os
6
+ import shutil
7
+ from datetime import timedelta
8
+
9
+ from lighteval.logging.evaluation_tracker import EvaluationTracker
10
+ from lighteval.models.vllm.vllm_model import VLLMModelConfig
11
+ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
12
+ from lighteval.utils.imports import is_package_available
13
+
14
# Build an Accelerate launcher only when the package is installed; the long
# (3000 s) process-group timeout guards against slow distributed startup.
if is_package_available("accelerate"):
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(
        kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]
    )
else:
    # No accelerate available: run without a distributed launcher.
    accelerator = None
21
+
22
+
23
def setup_environment():
    """Point CC/CXX at an available compiler toolchain and pin CUDA to GPU 0.

    Prefers the conda cross-compilers and falls back to the system
    gcc/g++; a variable is only set when a compiler is actually found.
    """
    compiler_candidates = (
        ("CC", ("x86_64-conda-linux-gnu-cc", "gcc")),
        ("CXX", ("x86_64-conda-linux-gnu-c++", "g++")),
    )
    for env_key, names in compiler_candidates:
        resolved = None
        for name in names:
            resolved = shutil.which(name)
            if resolved:
                break
        if resolved:
            os.environ[env_key] = resolved
    # Restrict the process to the first GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print("✓ Environment configured")
32
+
33
+
34
def main():
    """Run the lighteval math_500 benchmark against a vLLM-served model."""
    setup_environment()
    print("--- Starting vLLM evaluation ---")

    # Use max_model_length (not max_model_len!)
    max_model_length = 40960  # 40k tokens to handle 33658

    # Build each pipeline component up front so the configuration is explicit.
    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE if accelerator else ParallelismManager.NONE,
        # max_samples=10, # For testing
    )
    tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
    )
    vllm_config = VLLMModelConfig(
        model_name="Qwen/Qwen2.5-Math-7B-Instruct",
        dtype="bfloat16",
        tensor_parallel_size=1,
        gpu_memory_utilization=0.92,
        trust_remote_code=True,
        max_model_length=max_model_length,  # ✅ Correct parameter name!
    )

    pipeline = Pipeline(
        tasks="lighteval|math_500|0|1",
        pipeline_parameters=pipeline_params,
        evaluation_tracker=tracker,
        model_config=vllm_config,
    )

    print(f"📊 Config: max_model_length={max_model_length}")

    # Evaluate, persist, then display the results.
    pipeline.evaluate()
    pipeline.save_and_push_results()
    pipeline.show_results()

    print("✅ Done!")
68
+
69
+
70
# Script entry point.
if __name__ == "__main__":
    main()
train/test_ood_python_lora_rope.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import lighteval
2
+ from lighteval.logging.evaluation_tracker import EvaluationTracker
3
+ from lighteval.models.vllm.vllm_model import VLLMModelConfig
4
+ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
5
+ from lighteval.utils.imports import is_package_available
6
+ from peft import PeftModel
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
8
+ import os
9
+ import torch
10
+ import json
11
+
12
# Build an Accelerate launcher only when the package is installed; the long
# (3000 s) process-group timeout guards against slow distributed startup.
if is_package_available("accelerate"):
    from datetime import timedelta
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
else:
    # No accelerate available: run without a distributed launcher.
    accelerator = None
18
+
19
def merge_lora_if_needed():
    """Merge the LoRA adapter into the base model and preserve RoPE scaling.

    If a merged checkpoint with a config.json already exists, it is reused —
    unless its config lacks a ``rope_scaling`` entry, in which case the
    directory is deleted and rebuilt. Otherwise the base model and LoRA
    adapter are loaded, merged, saved, and the config is patched with linear
    RoPE scaling (factor 2.0, max positions doubled).

    Returns:
        str: Path to the merged model directory.
    """
    merged_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora-Merged"

    # Fast path: a merged checkpoint already exists on disk.
    if os.path.exists(os.path.join(merged_path, "config.json")):
        print(f"Merged model already exists at {merged_path}")

        # Verify RoPE scaling in existing merged model
        config_path = os.path.join(merged_path, "config.json")
        with open(config_path, 'r') as f:
            config = json.load(f)
        if 'rope_scaling' in config:
            print(f"✓ Existing merged model has RoPE scaling: {config['rope_scaling']}")
            print(f"✓ Max position embeddings: {config.get('max_position_embeddings', 'N/A')}")
        else:
            # Stale checkpoint without RoPE scaling: remove it and rebuild.
            print("⚠ Warning: Existing merged model does NOT have RoPE scaling config!")
            print(" Deleting and re-creating with RoPE scaling...")
            import shutil
            shutil.rmtree(merged_path)
            # Recursion is bounded: the path no longer exists, so the next
            # call takes the merge branch below.
            return merge_lora_if_needed()  # Recursive call to re-create

        return merged_path

    print("="*100)
    print("Merged model not found. Starting merge process...")
    print("="*100)

    lora_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora"

    # Step 1: Load base model
    print("\n[1/5] Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-Math-1.5B",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto"
    )

    # Step 2: Load LoRA adapter
    print("\n[2/5] Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, lora_path)

    # Step 3: Merge and unload (folds adapter weights into the base model).
    print("\n[3/5] Merging LoRA weights with base model...")
    merged_model = model.merge_and_unload()

    # Step 4: Save merged model
    print(f"\n[4/5] Saving merged model to {merged_path}...")
    os.makedirs(merged_path, exist_ok=True)
    merged_model.save_pretrained(merged_path, safe_serialization=True)

    # Step 5: Add RoPE scaling configuration
    print("\n[5/5] Adding RoPE scaling configuration...")
    merged_config_path = os.path.join(merged_path, "config.json")
    with open(merged_config_path, 'r') as f:
        merged_config = json.load(f)

    # ========== RoPE scaling: 4096 -> 8192, factor = 2.0 ==========
    merged_config['rope_scaling'] = {
        "type": "linear",
        "factor": 2.0
    }

    print(f"✓ Added RoPE scaling: {merged_config['rope_scaling']}")

    # Update max_position_embeddings from 4096 to 8192 (scaled by the factor).
    original_max_pos = merged_config.get('max_position_embeddings', 4096)
    scaling_factor = merged_config['rope_scaling']['factor']
    new_max_pos = int(original_max_pos * scaling_factor)
    merged_config['max_position_embeddings'] = new_max_pos
    print(f"✓ Updated max_position_embeddings: {original_max_pos} -> {new_max_pos}")

    # Save updated config
    with open(merged_config_path, 'w') as f:
        json.dump(merged_config, f, indent=2, ensure_ascii=False)

    # Save tokenizer alongside the merged weights so the directory is
    # self-contained for downstream loaders.
    print("Saving tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B", trust_remote_code=True)
    tokenizer.save_pretrained(merged_path)

    # Clean up memory: drop model references and release cached GPU blocks.
    del base_model
    del model
    del merged_model
    torch.cuda.empty_cache()

    print("\n" + "="*100)
    print("✓ Merge completed successfully!")
    print(f"✓ Merged model saved to: {merged_path}")
    print(f"✓ RoPE scaling config: {merged_config['rope_scaling']}")
    print(f"✓ Max position embeddings: {merged_config['max_position_embeddings']}")
    print("="*100 + "\n")

    return merged_path
114
+
115
def main():
    """Evaluate the merged LoRA model on math_500 via lighteval + vLLM."""
    # Set CUDA device FIRST before any CUDA operations
    os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    # Allow vLLM to exceed the model's declared max length (RoPE-scaled model).
    os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

    print("Checking for merged model...")
    merged_model_path = merge_lora_if_needed()

    # Detect number of GPUs (after CUDA_VISIBLE_DEVICES, so this counts
    # only the visible devices).
    num_gpus = torch.cuda.device_count()
    print(f"\n{'='*100}")
    print(f"Detected {num_gpus} GPU(s)")
    if num_gpus > 0:
        for i in range(num_gpus):
            print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"{'='*100}\n")

    # Read the merged model config to get max_model_length
    config_path = os.path.join(merged_model_path, "config.json")
    with open(config_path, 'r') as f:
        model_config_dict = json.load(f)
    max_position_embeddings = model_config_dict.get('max_position_embeddings', 4096)
    rope_scaling = model_config_dict.get('rope_scaling', None)

    print(f"Model max_position_embeddings: {max_position_embeddings}")
    print(f"Model RoPE scaling config: {rope_scaling}")

    # Use 8192 as max_model_length (the extended length used during training).
    max_model_length = 8192
    print(f"Using max_model_length: {max_model_length}\n")

    print("Setting up evaluation pipeline...")

    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=False,
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        custom_tasks_directory=None,
        max_samples=500
    )

    model_config = VLLMModelConfig(
        model_name=merged_model_path,
        dtype="bfloat16",
        max_model_length=max_model_length,  # use 8192
        trust_remote_code=True,
        tensor_parallel_size=num_gpus,
    )

    task = "lighteval|math_500|0"

    print(f"Using {num_gpus} GPU(s) with tensor parallelism")
    print(f"Task: {task}")
    print(f"Max model length: {max_model_length}\n")

    print("Creating pipeline...")
    pipeline = Pipeline(
        tasks=task,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )

    # Fix generation_size
    # NOTE(review): this reaches into the private `_docs` attribute of each
    # task object — presumably to cap per-sample generation length; verify
    # against the installed lighteval version before upgrading.
    print("Configuring generation parameters...")
    for task_name, task_obj in pipeline.tasks_dict.items():
        for doc in task_obj._docs:
            doc.generation_size = 2048

    print("\nStarting evaluation...")
    print("="*100)
    pipeline.evaluate()

    print("\nSaving results...")
    pipeline.save_and_push_results()

    print("\nShowing results...")
    pipeline.show_results()

    print("\n" + "="*100)
    print("✓ Evaluation completed!")
    print("="*100)
200
+ print("="*100)
201
+
202
# Script entry point.
if __name__ == "__main__":
    main()
train/train_qwen_verl_46k.py ADDED
File without changes
train/train_qwen_verl_46k.sh ADDED
File without changes
train/wandb/run-20251113_165350-n56lk6p0/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2025-11-13T16:53:50.599141389+08:00","level":"INFO","msg":"stream: starting","core version":"0.22.3"}
2
+ {"time":"2025-11-13T16:53:51.898899791+08:00","level":"INFO","msg":"stream: created new stream","id":"n56lk6p0"}
3
+ {"time":"2025-11-13T16:53:51.899200939+08:00","level":"INFO","msg":"handler: started","stream_id":"n56lk6p0"}
4
+ {"time":"2025-11-13T16:53:51.900823773+08:00","level":"INFO","msg":"stream: started","id":"n56lk6p0"}
5
+ {"time":"2025-11-13T16:53:51.90140837+08:00","level":"INFO","msg":"writer: started","stream_id":"n56lk6p0"}
6
+ {"time":"2025-11-13T16:53:51.901489184+08:00","level":"INFO","msg":"sender: started","stream_id":"n56lk6p0"}
train/wandb/run-20251113_171624-kgxigylp/files/wandb-metadata.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
3
+ "python": "CPython 3.10.19",
4
+ "startedAt": "2025-11-13T09:16:24.807116Z",
5
+ "args": [
6
+ "--model_name",
7
+ "Qwen/Qwen2.5-Math-1.5B",
8
+ "--dataset_name",
9
+ "openr1",
10
+ "--output_dir",
11
+ "./model_sft_save/Qwen2.5-Math-1.5B-Full-solution",
12
+ "--batch_size",
13
+ "2",
14
+ "--grad_accum",
15
+ "4",
16
+ "--learning_rate",
17
+ "5e-6",
18
+ "--epochs",
19
+ "1",
20
+ "--use_rope_scaling",
21
+ "--use_deepspeed",
22
+ "--deepspeed_config",
23
+ "deepspeed/dp_stage2.json",
24
+ "--use_wandb",
25
+ "--wandb_project",
26
+ "qwen-math-sft",
27
+ "--wandb_run_name",
28
+ "qwen2.5-1.5b-46k-fft-solution"
29
+ ],
30
+ "program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k.py",
31
+ "codePath": "train_qwen_46k.py",
32
+ "codePathLocal": "train_qwen_46k.py",
33
+ "email": "yaning1001@gmail.com",
34
+ "root": "/public/home/lshi/yoAI/projects/Online_CL/train",
35
+ "host": "gpu-h100-07",
36
+ "executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
37
+ "cpu_count": 96,
38
+ "cpu_count_logical": 96,
39
+ "gpu": "NVIDIA H100 80GB HBM3",
40
+ "gpu_count": 8,
41
+ "disk": {
42
+ "/": {
43
+ "total": "469407801344",
44
+ "used": "289841229824"
45
+ }
46
+ },
47
+ "memory": {
48
+ "total": "2164142350336"
49
+ },
50
+ "gpu_nvidia": [
51
+ {
52
+ "name": "NVIDIA H100 80GB HBM3",
53
+ "memoryTotal": "85520809984",
54
+ "cudaCores": 16896,
55
+ "architecture": "Hopper",
56
+ "uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
57
+ },
58
+ {
59
+ "name": "NVIDIA H100 80GB HBM3",
60
+ "memoryTotal": "85520809984",
61
+ "cudaCores": 16896,
62
+ "architecture": "Hopper",
63
+ "uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
64
+ },
65
+ {
66
+ "name": "NVIDIA H100 80GB HBM3",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper",
70
+ "uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
71
+ },
72
+ {
73
+ "name": "NVIDIA H100 80GB HBM3",
74
+ "memoryTotal": "85520809984",
75
+ "cudaCores": 16896,
76
+ "architecture": "Hopper",
77
+ "uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
78
+ },
79
+ {
80
+ "name": "NVIDIA H100 80GB HBM3",
81
+ "memoryTotal": "85520809984",
82
+ "cudaCores": 16896,
83
+ "architecture": "Hopper",
84
+ "uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
85
+ },
86
+ {
87
+ "name": "NVIDIA H100 80GB HBM3",
88
+ "memoryTotal": "85520809984",
89
+ "cudaCores": 16896,
90
+ "architecture": "Hopper",
91
+ "uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
92
+ },
93
+ {
94
+ "name": "NVIDIA H100 80GB HBM3",
95
+ "memoryTotal": "85520809984",
96
+ "cudaCores": 16896,
97
+ "architecture": "Hopper",
98
+ "uuid": "GPU-23628f74-fede-6431-ae15-2764fce29130"
99
+ },
100
+ {
101
+ "name": "NVIDIA H100 80GB HBM3",
102
+ "memoryTotal": "85520809984",
103
+ "cudaCores": 16896,
104
+ "architecture": "Hopper",
105
+ "uuid": "GPU-d18d570f-dd0f-0ff6-3401-561c9e799136"
106
+ }
107
+ ],
108
+ "cudaVersion": "12.4",
109
+ "slurm": {
110
+ "home": "/opt/gridview/slurm",
111
+ "pmix_direct_conn": "true",
112
+ "pmix_direct_conn_early": "false",
113
+ "pmix_direct_conn_ucx": "false",
114
+ "pmix_timeout": "3000"
115
+ },
116
+ "writerId": "tp3ukl31hffsi4h6hmw62zh3mmvd4ck1"
117
+ }
train/wandb/run-20251114_040305-jb702f8e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":16984},"_runtime":16984}
train/wandb/run-20251114_083110-mocjk23v/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":1157},"_runtime":1157}
train/wandb/run-20251114_085634-l1whc2fu/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-14T08:56:34.99307817+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmps5yzjy9e/port-683755.txt","pid":683755,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-11-14T08:56:34.993470483+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-683755-684391-740002659/socket","Net":"unix"}}
3
+ {"time":"2025-11-14T08:56:34.993683243+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":683755}
4
+ {"time":"2025-11-14T08:56:35.174882292+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-11-14T08:56:35.188631623+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"l1whc2fu","id":"1(@)"}
6
+ {"time":"2025-11-14T08:56:36.5336818+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"l1whc2fu","id":"1(@)"}
7
+ {"time":"2025-11-14T08:57:28.613569258+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-11-14T08:57:28.613821906+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-11-14T08:57:28.613882852+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-11-14T08:57:28.613904195+08:00","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-11-14T08:57:28.613981514+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-683755-684391-740002659/socket","Net":"unix"}}
12
+ {"time":"2025-11-14T08:57:31.434572086+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-11-14T08:57:31.43461066+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-11-14T08:57:31.43463779+08:00","level":"INFO","msg":"server is closed"}
train/wandb/run-20251114_093516-syhj5u87/files/config.yaml ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.22.3
4
+ e:
5
+ 8wbmn2ipfnapymj9hqf701ll90rovymt:
6
+ args:
7
+ - --model_name
8
+ - Qwen/Qwen2.5-1.5B
9
+ - --dataset_path
10
+ - ./datasets/openr1/Openr1-Math-46k-8192.jsonl
11
+ - --output_dir
12
+ - ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
13
+ - --batch_size
14
+ - "2"
15
+ - --grad_accum
16
+ - "4"
17
+ - --learning_rate
18
+ - "5e-6"
19
+ - --epochs
20
+ - "1"
21
+ - --use_entropy_weighting
22
+ - --teacher_model_path
23
+ - deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
24
+ - --entropy_weight_alpha
25
+ - "2.0"
26
+ - --entropy_weight_beta
27
+ - "0.3"
28
+ - --teacher_dtype
29
+ - bfloat16
30
+ - --use_deepspeed
31
+ - --deepspeed_config
32
+ - deepspeed/dp_stage2.json
33
+ - --use_wandb
34
+ - --wandb_project
35
+ - qwen-math-entropy-sft
36
+ - --wandb_run_name
37
+ - qwen2.5-1.5b-46k-entropy-solution
38
+ codePath: train_qwen_46k_weight.py
39
+ codePathLocal: train_qwen_46k_weight.py
40
+ cpu_count: 96
41
+ cpu_count_logical: 96
42
+ cudaVersion: "12.4"
43
+ disk:
44
+ /:
45
+ total: "469407801344"
46
+ used: "288164167680"
47
+ email: yaning1001@gmail.com
48
+ executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
49
+ gpu: NVIDIA H100 80GB HBM3
50
+ gpu_count: 6
51
+ gpu_nvidia:
52
+ - architecture: Hopper
53
+ cudaCores: 16896
54
+ memoryTotal: "85520809984"
55
+ name: NVIDIA H100 80GB HBM3
56
+ uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
57
+ - architecture: Hopper
58
+ cudaCores: 16896
59
+ memoryTotal: "85520809984"
60
+ name: NVIDIA H100 80GB HBM3
61
+ uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
62
+ - architecture: Hopper
63
+ cudaCores: 16896
64
+ memoryTotal: "85520809984"
65
+ name: NVIDIA H100 80GB HBM3
66
+ uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
67
+ - architecture: Hopper
68
+ cudaCores: 16896
69
+ memoryTotal: "85520809984"
70
+ name: NVIDIA H100 80GB HBM3
71
+ uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
72
+ - architecture: Hopper
73
+ cudaCores: 16896
74
+ memoryTotal: "85520809984"
75
+ name: NVIDIA H100 80GB HBM3
76
+ uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
77
+ - architecture: Hopper
78
+ cudaCores: 16896
79
+ memoryTotal: "85520809984"
80
+ name: NVIDIA H100 80GB HBM3
81
+ uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
82
+ host: gpu-h100-07
83
+ memory:
84
+ total: "2164142350336"
85
+ os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
86
+ program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
87
+ python: CPython 3.10.19
88
+ root: /public/home/lshi/yoAI/projects/Online_CL/train
89
+ slurm:
90
+ cluster_name: cluster_admin1
91
+ conf: /opt/gridview/slurm/etc/slurm.conf
92
+ cpu_bind: quiet,mask_cpu:0x000000000000000000000001
93
+ cpu_bind_list: "0x000000000000000000000001"
94
+ cpu_bind_type: 'mask_cpu:'
95
+ cpu_bind_verbose: quiet
96
+ cpus_on_node: "1"
97
+ distribution: cyclic
98
+ gtids: "0"
99
+ home: /opt/gridview/slurm
100
+ job_account: seu_qli
101
+ job_cpus_per_node: "1"
102
+ job_gid: "2026"
103
+ job_id: "8428"
104
+ job_name: bash
105
+ job_nodelist: gpu-h100-07
106
+ job_num_nodes: "1"
107
+ job_partition: H100
108
+ job_qos: normal
109
+ job_uid: "2019"
110
+ job_user: lshi
111
+ jobid: "8428"
112
+ launch_node_ipaddr: 172.16.254.194
113
+ localid: "0"
114
+ nnodes: "1"
115
+ nodeid: "0"
116
+ nodelist: gpu-h100-07
117
+ nprocs: "1"
118
+ ntasks: "1"
119
+ pmix_direct_conn: "true"
120
+ pmix_direct_conn_early: "false"
121
+ pmix_direct_conn_ucx: "false"
122
+ pmix_timeout: "3000"
123
+ prio_process: "0"
124
+ procid: "0"
125
+ pty_port: "43139"
126
+ pty_win_col: "146"
127
+ pty_win_row: "21"
128
+ srun_comm_host: 172.16.254.194
129
+ srun_comm_port: "34989"
130
+ step_gpus: 0,1,2,3,4,5
131
+ step_id: "0"
132
+ step_launcher_port: "34989"
133
+ step_nodelist: gpu-h100-07
134
+ step_num_nodes: "1"
135
+ step_num_tasks: "1"
136
+ step_tasks_per_node: "1"
137
+ stepid: "0"
138
+ submit_dir: /public/home/lshi/yoAI/projects
139
+ submit_host: admin1
140
+ task_pid: "649671"
141
+ tasks_per_node: "1"
142
+ topology_addr: gpu-h100-07
143
+ topology_addr_pattern: node
144
+ umask: "0022"
145
+ working_cluster: cluster_admin1:172.16.254.194:6817:9216:101
146
+ startedAt: "2025-11-14T01:35:16.982120Z"
147
+ writerId: 8wbmn2ipfnapymj9hqf701ll90rovymt
148
+ m: []
149
+ python_version: 3.10.19
150
+ t:
151
+ "1":
152
+ - 1
153
+ - 11
154
+ - 41
155
+ - 49
156
+ - 51
157
+ - 71
158
+ - 84
159
+ - 98
160
+ - 105
161
+ "2":
162
+ - 1
163
+ - 11
164
+ - 41
165
+ - 49
166
+ - 51
167
+ - 71
168
+ - 84
169
+ - 98
170
+ - 105
171
+ "3":
172
+ - 13
173
+ "4": 3.10.19
174
+ "5": 0.22.3
175
+ "6": 4.57.1
176
+ "10":
177
+ - 20
178
+ "12": 0.22.3
179
+ "13": linux-x86_64
train/wandb/run-20251114_093516-syhj5u87/logs/debug-core.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-14T09:35:17.03441142+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpw3beozyg/port-732696.txt","pid":732696,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-11-14T09:35:17.034758287+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-732696-733205-3319755749/socket","Net":"unix"}}
3
+ {"time":"2025-11-14T09:35:17.034951673+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":732696}
4
+ {"time":"2025-11-14T09:35:17.211537499+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-11-14T09:35:17.220353582+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"syhj5u87","id":"1(@)"}
6
+ {"time":"2025-11-14T09:35:18.22989479+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"syhj5u87","id":"1(@)"}
7
+ {"time":"2025-11-14T09:36:02.014981461+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-11-14T09:36:02.015245162+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-11-14T09:36:02.015313878+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-11-14T09:36:02.015339332+08:00","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-11-14T09:36:02.015416344+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-732696-733205-3319755749/socket","Net":"unix"}}
12
+ {"time":"2025-11-14T09:36:06.194928818+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
train/wandb/run-20251114_103643-cvm4116u/files/config.yaml ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: Qwen/Qwen2.5-1.5B
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.22.3
6
+ e:
7
+ 7c7ug24k5x5ruwahnf33q520l30fph0h:
8
+ args:
9
+ - --model_name
10
+ - Qwen/Qwen2.5-1.5B
11
+ - --dataset_path
12
+ - ./datasets/openr1/Openr1-Math-46k-8192.jsonl
13
+ - --output_dir
14
+ - ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
15
+ - --batch_size
16
+ - "2"
17
+ - --grad_accum
18
+ - "4"
19
+ - --learning_rate
20
+ - "5e-6"
21
+ - --epochs
22
+ - "1"
23
+ - --use_entropy_weighting
24
+ - --teacher_model_path
25
+ - deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
26
+ - --entropy_weight_alpha
27
+ - "2.0"
28
+ - --entropy_weight_beta
29
+ - "0.3"
30
+ - --teacher_dtype
31
+ - bfloat16
32
+ - --use_deepspeed
33
+ - --deepspeed_config
34
+ - deepspeed/dp_stage2.json
35
+ - --use_wandb
36
+ - --wandb_project
37
+ - qwen-math-entropy-sft
38
+ - --wandb_run_name
39
+ - qwen2.5-1.5b-46k-entropy-solution
40
+ codePath: train_qwen_46k_weight.py
41
+ codePathLocal: train_qwen_46k_weight.py
42
+ cpu_count: 96
43
+ cpu_count_logical: 96
44
+ cudaVersion: "12.4"
45
+ disk:
46
+ /:
47
+ total: "469407801344"
48
+ used: "288221097984"
49
+ email: yaning1001@gmail.com
50
+ executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
51
+ gpu: NVIDIA H100 80GB HBM3
52
+ gpu_count: 6
53
+ gpu_nvidia:
54
+ - architecture: Hopper
55
+ cudaCores: 16896
56
+ memoryTotal: "85520809984"
57
+ name: NVIDIA H100 80GB HBM3
58
+ uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
59
+ - architecture: Hopper
60
+ cudaCores: 16896
61
+ memoryTotal: "85520809984"
62
+ name: NVIDIA H100 80GB HBM3
63
+ uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
64
+ - architecture: Hopper
65
+ cudaCores: 16896
66
+ memoryTotal: "85520809984"
67
+ name: NVIDIA H100 80GB HBM3
68
+ uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
69
+ - architecture: Hopper
70
+ cudaCores: 16896
71
+ memoryTotal: "85520809984"
72
+ name: NVIDIA H100 80GB HBM3
73
+ uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
74
+ - architecture: Hopper
75
+ cudaCores: 16896
76
+ memoryTotal: "85520809984"
77
+ name: NVIDIA H100 80GB HBM3
78
+ uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
79
+ - architecture: Hopper
80
+ cudaCores: 16896
81
+ memoryTotal: "85520809984"
82
+ name: NVIDIA H100 80GB HBM3
83
+ uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
84
+ host: gpu-h100-07
85
+ memory:
86
+ total: "2164142350336"
87
+ os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
88
+ program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
89
+ python: CPython 3.10.19
90
+ root: /public/home/lshi/yoAI/projects/Online_CL/train
91
+ slurm:
92
+ cluster_name: cluster_admin1
93
+ conf: /opt/gridview/slurm/etc/slurm.conf
94
+ cpu_bind: quiet,mask_cpu:0x000000000000000000000001
95
+ cpu_bind_list: "0x000000000000000000000001"
96
+ cpu_bind_type: 'mask_cpu:'
97
+ cpu_bind_verbose: quiet
98
+ cpus_on_node: "1"
99
+ distribution: cyclic
100
+ gtids: "0"
101
+ home: /opt/gridview/slurm
102
+ job_account: seu_qli
103
+ job_cpus_per_node: "1"
104
+ job_gid: "2026"
105
+ job_id: "8428"
106
+ job_name: bash
107
+ job_nodelist: gpu-h100-07
108
+ job_num_nodes: "1"
109
+ job_partition: H100
110
+ job_qos: normal
111
+ job_uid: "2019"
112
+ job_user: lshi
113
+ jobid: "8428"
114
+ launch_node_ipaddr: 172.16.254.194
115
+ localid: "0"
116
+ nnodes: "1"
117
+ nodeid: "0"
118
+ nodelist: gpu-h100-07
119
+ nprocs: "1"
120
+ ntasks: "1"
121
+ pmix_direct_conn: "true"
122
+ pmix_direct_conn_early: "false"
123
+ pmix_direct_conn_ucx: "false"
124
+ pmix_timeout: "3000"
125
+ prio_process: "0"
126
+ procid: "0"
127
+ pty_port: "43139"
128
+ pty_win_col: "146"
129
+ pty_win_row: "21"
130
+ srun_comm_host: 172.16.254.194
131
+ srun_comm_port: "34989"
132
+ step_gpus: 0,1,2,3,4,5
133
+ step_id: "0"
134
+ step_launcher_port: "34989"
135
+ step_nodelist: gpu-h100-07
136
+ step_num_nodes: "1"
137
+ step_num_tasks: "1"
138
+ step_tasks_per_node: "1"
139
+ stepid: "0"
140
+ submit_dir: /public/home/lshi/yoAI/projects
141
+ submit_host: admin1
142
+ task_pid: "649671"
143
+ tasks_per_node: "1"
144
+ topology_addr: gpu-h100-07
145
+ topology_addr_pattern: node
146
+ umask: "0022"
147
+ working_cluster: cluster_admin1:172.16.254.194:6817:9216:101
148
+ startedAt: "2025-11-14T02:36:43.740743Z"
149
+ writerId: 7c7ug24k5x5ruwahnf33q520l30fph0h
150
+ m:
151
+ - "1": train/global_step
152
+ "6":
153
+ - 3
154
+ "7": []
155
+ - "2": '*'
156
+ "5": 1
157
+ "6":
158
+ - 1
159
+ "7": []
160
+ python_version: 3.10.19
161
+ t:
162
+ "1":
163
+ - 1
164
+ - 11
165
+ - 41
166
+ - 49
167
+ - 51
168
+ - 71
169
+ - 84
170
+ - 98
171
+ - 105
172
+ "2":
173
+ - 1
174
+ - 11
175
+ - 41
176
+ - 49
177
+ - 51
178
+ - 71
179
+ - 84
180
+ - 98
181
+ - 105
182
+ "3":
183
+ - 7
184
+ - 13
185
+ - 19
186
+ - 66
187
+ "4": 3.10.19
188
+ "5": 0.22.3
189
+ "6": 4.57.1
190
+ "9":
191
+ "1": transformers_trainer
192
+ "10":
193
+ - 20
194
+ "12": 0.22.3
195
+ "13": linux-x86_64
196
+ accelerator_config:
197
+ value:
198
+ dispatch_batches: null
199
+ even_batches: true
200
+ gradient_accumulation_kwargs: null
201
+ non_blocking: false
202
+ split_batches: false
203
+ use_seedable_sampler: true
204
+ activation_offloading:
205
+ value: false
206
+ adafactor:
207
+ value: false
208
+ adam_beta1:
209
+ value: 0.9
210
+ adam_beta2:
211
+ value: 0.999
212
+ adam_epsilon:
213
+ value: 1e-08
214
+ add_cross_attention:
215
+ value: false
216
+ architectures:
217
+ value:
218
+ - Qwen2ForCausalLM
219
+ assistant_only_loss:
220
+ value: false
221
+ attention_dropout:
222
+ value: 0
223
+ auto_find_batch_size:
224
+ value: false
225
+ average_tokens_across_devices:
226
+ value: true
227
+ bad_words_ids:
228
+ value: null
229
+ batch_eval_metrics:
230
+ value: false
231
+ begin_suppress_tokens:
232
+ value: null
233
+ bf16:
234
+ value: true
235
+ bf16_full_eval:
236
+ value: false
237
+ bos_token_id:
238
+ value: null
239
+ chat_template_path:
240
+ value: null
241
+ chunk_size_feed_forward:
242
+ value: 0
243
+ completion_only_loss:
244
+ value: null
245
+ cross_attention_hidden_size:
246
+ value: null
247
+ data_seed:
248
+ value: null
249
+ dataloader_drop_last:
250
+ value: false
251
+ dataloader_num_workers:
252
+ value: 0
253
+ dataloader_persistent_workers:
254
+ value: false
255
+ dataloader_pin_memory:
256
+ value: true
257
+ dataloader_prefetch_factor:
258
+ value: null
259
+ dataset_kwargs:
260
+ value: null
261
+ dataset_num_proc:
262
+ value: null
263
+ dataset_text_field:
264
+ value: null
265
+ ddp_backend:
266
+ value: null
267
+ ddp_broadcast_buffers:
268
+ value: null
269
+ ddp_bucket_cap_mb:
270
+ value: null
271
+ ddp_find_unused_parameters:
272
+ value: null
273
+ ddp_timeout:
274
+ value: 1800
275
+ debug:
276
+ value: []
277
+ decoder_start_token_id:
278
+ value: null
279
+ deepspeed:
280
+ value: deepspeed/dp_stage2.json
281
+ disable_tqdm:
282
+ value: false
283
+ diversity_penalty:
284
+ value: 0
285
+ do_eval:
286
+ value: true
287
+ do_predict:
288
+ value: false
289
+ do_sample:
290
+ value: false
291
+ do_train:
292
+ value: false
293
+ dtype:
294
+ value: bfloat16
295
+ early_stopping:
296
+ value: false
297
+ encoder_no_repeat_ngram_size:
298
+ value: 0
299
+ entropy_top_k:
300
+ value: 64
301
+ entropy_weight_alpha:
302
+ value: 2
303
+ entropy_weight_beta:
304
+ value: 0.3
305
+ eos_token:
306
+ value: <EOS_TOKEN>
307
+ eos_token_id:
308
+ value: 151643
309
+ eval_accumulation_steps:
310
+ value: null
311
+ eval_delay:
312
+ value: 0
313
+ eval_do_concat_batches:
314
+ value: true
315
+ eval_on_start:
316
+ value: false
317
+ eval_packing:
318
+ value: null
319
+ eval_steps:
320
+ value: 25
321
+ eval_strategy:
322
+ value: steps
323
+ eval_use_gather_object:
324
+ value: false
325
+ exponential_decay_length_penalty:
326
+ value: null
327
+ finetuning_task:
328
+ value: null
329
+ forced_bos_token_id:
330
+ value: null
331
+ forced_eos_token_id:
332
+ value: null
333
+ fp16:
334
+ value: false
335
+ fp16_backend:
336
+ value: auto
337
+ fp16_full_eval:
338
+ value: false
339
+ fp16_opt_level:
340
+ value: O1
341
+ fsdp:
342
+ value: []
343
+ fsdp_config:
344
+ value:
345
+ min_num_params: 0
346
+ xla: false
347
+ xla_fsdp_grad_ckpt: false
348
+ xla_fsdp_v2: false
349
+ fsdp_min_num_params:
350
+ value: 0
351
+ fsdp_transformer_layer_cls_to_wrap:
352
+ value: null
353
+ full_determinism:
354
+ value: false
355
+ gradient_accumulation_steps:
356
+ value: 4
357
+ gradient_checkpointing:
358
+ value: true
359
+ gradient_checkpointing_kwargs:
360
+ value:
361
+ use_reentrant: false
362
+ greater_is_better:
363
+ value: false
364
+ group_by_length:
365
+ value: false
366
+ half_precision_backend:
367
+ value: auto
368
+ hidden_act:
369
+ value: silu
370
+ hidden_size:
371
+ value: 1536
372
+ hub_always_push:
373
+ value: false
374
+ hub_model_id:
375
+ value: null
376
+ hub_private_repo:
377
+ value: null
378
+ hub_revision:
379
+ value: null
380
+ hub_strategy:
381
+ value: every_save
382
+ hub_token:
383
+ value: <HUB_TOKEN>
384
+ id2label:
385
+ value:
386
+ "0": LABEL_0
387
+ "1": LABEL_1
388
+ ignore_data_skip:
389
+ value: false
390
+ include_for_metrics:
391
+ value: []
392
+ include_inputs_for_metrics:
393
+ value: false
394
+ include_num_input_tokens_seen:
395
+ value: "no"
396
+ include_tokens_per_second:
397
+ value: false
398
+ initializer_range:
399
+ value: 0.02
400
+ intermediate_size:
401
+ value: 8960
402
+ is_decoder:
403
+ value: false
404
+ is_encoder_decoder:
405
+ value: false
406
+ jit_mode_eval:
407
+ value: false
408
+ label_names:
409
+ value: null
410
+ label_smoothing_factor:
411
+ value: 0
412
+ label2id:
413
+ value:
414
+ LABEL_0: 0
415
+ LABEL_1: 1
416
+ layer_types:
417
+ value:
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ - full_attention
431
+ - full_attention
432
+ - full_attention
433
+ - full_attention
434
+ - full_attention
435
+ - full_attention
436
+ - full_attention
437
+ - full_attention
438
+ - full_attention
439
+ - full_attention
440
+ - full_attention
441
+ - full_attention
442
+ - full_attention
443
+ - full_attention
444
+ - full_attention
445
+ - full_attention
446
+ learning_rate:
447
+ value: 5e-06
448
+ length_column_name:
449
+ value: length
450
+ length_penalty:
451
+ value: 1
452
+ liger_kernel_config:
453
+ value: null
454
+ load_best_model_at_end:
455
+ value: true
456
+ local_rank:
457
+ value: 0
458
+ log_level:
459
+ value: passive
460
+ log_level_replica:
461
+ value: warning
462
+ log_on_each_node:
463
+ value: true
464
+ logging_dir:
465
+ value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_10-36-48_gpu-h100-07
466
+ logging_first_step:
467
+ value: false
468
+ logging_nan_inf_filter:
469
+ value: true
470
+ logging_steps:
471
+ value: 5
472
+ logging_strategy:
473
+ value: steps
474
+ loss_type:
475
+ value: nll
476
+ lr_scheduler_type:
477
+ value: cosine
478
+ max_grad_norm:
479
+ value: 1
480
+ max_length:
481
+ value: 8192
482
+ max_position_embeddings:
483
+ value: 131072
484
+ max_steps:
485
+ value: -1
486
+ max_window_layers:
487
+ value: 28
488
+ metric_for_best_model:
489
+ value: eval_loss
490
+ min_length:
491
+ value: 0
492
+ model/num_parameters:
493
+ value: 1543298048
494
+ model_init_kwargs:
495
+ value:
496
+ attn_implementation: sdpa
497
+ rope_scaling: null
498
+ torch_dtype: torch.bfloat16
499
+ trust_remote_code: true
500
+ model_type:
501
+ value: qwen2
502
+ mp_parameters:
503
+ value: ""
504
+ neftune_noise_alpha:
505
+ value: null
506
+ no_cuda:
507
+ value: false
508
+ no_repeat_ngram_size:
509
+ value: 0
510
+ num_attention_heads:
511
+ value: 12
512
+ num_beam_groups:
513
+ value: 1
514
+ num_beams:
515
+ value: 1
516
+ num_hidden_layers:
517
+ value: 28
518
+ num_key_value_heads:
519
+ value: 2
520
+ num_return_sequences:
521
+ value: 1
522
+ num_train_epochs:
523
+ value: 1
524
+ optim:
525
+ value: adamw_torch
526
+ optim_args:
527
+ value: null
528
+ optim_target_modules:
529
+ value: null
530
+ output_attentions:
531
+ value: false
532
+ output_dir:
533
+ value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
534
+ output_hidden_states:
535
+ value: false
536
+ output_scores:
537
+ value: false
538
+ overwrite_output_dir:
539
+ value: false
540
+ packing:
541
+ value: false
542
+ packing_strategy:
543
+ value: bfd
544
+ pad_to_multiple_of:
545
+ value: null
546
+ pad_token:
547
+ value: <PAD_TOKEN>
548
+ pad_token_id:
549
+ value: 151643
550
+ padding_free:
551
+ value: false
552
+ parallelism_config:
553
+ value: null
554
+ past_index:
555
+ value: -1
556
+ per_device_eval_batch_size:
557
+ value: 2
558
+ per_device_train_batch_size:
559
+ value: 2
560
+ per_gpu_eval_batch_size:
561
+ value: null
562
+ per_gpu_train_batch_size:
563
+ value: null
564
+ prediction_loss_only:
565
+ value: false
566
+ prefix:
567
+ value: null
568
+ problem_type:
569
+ value: null
570
+ project:
571
+ value: huggingface
572
+ push_to_hub:
573
+ value: false
574
+ push_to_hub_model_id:
575
+ value: null
576
+ push_to_hub_organization:
577
+ value: null
578
+ push_to_hub_token:
579
+ value: <PUSH_TO_HUB_TOKEN>
580
+ ray_scope:
581
+ value: last
582
+ remove_invalid_values:
583
+ value: false
584
+ remove_unused_columns:
585
+ value: true
586
+ repetition_penalty:
587
+ value: 1
588
+ report_to:
589
+ value:
590
+ - wandb
591
+ restore_callback_states_from_checkpoint:
592
+ value: false
593
+ resume_from_checkpoint:
594
+ value: null
595
+ return_dict:
596
+ value: true
597
+ return_dict_in_generate:
598
+ value: false
599
+ rms_norm_eps:
600
+ value: 1e-06
601
+ rope_scaling:
602
+ value: null
603
+ rope_theta:
604
+ value: 1e+06
605
+ run_name:
606
+ value: qwen2.5-1.5b-46k-entropy-solution
607
+ save_on_each_node:
608
+ value: false
609
+ save_only_model:
610
+ value: false
611
+ save_safetensors:
612
+ value: true
613
+ save_steps:
614
+ value: 50
615
+ save_strategy:
616
+ value: steps
617
+ save_total_limit:
618
+ value: 2
619
+ seed:
620
+ value: 42
621
+ sep_token_id:
622
+ value: null
623
+ skip_memory_metrics:
624
+ value: true
625
+ sliding_window:
626
+ value: null
627
+ suppress_tokens:
628
+ value: null
629
+ task_specific_params:
630
+ value: null
631
+ teacher_dtype:
632
+ value: bfloat16
633
+ teacher_model_path:
634
+ value: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
635
+ temperature:
636
+ value: 1
637
+ tf_legacy_loss:
638
+ value: false
639
+ tf32:
640
+ value: null
641
+ tie_encoder_decoder:
642
+ value: false
643
+ tie_word_embeddings:
644
+ value: true
645
+ tokenizer_class:
646
+ value: null
647
+ top_k:
648
+ value: 50
649
+ top_p:
650
+ value: 1
651
+ torch_compile:
652
+ value: false
653
+ torch_compile_backend:
654
+ value: null
655
+ torch_compile_mode:
656
+ value: null
657
+ torch_empty_cache_steps:
658
+ value: null
659
+ torchdynamo:
660
+ value: null
661
+ torchscript:
662
+ value: false
663
+ tpu_metrics_debug:
664
+ value: false
665
+ tpu_num_cores:
666
+ value: null
667
+ trackio_space_id:
668
+ value: trackio
669
+ transformers_version:
670
+ value: 4.57.1
671
+ typical_p:
672
+ value: 1
673
+ use_bfloat16:
674
+ value: false
675
+ use_cache:
676
+ value: true
677
+ use_cpu:
678
+ value: false
679
+ use_entropy_weighting:
680
+ value: true
681
+ use_legacy_prediction_loop:
682
+ value: false
683
+ use_liger_kernel:
684
+ value: false
685
+ use_mps_device:
686
+ value: false
687
+ use_mrope:
688
+ value: false
689
+ use_sliding_window:
690
+ value: false
691
+ vocab_size:
692
+ value: 151665
693
+ warmup_ratio:
694
+ value: 0.03
695
+ warmup_steps:
696
+ value: 0
697
+ weight_decay:
698
+ value: 0.01
train/wandb/run-20251114_103643-cvm4116u/files/output.log ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 📝 Loading tokenizer...
2
+ 📦 Loading local dataset from: ./datasets/openr1/Openr1-Math-46k-8192.jsonl
3
+ 📊 Train: 45334 | Eval: 458
4
+ 🔄 Formatting dataset...
5
+
6
+ 🔍 MASKED PART (prompt/question):
7
+ <|im_start|>system
8
+ Think step by step and solve the problem.<|im_end|>
9
+ <|im_start|>user
10
+ ## Problem Statement
11
+
12
+ Calculate the definite integral:
13
+
14
+ $$
15
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6-\operatorname{tg} x) \sin 2 x}
16
+ $$<|im_end|>
17
+ <|im_start|>assistant
18
+
19
+
20
+ ✅ TRAINED PART (solution):
21
+ <think>
22
+ Okay, so I need to compute the definite integral from π/4 to arccos(1/√26) of dx divided by (6 - tan x) times sin 2x. Hmm, let me start by recalling some integration techniques. The integral has tan x and sin 2x in the denominator. Maybe I can simplify the expression first or use substitution.
23
+
24
+ First, I know that sin 2x is equal to 2 sin x cos x. So, maybe rewriting sin 2x as 2 sin x cos x could help. Let me try that:
25
+
26
+ ∫ [1 / ( (6 - tan x) * 2 sin x cos x ) ] dx
27
+
28
+ Simplifying the denominator, that becomes:
29
+
30
+ 1 / [2 sin x cos x (6 - tan x)] dx
31
+
32
+ But tan x is sin x / cos x, so substituting that in:
33
+
34
+ Denominator: 2 sin x cos x (6 - sin x / cos x )
35
+
36
+ Let me combine the terms in the parenthesis:
37
+
38
+ 6 - sin x / cos x = (6 cos x - sin x)/cos x
39
+
40
+ Therefore, the denominator becomes:
41
+
42
+ 2 sin x cos x * (6 cos x - sin x)/cos x = 2 sin x (6 cos x - sin x)
43
+
44
+ So now, the integral simplifies to:
45
+
46
+ ∫ [1 / (2 sin x (6 cos x - sin x)) ] dx
47
+
48
+ So the integral is now 1/(2 sin x (6 cos x - sin x)) dx. Maybe this is easier to integrate. Let me factor out the 1/2:
49
+
50
+ (1/2) �� [1 / (sin x (6 cos x - sin x)) ] dx
51
+
52
+ Hmm, this seems a bit complicated. Maybe I can use substitution here. Let me think about substitution. Let's let u = 6 cos x - sin x. Then, du/dx = -6 sin x - cos x. Hmm, not sure if that helps. Let's check:
53
+
54
+ If u = 6 cos x - sin x, then du = (-6 sin x - cos x) dx. Hmm, the integral has 1/(sin x u). So, maybe if I can express the integral in terms of du and u. But du has terms -6 sin x - cos x, which isn't directly present in the integral. Maybe this isn't the right substitution.
55
+
56
+ Alternatively, maybe split the fraction into partial fractions. Let me see. The denominator is sin x (6 cos x - sin x). Let me denote t = cos x or t = sin x. Maybe substitution t = sin x or t = cos x.
57
+
58
+ Alternatively, use substitution t = tan x. Let's try that. Let t = tan x. Then, dt/dx = sec²x = 1 + tan²x. So, dx = dt / (1 + t²). Also, sin 2x = 2 tan x / (1 + tan²x) = 2t / (1 + t²). Let me rewrite the integral in terms of t.
59
+
60
+ Original integral:
61
+
62
+ ∫ [1 / ( (6 - t) * (2t / (1 + t²)) ) ] * [dt / (1 + t²)]
63
+
64
+ Wait, let's check. If t = tan x, then when x goes from π/4 to arccos(1/√26), t will go from tan(π/4) = 1 to tan(arccos(1/√26)). Let me compute tan(arccos(1/√26)). Let �� = arccos(1/√26). So cos �� = 1/√26, so sin �� = sqrt(1 - 1/26) = sqrt(25/26) = 5/√26. Therefore, tan �� = sin �� / cos �� = 5. So the upper limit is 5. So substitution t = tan x changes the limits from 1 to 5.
65
+
66
+ Now, let's rewrite the integral. The integrand is 1 / [ (6 - tan x) sin 2x ] dx.
67
+
68
+ Expressing in terms of t:
69
+
70
+ 1 / [ (6 - t) * (2t / (1 + t²)) ] * (dt / (1 + t²))
71
+
72
+ Let me compute that step by step. First, sin 2x is 2t / (1 + t²). So, the denominator becomes (6 - t) * 2t / (1 + t²). Then, dx is dt / (1 + t²). So, multiplying all together:
73
+
74
+ Integral becomes �� [ (1 + t²) / (2t (6 - t)) ] * [ 1 / (1 + t²) ] dt from t=1 to t=5.
75
+
76
+ Simplify: The (1 + t²) cancels out. So, we have �� [1 / (2t (6 - t)) ] dt from 1 to 5.
77
+
78
+ So, the integral simplifies to (1/2) �� [1 / (t (6 - t)) ] dt from 1 to 5.
79
+
80
+ That's a much simpler integral. Now, let's compute �� [1 / (t (6 - t)) ] dt. We can use partial fractions here.
81
+
82
+ Express 1 / [ t (6 - t) ] as A/t + B/(6 - t). Let's find A and B.
83
+
84
+ 1 = A(6 - t) + B t
85
+
86
+ Let t = 0: 1 = 6A => A = 1/6
87
+
88
+ Let t = 6: 1 = 6B => B = 1/6
89
+
90
+ Therefore, 1 / [ t (6 - t) ] = (1/6)/t + (1/6)/(6 - t)
91
+
92
+ Therefore, the integral becomes:
93
+
94
+ (1/2) �� [ (1/6)/t + (1/6)/(6 - t) ] dt from 1 to 5
95
+
96
+ Factor out 1/6:
97
+
98
+ (1/2)(1/6) �� [1/t + 1/(6 - t) ] dt = (1/12) �� [1/t + 1/(6 - t) ] dt
99
+
100
+ Integrate term by term:
101
+
102
+ ∫1/t dt = ln |t| + C
103
+
104
+ ∫1/(6 - t) dt = -ln |6 - t| + C
105
+
106
+ Therefore, the integral becomes:
107
+
108
+ (1/12) [ ln |t| - ln |6 - t| ] evaluated from 1 to 5.
109
+
110
+ So, evaluating from 1 to 5:
111
+
112
+ (1/12) [ (ln 5 - ln (6 - 5)) - (ln 1 - ln (6 - 1)) ]
113
+
114
+ Simplify:
115
+
116
+ First, at upper limit 5: ln 5 - ln 1 = ln 5 (since 6 - 5 = 1, ln 1 = 0)
117
+
118
+ At lower limit 1: ln 1 - ln 5 = 0 - ln 5 = -ln 5
119
+
120
+ Therefore, the expression becomes:
121
+
122
+ (1/12) [ (ln 5 - 0) - (0 - ln 5) ] = (1/12)(ln 5 + ln 5) = (1/12)(2 ln 5) = (1/6) ln 5
123
+
124
+ Therefore, the value of the integral is (1/6) ln 5. Let me check the steps again to ensure I didn't make any mistakes.
125
+
126
+ First substitution: t = tan x. That changed variables correctly. The limits from π/4 to arccos(1/√26) became 1 to 5. Then, substitution led to integral in terms of t, simplified to 1/(2t(6 - t)) dt. Partial fractions correctly done, coefficients A and B were both 1/6. Integrated to (1/12)(ln t - ln(6 - t)). Evaluated from 1 to 5, giving (1/12)[ln5 - 0 - (0 - ln5)] = (1/12)(2 ln5) = (1/6) ln5. Seems correct.
127
+
128
+ Alternatively, let me check with another substitution. Suppose instead of t = tan x, maybe another substitution. Let me see.
129
+
130
+ Original integral after simplifying sin2x:
131
+
132
+ (1/2) �� [1/(sinx (6 cosx - sinx))] dx. Let me try substitution u = 6 cosx - sinx.
133
+
134
+ Compute du/dx = -6 sinx - cosx. Hmm, not directly present in the integrand. But the integrand is 1/(sinx * u). Let me see:
135
+
136
+ If I write the integrand as 1/(sinx * u). If I have du = (-6 sinx - cosx) dx. Let me rearrange terms:
137
+
138
+ du = - sinx (6 + cotx) dx. Hmm, not sure. Alternatively, express the integrand in terms of u and du. Let me see:
139
+
140
+ But this seems more complicated. The previous substitution t = tan x worked smoothly, leading to a standard partial fraction integral. So I think the answer is (1/6) ln5. Therefore, the definite integral is (ln5)/6.
141
+
142
+ **Final Answer**
143
+ \boxed{\dfrac{\ln 5}{6}}
144
+ </think>
145
+
146
+ To calculate the definite integral:
147
+
148
+ $$
149
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6 - \tan x) \sin 2 x}
150
+ $$
151
+
152
+ we start by rewriting \(\sin 2x\) as \(2 \sin x \cos x\). This transforms the integral into:
153
+
154
+ $$
155
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{1}{(6 - \tan x) \cdot 2 \sin x \cos x} \, dx
156
+ $$
157
+
158
+ Simplifying the denominator using \(\tan x = \frac{\sin x}{\cos x}\), we get:
159
+
160
+ $$
161
+ \frac{1}{2 \sin x \cos x \left(6 - \frac{\sin x}{\cos x}\right)} = \frac{1}{2 \sin x (6 \cos x - \sin x)}
162
+ $$
163
+
164
+ Next, we use the substitution \(t = \tan x\), which gives \(dt = \sec^2 x \, dx\) or \(dx = \frac{dt}{1 + t^2}\). The limits of integration change from \(x = \pi/4\) (where \(t = 1\)) to \(x = \arccos(1/\sqrt{26})\) (where \(t = 5\)). Rewriting \(\sin 2x\) as \(\frac{2t}{1 + t^2}\), the integral becomes:
165
+
166
+ $$
167
+ \int_{1}^{5} \frac{1}{(6 - t) \cdot \frac{2t}{1 + t^2}} \cdot \frac{dt}{1 + t^2}
168
+ $$
169
+
170
+ Simplifying, we get:
171
+
172
+ $$
173
+ \frac{1}{2} \int_{1}^{5} \frac{1}{t(6 - t)} \, dt
174
+ $$
175
+
176
+ Using partial fractions, we decompose \(\frac{1}{t(6 - t)}\) into \(\frac{1}{6t} + \frac{1}{6(6 - t)}\). The integral then becomes:
177
+
178
+ $$
179
+ \frac{1}{12} \int_{1}^{5} \left(\frac{1}{t} + \frac{1}{6 - t}\right) \, dt
180
+ $$
181
+
182
+ Integrating term by term, we get:
183
+
184
+ $$
185
+ \frac{1}{12} \left[ \ln |t| - \ln |6 - t| \right]_{1}^{5}
186
+ $$
187
+
188
+ Evaluating this from 1 to 5:
189
+
190
+ $$
191
+ \frac{1}{12} \left[ (\ln 5 - \ln 1) - (\ln 1 - \ln 5) \right] = \frac{1}{12} (2 \ln 5) = \frac{1}{6} \ln 5
192
+ $$
193
+
194
+ Thus, the value of the integral is:
195
+
196
+ $$
197
+ \boxed{\dfrac{\ln 5}{6}}
198
+ ```<|endoftext|>
199
+
200
+ 📊 Stats: 80 masked, 2671 trained
201
+ [2025-11-14 10:36:48,937] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
202
+  [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
203
+  [WARNING]  async_io: please install the libaio-devel package with yum
204
+  [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
205
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
206
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3
207
+  [WARNING]  using untested triton version (2.3.0), only 1.0.0 is known to be compatible
208
+ [2025-11-14 10:36:49,925] [INFO] [comm.py:637:init_distributed] cdb=None
209
+ [2025-11-14 10:36:49,926] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
210
+
211
+ ============================================================
212
+ 🎓 ENTROPY WEIGHTING ENABLED
213
+ ============================================================
214
+ Teacher Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
215
+ Alpha (α): 2.0
216
+ Beta (β): 0.3
217
+ Teacher dtype: bfloat16
218
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
219
+ ============================================================
220
+ `torch_dtype` is deprecated! Use `dtype` instead!
221
+ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
222
+
223
+ 🎓 Loading teacher model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
224
+ 📊 Entropy weighting params: α=2.0, β=0.3
225
+ 💾 Entropy computation: top-k=64 (memory-efficient mode)
226
+ `torch_dtype` is deprecated! Use `dtype` instead!
227
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.15s/it]
228
+ ✅ Teacher model loaded and frozen
229
+
230
+ 🔍 Checking teacher tokenizer/model alignment...
231
+ 📌 Teacher tokenizer vocab: 151665
232
+ 📌 Teacher model vocab: 152064
233
+ ⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...
234
+ ✅ Teacher embeddings resized to 151665
235
+
236
+ 📊 Student model vocab size: 151936
237
+ 📊 Teacher model vocab size (after alignment): 151665
238
+
239
+ ⚠️ Student/Teacher vocab size mismatch detected!
240
+ Teacher: 151665
241
+ Student: 151936
242
+ 🔧 Resizing student embeddings to match teacher...
243
+ ✅ Student embeddings resized to 151665
244
+ New student vocab size: 151665
245
+
246
+ ============================================================
247
+ 🎯 Final Vocab Alignment Complete
248
+ 📊 Teacher vocab size: 151665
249
+ 📊 Student vocab size: 151665
250
+ ============================================================
251
+
252
+ 🏋️ Starting training...
253
+ 📊 Total training steps: 5666
254
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
255
+ Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
256
+ 1%| | 25/2834 [04:20<8:16:48, 10.61s/it]
257
+ {'loss': 2.5676, 'grad_norm': 10.754595756530762, 'learning_rate': 2.3255813953488374e-07, 'avg_weight': 0.3920933440327644, 'epoch': 0.0}
258
+ {'loss': 2.4953, 'grad_norm': 10.065147399902344, 'learning_rate': 5.232558139534884e-07, 'avg_weight': 0.389410637319088, 'epoch': 0.0}
259
+ {'loss': 2.5079, 'grad_norm': 8.899886131286621, 'learning_rate': 8.139534883720931e-07, 'avg_weight': 0.38864233046770097, 'epoch': 0.01}
260
+ {'loss': 2.4271, 'grad_norm': 8.81152629852295, 'learning_rate': 1.1046511627906977e-06, 'avg_weight': 0.38897048830986025, 'epoch': 0.01}
261
+ {'loss': 2.4171, 'grad_norm': 7.3060808181762695, 'learning_rate': 1.3953488372093025e-06, 'avg_weight': 0.380949105322361, 'epoch': 0.01}
262
+ 33%|███▎ | 38/115 [01:04<02:29, 1.95s/it]
train/wandb/run-20251114_103643-cvm4116u/run-cvm4116u.wandb ADDED
Binary file (98.3 kB). View file
 
train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-metadata.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
3
+ "python": "CPython 3.10.19",
4
+ "startedAt": "2025-11-14T02:36:44.347486Z",
5
+ "args": [
6
+ "--model_name",
7
+ "Qwen/Qwen2.5-1.5B",
8
+ "--dataset_path",
9
+ "./datasets/openr1/Openr1-Math-46k-8192.jsonl",
10
+ "--output_dir",
11
+ "./model_sft_save/Qwen2.5-1.5B-Entropy-solution",
12
+ "--batch_size",
13
+ "2",
14
+ "--grad_accum",
15
+ "4",
16
+ "--learning_rate",
17
+ "5e-6",
18
+ "--epochs",
19
+ "1",
20
+ "--use_entropy_weighting",
21
+ "--teacher_model_path",
22
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
23
+ "--entropy_weight_alpha",
24
+ "2.0",
25
+ "--entropy_weight_beta",
26
+ "0.3",
27
+ "--teacher_dtype",
28
+ "bfloat16",
29
+ "--use_deepspeed",
30
+ "--deepspeed_config",
31
+ "deepspeed/dp_stage2.json",
32
+ "--use_wandb",
33
+ "--wandb_project",
34
+ "qwen-math-entropy-sft",
35
+ "--wandb_run_name",
36
+ "qwen2.5-1.5b-46k-entropy-solution"
37
+ ],
38
+ "program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py",
39
+ "codePath": "train_qwen_46k_weight.py",
40
+ "codePathLocal": "train_qwen_46k_weight.py",
41
+ "email": "yaning1001@gmail.com",
42
+ "root": "/public/home/lshi/yoAI/projects/Online_CL/train",
43
+ "host": "gpu-h100-07",
44
+ "executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
45
+ "cpu_count": 96,
46
+ "cpu_count_logical": 96,
47
+ "gpu": "NVIDIA H100 80GB HBM3",
48
+ "gpu_count": 6,
49
+ "disk": {
50
+ "/": {
51
+ "total": "469407801344",
52
+ "used": "288221097984"
53
+ }
54
+ },
55
+ "memory": {
56
+ "total": "2164142350336"
57
+ },
58
+ "gpu_nvidia": [
59
+ {
60
+ "name": "NVIDIA H100 80GB HBM3",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper",
64
+ "uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
65
+ },
66
+ {
67
+ "name": "NVIDIA H100 80GB HBM3",
68
+ "memoryTotal": "85520809984",
69
+ "cudaCores": 16896,
70
+ "architecture": "Hopper",
71
+ "uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
72
+ },
73
+ {
74
+ "name": "NVIDIA H100 80GB HBM3",
75
+ "memoryTotal": "85520809984",
76
+ "cudaCores": 16896,
77
+ "architecture": "Hopper",
78
+ "uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
79
+ },
80
+ {
81
+ "name": "NVIDIA H100 80GB HBM3",
82
+ "memoryTotal": "85520809984",
83
+ "cudaCores": 16896,
84
+ "architecture": "Hopper",
85
+ "uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
86
+ },
87
+ {
88
+ "name": "NVIDIA H100 80GB HBM3",
89
+ "memoryTotal": "85520809984",
90
+ "cudaCores": 16896,
91
+ "architecture": "Hopper",
92
+ "uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
93
+ },
94
+ {
95
+ "name": "NVIDIA H100 80GB HBM3",
96
+ "memoryTotal": "85520809984",
97
+ "cudaCores": 16896,
98
+ "architecture": "Hopper",
99
+ "uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
100
+ }
101
+ ],
102
+ "cudaVersion": "12.4",
103
+ "slurm": {
104
+ "cluster_name": "cluster_admin1",
105
+ "conf": "/opt/gridview/slurm/etc/slurm.conf",
106
+ "cpu_bind": "quiet,mask_cpu:0x000000000000000000000001",
107
+ "cpu_bind_list": "0x000000000000000000000001",
108
+ "cpu_bind_type": "mask_cpu:",
109
+ "cpu_bind_verbose": "quiet",
110
+ "cpus_on_node": "1",
111
+ "distribution": "cyclic",
112
+ "gtids": "0",
113
+ "home": "/opt/gridview/slurm",
114
+ "job_account": "seu_qli",
115
+ "job_cpus_per_node": "1",
116
+ "job_gid": "2026",
117
+ "job_id": "8428",
118
+ "job_name": "bash",
119
+ "job_nodelist": "gpu-h100-07",
120
+ "job_num_nodes": "1",
121
+ "job_partition": "H100",
122
+ "job_qos": "normal",
123
+ "job_uid": "2019",
124
+ "job_user": "lshi",
125
+ "jobid": "8428",
126
+ "launch_node_ipaddr": "172.16.254.194",
127
+ "localid": "0",
128
+ "nnodes": "1",
129
+ "nodeid": "0",
130
+ "nodelist": "gpu-h100-07",
131
+ "nprocs": "1",
132
+ "ntasks": "1",
133
+ "pmix_direct_conn": "true",
134
+ "pmix_direct_conn_early": "false",
135
+ "pmix_direct_conn_ucx": "false",
136
+ "pmix_timeout": "3000",
137
+ "prio_process": "0",
138
+ "procid": "0",
139
+ "pty_port": "43139",
140
+ "pty_win_col": "146",
141
+ "pty_win_row": "21",
142
+ "srun_comm_host": "172.16.254.194",
143
+ "srun_comm_port": "34989",
144
+ "step_gpus": "0,1,2,3,4,5",
145
+ "step_id": "0",
146
+ "step_launcher_port": "34989",
147
+ "step_nodelist": "gpu-h100-07",
148
+ "step_num_nodes": "1",
149
+ "step_num_tasks": "1",
150
+ "step_tasks_per_node": "1",
151
+ "stepid": "0",
152
+ "submit_dir": "/public/home/lshi/yoAI/projects",
153
+ "submit_host": "admin1",
154
+ "task_pid": "649671",
155
+ "tasks_per_node": "1",
156
+ "topology_addr": "gpu-h100-07",
157
+ "topology_addr_pattern": "node",
158
+ "umask": "0022",
159
+ "working_cluster": "cluster_admin1:172.16.254.194:6817:9216:101"
160
+ },
161
+ "writerId": "by9i30haenoy72kbuz6wpiu3fbxqgmd9"
162
+ }
train/wandb/run-20251114_103644-c9m2ofd0/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":382},"_runtime":382}
train/wandb/run-20251114_145219-w9xre5r3/files/config.yaml ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: Qwen/Qwen2.5-1.5B
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.22.3
6
+ e:
7
+ 4ns8zmo5ar2v4v1bdwhe5waa6417g92a:
8
+ args:
9
+ - --model_name
10
+ - Qwen/Qwen2.5-1.5B
11
+ - --dataset_path
12
+ - ./datasets/openr1/Openr1-Math-46k-8192.jsonl
13
+ - --output_dir
14
+ - ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
15
+ - --batch_size
16
+ - "2"
17
+ - --grad_accum
18
+ - "4"
19
+ - --learning_rate
20
+ - "5e-6"
21
+ - --epochs
22
+ - "1"
23
+ - --use_entropy_weighting
24
+ - --teacher_model_path
25
+ - deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
26
+ - --entropy_weight_alpha
27
+ - "2.0"
28
+ - --entropy_weight_beta
29
+ - "0.3"
30
+ - --teacher_dtype
31
+ - bfloat16
32
+ - --entropy_top_k
33
+ - "48"
34
+ - --teacher_device_ids
35
+ - "2"
36
+ - --use_deepspeed
37
+ - --deepspeed_config
38
+ - deepspeed/dp_stage2.json
39
+ - --use_wandb
40
+ - --wandb_project
41
+ - qwen-math-entropy-sft
42
+ - --wandb_run_name
43
+ - qwen2.5-1.5b-46k-entropy-solution
44
+ codePath: train_qwen_46k_weight.py
45
+ codePathLocal: train_qwen_46k_weight.py
46
+ cpu_count: 96
47
+ cpu_count_logical: 96
48
+ cudaVersion: "12.4"
49
+ disk:
50
+ /:
51
+ total: "469407801344"
52
+ used: "288248733696"
53
+ email: yaning1001@gmail.com
54
+ executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
55
+ gpu: NVIDIA H100 80GB HBM3
56
+ gpu_count: 8
57
+ gpu_nvidia:
58
+ - architecture: Hopper
59
+ cudaCores: 16896
60
+ memoryTotal: "85520809984"
61
+ name: NVIDIA H100 80GB HBM3
62
+ uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
63
+ - architecture: Hopper
64
+ cudaCores: 16896
65
+ memoryTotal: "85520809984"
66
+ name: NVIDIA H100 80GB HBM3
67
+ uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
68
+ - architecture: Hopper
69
+ cudaCores: 16896
70
+ memoryTotal: "85520809984"
71
+ name: NVIDIA H100 80GB HBM3
72
+ uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
73
+ - architecture: Hopper
74
+ cudaCores: 16896
75
+ memoryTotal: "85520809984"
76
+ name: NVIDIA H100 80GB HBM3
77
+ uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
78
+ - architecture: Hopper
79
+ cudaCores: 16896
80
+ memoryTotal: "85520809984"
81
+ name: NVIDIA H100 80GB HBM3
82
+ uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
83
+ - architecture: Hopper
84
+ cudaCores: 16896
85
+ memoryTotal: "85520809984"
86
+ name: NVIDIA H100 80GB HBM3
87
+ uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
88
+ - architecture: Hopper
89
+ cudaCores: 16896
90
+ memoryTotal: "85520809984"
91
+ name: NVIDIA H100 80GB HBM3
92
+ uuid: GPU-23628f74-fede-6431-ae15-2764fce29130
93
+ - architecture: Hopper
94
+ cudaCores: 16896
95
+ memoryTotal: "85520809984"
96
+ name: NVIDIA H100 80GB HBM3
97
+ uuid: GPU-d18d570f-dd0f-0ff6-3401-561c9e799136
98
+ host: gpu-h100-07
99
+ memory:
100
+ total: "2164142350336"
101
+ os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
102
+ program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
103
+ python: CPython 3.10.19
104
+ root: /public/home/lshi/yoAI/projects/Online_CL/train
105
+ slurm:
106
+ home: /opt/gridview/slurm
107
+ pmix_direct_conn: "true"
108
+ pmix_direct_conn_early: "false"
109
+ pmix_direct_conn_ucx: "false"
110
+ pmix_timeout: "3000"
111
+ startedAt: "2025-11-14T06:52:19.650156Z"
112
+ writerId: 4ns8zmo5ar2v4v1bdwhe5waa6417g92a
113
+ m:
114
+ - "1": train/global_step
115
+ "6":
116
+ - 3
117
+ "7": []
118
+ - "2": '*'
119
+ "5": 1
120
+ "6":
121
+ - 1
122
+ "7": []
123
+ python_version: 3.10.19
124
+ t:
125
+ "1":
126
+ - 1
127
+ - 11
128
+ - 41
129
+ - 49
130
+ - 51
131
+ - 71
132
+ - 84
133
+ - 98
134
+ - 105
135
+ "2":
136
+ - 1
137
+ - 11
138
+ - 41
139
+ - 49
140
+ - 51
141
+ - 71
142
+ - 84
143
+ - 98
144
+ - 105
145
+ "3":
146
+ - 7
147
+ - 13
148
+ - 19
149
+ - 66
150
+ "4": 3.10.19
151
+ "5": 0.22.3
152
+ "6": 4.57.1
153
+ "9":
154
+ "1": transformers_trainer
155
+ "10":
156
+ - 20
157
+ "12": 0.22.3
158
+ "13": linux-x86_64
159
+ accelerator_config:
160
+ value:
161
+ dispatch_batches: null
162
+ even_batches: true
163
+ gradient_accumulation_kwargs: null
164
+ non_blocking: false
165
+ split_batches: false
166
+ use_seedable_sampler: true
167
+ activation_offloading:
168
+ value: false
169
+ adafactor:
170
+ value: false
171
+ adam_beta1:
172
+ value: 0.9
173
+ adam_beta2:
174
+ value: 0.999
175
+ adam_epsilon:
176
+ value: 1e-08
177
+ add_cross_attention:
178
+ value: false
179
+ architectures:
180
+ value:
181
+ - Qwen2ForCausalLM
182
+ assistant_only_loss:
183
+ value: false
184
+ attention_dropout:
185
+ value: 0
186
+ auto_find_batch_size:
187
+ value: false
188
+ average_tokens_across_devices:
189
+ value: true
190
+ bad_words_ids:
191
+ value: null
192
+ batch_eval_metrics:
193
+ value: false
194
+ begin_suppress_tokens:
195
+ value: null
196
+ bf16:
197
+ value: true
198
+ bf16_full_eval:
199
+ value: false
200
+ bos_token_id:
201
+ value: null
202
+ chat_template_path:
203
+ value: null
204
+ chunk_size_feed_forward:
205
+ value: 0
206
+ completion_only_loss:
207
+ value: null
208
+ cross_attention_hidden_size:
209
+ value: null
210
+ data_seed:
211
+ value: null
212
+ dataloader_drop_last:
213
+ value: false
214
+ dataloader_num_workers:
215
+ value: 0
216
+ dataloader_persistent_workers:
217
+ value: false
218
+ dataloader_pin_memory:
219
+ value: true
220
+ dataloader_prefetch_factor:
221
+ value: null
222
+ dataset_kwargs:
223
+ value: null
224
+ dataset_num_proc:
225
+ value: null
226
+ dataset_text_field:
227
+ value: null
228
+ ddp_backend:
229
+ value: null
230
+ ddp_broadcast_buffers:
231
+ value: null
232
+ ddp_bucket_cap_mb:
233
+ value: null
234
+ ddp_find_unused_parameters:
235
+ value: null
236
+ ddp_timeout:
237
+ value: 1800
238
+ debug:
239
+ value: []
240
+ decoder_start_token_id:
241
+ value: null
242
+ deepspeed:
243
+ value: deepspeed/dp_stage2.json
244
+ disable_tqdm:
245
+ value: false
246
+ diversity_penalty:
247
+ value: 0
248
+ do_eval:
249
+ value: true
250
+ do_predict:
251
+ value: false
252
+ do_sample:
253
+ value: false
254
+ do_train:
255
+ value: false
256
+ dtype:
257
+ value: bfloat16
258
+ early_stopping:
259
+ value: false
260
+ encoder_no_repeat_ngram_size:
261
+ value: 0
262
+ entropy_top_k:
263
+ value: 48
264
+ entropy_weight_alpha:
265
+ value: 2
266
+ entropy_weight_beta:
267
+ value: 0.3
268
+ eos_token:
269
+ value: <EOS_TOKEN>
270
+ eos_token_id:
271
+ value: 151643
272
+ eval_accumulation_steps:
273
+ value: null
274
+ eval_delay:
275
+ value: 0
276
+ eval_do_concat_batches:
277
+ value: true
278
+ eval_on_start:
279
+ value: false
280
+ eval_packing:
281
+ value: null
282
+ eval_steps:
283
+ value: 25
284
+ eval_strategy:
285
+ value: steps
286
+ eval_use_gather_object:
287
+ value: false
288
+ exponential_decay_length_penalty:
289
+ value: null
290
+ finetuning_task:
291
+ value: null
292
+ forced_bos_token_id:
293
+ value: null
294
+ forced_eos_token_id:
295
+ value: null
296
+ fp16:
297
+ value: false
298
+ fp16_backend:
299
+ value: auto
300
+ fp16_full_eval:
301
+ value: false
302
+ fp16_opt_level:
303
+ value: O1
304
+ fsdp:
305
+ value: []
306
+ fsdp_config:
307
+ value:
308
+ min_num_params: 0
309
+ xla: false
310
+ xla_fsdp_grad_ckpt: false
311
+ xla_fsdp_v2: false
312
+ fsdp_min_num_params:
313
+ value: 0
314
+ fsdp_transformer_layer_cls_to_wrap:
315
+ value: null
316
+ full_determinism:
317
+ value: false
318
+ gradient_accumulation_steps:
319
+ value: 4
320
+ gradient_checkpointing:
321
+ value: true
322
+ gradient_checkpointing_kwargs:
323
+ value:
324
+ use_reentrant: false
325
+ greater_is_better:
326
+ value: false
327
+ group_by_length:
328
+ value: false
329
+ half_precision_backend:
330
+ value: auto
331
+ hidden_act:
332
+ value: silu
333
+ hidden_size:
334
+ value: 1536
335
+ hub_always_push:
336
+ value: false
337
+ hub_model_id:
338
+ value: null
339
+ hub_private_repo:
340
+ value: null
341
+ hub_revision:
342
+ value: null
343
+ hub_strategy:
344
+ value: every_save
345
+ hub_token:
346
+ value: <HUB_TOKEN>
347
+ id2label:
348
+ value:
349
+ "0": LABEL_0
350
+ "1": LABEL_1
351
+ ignore_data_skip:
352
+ value: false
353
+ include_for_metrics:
354
+ value: []
355
+ include_inputs_for_metrics:
356
+ value: false
357
+ include_num_input_tokens_seen:
358
+ value: "no"
359
+ include_tokens_per_second:
360
+ value: false
361
+ initializer_range:
362
+ value: 0.02
363
+ intermediate_size:
364
+ value: 8960
365
+ is_decoder:
366
+ value: false
367
+ is_encoder_decoder:
368
+ value: false
369
+ jit_mode_eval:
370
+ value: false
371
+ label_names:
372
+ value: null
373
+ label_smoothing_factor:
374
+ value: 0
375
+ label2id:
376
+ value:
377
+ LABEL_0: 0
378
+ LABEL_1: 1
379
+ layer_types:
380
+ value:
381
+ - full_attention
382
+ - full_attention
383
+ - full_attention
384
+ - full_attention
385
+ - full_attention
386
+ - full_attention
387
+ - full_attention
388
+ - full_attention
389
+ - full_attention
390
+ - full_attention
391
+ - full_attention
392
+ - full_attention
393
+ - full_attention
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ learning_rate:
410
+ value: 5e-06
411
+ length_column_name:
412
+ value: length
413
+ length_penalty:
414
+ value: 1
415
+ liger_kernel_config:
416
+ value: null
417
+ load_best_model_at_end:
418
+ value: true
419
+ local_rank:
420
+ value: 0
421
+ log_level:
422
+ value: passive
423
+ log_level_replica:
424
+ value: warning
425
+ log_on_each_node:
426
+ value: true
427
+ logging_dir:
428
+ value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_14-52-26_gpu-h100-07
429
+ logging_first_step:
430
+ value: false
431
+ logging_nan_inf_filter:
432
+ value: true
433
+ logging_steps:
434
+ value: 5
435
+ logging_strategy:
436
+ value: steps
437
+ loss_type:
438
+ value: nll
439
+ lr_scheduler_type:
440
+ value: cosine
441
+ max_grad_norm:
442
+ value: 1
443
+ max_length:
444
+ value: 8192
445
+ max_position_embeddings:
446
+ value: 131072
447
+ max_steps:
448
+ value: -1
449
+ max_window_layers:
450
+ value: 28
451
+ metric_for_best_model:
452
+ value: eval_loss
453
+ min_length:
454
+ value: 0
455
+ model/num_parameters:
456
+ value: 1543298048
457
+ model_init_kwargs:
458
+ value: null
459
+ model_type:
460
+ value: qwen2
461
+ mp_parameters:
462
+ value: ""
463
+ neftune_noise_alpha:
464
+ value: null
465
+ no_cuda:
466
+ value: false
467
+ no_repeat_ngram_size:
468
+ value: 0
469
+ num_attention_heads:
470
+ value: 12
471
+ num_beam_groups:
472
+ value: 1
473
+ num_beams:
474
+ value: 1
475
+ num_hidden_layers:
476
+ value: 28
477
+ num_key_value_heads:
478
+ value: 2
479
+ num_return_sequences:
480
+ value: 1
481
+ num_train_epochs:
482
+ value: 1
483
+ optim:
484
+ value: adamw_torch
485
+ optim_args:
486
+ value: null
487
+ optim_target_modules:
488
+ value: null
489
+ output_attentions:
490
+ value: false
491
+ output_dir:
492
+ value: ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
493
+ output_hidden_states:
494
+ value: false
495
+ output_scores:
496
+ value: false
497
+ overwrite_output_dir:
498
+ value: false
499
+ packing:
500
+ value: false
501
+ packing_strategy:
502
+ value: bfd
503
+ pad_to_multiple_of:
504
+ value: null
505
+ pad_token:
506
+ value: <PAD_TOKEN>
507
+ pad_token_id:
508
+ value: 151643
509
+ padding_free:
510
+ value: false
511
+ parallelism_config:
512
+ value: null
513
+ past_index:
514
+ value: -1
515
+ per_device_eval_batch_size:
516
+ value: 2
517
+ per_device_train_batch_size:
518
+ value: 2
519
+ per_gpu_eval_batch_size:
520
+ value: null
521
+ per_gpu_train_batch_size:
522
+ value: null
523
+ prediction_loss_only:
524
+ value: false
525
+ prefix:
526
+ value: null
527
+ problem_type:
528
+ value: null
529
+ project:
530
+ value: huggingface
531
+ push_to_hub:
532
+ value: false
533
+ push_to_hub_model_id:
534
+ value: null
535
+ push_to_hub_organization:
536
+ value: null
537
+ push_to_hub_token:
538
+ value: <PUSH_TO_HUB_TOKEN>
539
+ ray_scope:
540
+ value: last
541
+ remove_invalid_values:
542
+ value: false
543
+ remove_unused_columns:
544
+ value: true
545
+ repetition_penalty:
546
+ value: 1
547
+ report_to:
548
+ value:
549
+ - wandb
550
+ restore_callback_states_from_checkpoint:
551
+ value: false
552
+ resume_from_checkpoint:
553
+ value: null
554
+ return_dict:
555
+ value: true
556
+ return_dict_in_generate:
557
+ value: false
558
+ rms_norm_eps:
559
+ value: 1e-06
560
+ rope_scaling:
561
+ value: null
562
+ rope_theta:
563
+ value: 1e+06
564
+ run_name:
565
+ value: qwen2.5-1.5b-46k-entropy-solution
566
+ save_on_each_node:
567
+ value: false
568
+ save_only_model:
569
+ value: false
570
+ save_safetensors:
571
+ value: true
572
+ save_steps:
573
+ value: 50
574
+ save_strategy:
575
+ value: steps
576
+ save_total_limit:
577
+ value: 2
578
+ seed:
579
+ value: 42
580
+ sep_token_id:
581
+ value: null
582
+ skip_memory_metrics:
583
+ value: true
584
+ sliding_window:
585
+ value: null
586
+ suppress_tokens:
587
+ value: null
588
+ task_specific_params:
589
+ value: null
590
+ teacher_device_ids:
591
+ value: "2"
592
+ teacher_dtype:
593
+ value: bfloat16
594
+ teacher_model_path:
595
+ value: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
596
+ temperature:
597
+ value: 1
598
+ tf_legacy_loss:
599
+ value: false
600
+ tf32:
601
+ value: null
602
+ tie_encoder_decoder:
603
+ value: false
604
+ tie_word_embeddings:
605
+ value: true
606
+ tokenizer_class:
607
+ value: null
608
+ top_k:
609
+ value: 50
610
+ top_p:
611
+ value: 1
612
+ torch_compile:
613
+ value: false
614
+ torch_compile_backend:
615
+ value: null
616
+ torch_compile_mode:
617
+ value: null
618
+ torch_empty_cache_steps:
619
+ value: null
620
+ torchdynamo:
621
+ value: null
622
+ torchscript:
623
+ value: false
624
+ tpu_metrics_debug:
625
+ value: false
626
+ tpu_num_cores:
627
+ value: null
628
+ trackio_space_id:
629
+ value: trackio
630
+ transformers_version:
631
+ value: 4.57.1
632
+ typical_p:
633
+ value: 1
634
+ use_bfloat16:
635
+ value: false
636
+ use_cache:
637
+ value: true
638
+ use_cpu:
639
+ value: false
640
+ use_entropy_weighting:
641
+ value: true
642
+ use_legacy_prediction_loop:
643
+ value: false
644
+ use_liger_kernel:
645
+ value: false
646
+ use_mps_device:
647
+ value: false
648
+ use_mrope:
649
+ value: false
650
+ use_sliding_window:
651
+ value: false
652
+ vocab_size:
653
+ value: 151665
654
+ warmup_ratio:
655
+ value: 0.03
656
+ warmup_steps:
657
+ value: 0
658
+ weight_decay:
659
+ value: 0.01
train/wandb/run-20251114_145219-w9xre5r3/files/output.log ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 📝 Loading tokenizer...
2
+ 📦 Loading local dataset from: ./datasets/openr1/Openr1-Math-46k-8192.jsonl
3
+ 📊 Train: 45334 | Eval: 458
4
+ 🔄 Formatting dataset...
5
+
6
+ 🔍 MASKED PART (prompt/question):
7
+ <|im_start|>system
8
+ Think step by step and solve the problem.<|im_end|>
9
+ <|im_start|>user
10
+ ## Problem Statement
11
+
12
+ Calculate the definite integral:
13
+
14
+ $$
15
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6-\operatorname{tg} x) \sin 2 x}
16
+ $$<|im_end|>
17
+ <|im_start|>assistant
18
+
19
+
20
+ ✅ TRAINED PART (solution):
21
+ <think>
22
+ Okay, so I need to compute the definite integral from π/4 to arccos(1/√26) of dx divided by (6 - tan x) times sin 2x. Hmm, let me start by recalling some integration techniques. The integral has tan x and sin 2x in the denominator. Maybe I can simplify the expression first or use substitution.
23
+
24
+ First, I know that sin 2x is equal to 2 sin x cos x. So, maybe rewriting sin 2x as 2 sin x cos x could help. Let me try that:
25
+
26
+ ∫ [1 / ( (6 - tan x) * 2 sin x cos x ) ] dx
27
+
28
+ Simplifying the denominator, that becomes:
29
+
30
+ 1 / [2 sin x cos x (6 - tan x)] dx
31
+
32
+ But tan x is sin x / cos x, so substituting that in:
33
+
34
+ Denominator: 2 sin x cos x (6 - sin x / cos x )
35
+
36
+ Let me combine the terms in the parenthesis:
37
+
38
+ 6 - sin x / cos x = (6 cos x - sin x)/cos x
39
+
40
+ Therefore, the denominator becomes:
41
+
42
+ 2 sin x cos x * (6 cos x - sin x)/cos x = 2 sin x (6 cos x - sin x)
43
+
44
+ So now, the integral simplifies to:
45
+
46
+ ∫ [1 / (2 sin x (6 cos x - sin x)) ] dx
47
+
48
+ So the integral is now 1/(2 sin x (6 cos x - sin x)) dx. Maybe this is easier to integrate. Let me factor out the 1/2:
49
+
50
+ (1/2) �� [1 / (sin x (6 cos x - sin x)) ] dx
51
+
52
+ Hmm, this seems a bit complicated. Maybe I can use substitution here. Let me think about substitution. Let's let u = 6 cos x - sin x. Then, du/dx = -6 sin x - cos x. Hmm, not sure if that helps. Let's check:
53
+
54
+ If u = 6 cos x - sin x, then du = (-6 sin x - cos x) dx. Hmm, the integral has 1/(sin x u). So, maybe if I can express the integral in terms of du and u. But du has terms -6 sin x - cos x, which isn't directly present in the integral. Maybe this isn't the right substitution.
55
+
56
+ Alternatively, maybe split the fraction into partial fractions. Let me see. The denominator is sin x (6 cos x - sin x). Let me denote t = cos x or t = sin x. Maybe substitution t = sin x or t = cos x.
57
+
58
+ Alternatively, use substitution t = tan x. Let's try that. Let t = tan x. Then, dt/dx = sec²x = 1 + tan²x. So, dx = dt / (1 + t²). Also, sin 2x = 2 tan x / (1 + tan²x) = 2t / (1 + t²). Let me rewrite the integral in terms of t.
59
+
60
+ Original integral:
61
+
62
+ ∫ [1 / ( (6 - t) * (2t / (1 + t²)) ) ] * [dt / (1 + t²)]
63
+
64
+ Wait, let's check. If t = tan x, then when x goes from π/4 to arccos(1/√26), t will go from tan(π/4) = 1 to tan(arccos(1/√26)). Let me compute tan(arccos(1/√26)). Let �� = arccos(1/√26). So cos �� = 1/√26, so sin �� = sqrt(1 - 1/26) = sqrt(25/26) = 5/√26. Therefore, tan �� = sin �� / cos �� = 5. So the upper limit is 5. So substitution t = tan x changes the limits from 1 to 5.
65
+
66
+ Now, let's rewrite the integral. The integrand is 1 / [ (6 - tan x) sin 2x ] dx.
67
+
68
+ Expressing in terms of t:
69
+
70
+ 1 / [ (6 - t) * (2t / (1 + t²)) ] * (dt / (1 + t²))
71
+
72
+ Let me compute that step by step. First, sin 2x is 2t / (1 + t²). So, the denominator becomes (6 - t) * 2t / (1 + t²). Then, dx is dt / (1 + t²). So, multiplying all together:
73
+
74
+ Integral becomes �� [ (1 + t²) / (2t (6 - t)) ] * [ 1 / (1 + t²) ] dt from t=1 to t=5.
75
+
76
+ Simplify: The (1 + t²) cancels out. So, we have �� [1 / (2t (6 - t)) ] dt from 1 to 5.
77
+
78
+ So, the integral simplifies to (1/2) �� [1 / (t (6 - t)) ] dt from 1 to 5.
79
+
80
+ That's a much simpler integral. Now, let's compute �� [1 / (t (6 - t)) ] dt. We can use partial fractions here.
81
+
82
+ Express 1 / [ t (6 - t) ] as A/t + B/(6 - t). Let's find A and B.
83
+
84
+ 1 = A(6 - t) + B t
85
+
86
+ Let t = 0: 1 = 6A => A = 1/6
87
+
88
+ Let t = 6: 1 = 6B => B = 1/6
89
+
90
+ Therefore, 1 / [ t (6 - t) ] = (1/6)/t + (1/6)/(6 - t)
91
+
92
+ Therefore, the integral becomes:
93
+
94
+ (1/2) �� [ (1/6)/t + (1/6)/(6 - t) ] dt from 1 to 5
95
+
96
+ Factor out 1/6:
97
+
98
+ (1/2)(1/6) �� [1/t + 1/(6 - t) ] dt = (1/12) �� [1/t + 1/(6 - t) ] dt
99
+
100
+ Integrate term by term:
101
+
102
+ ∫1/t dt = ln |t| + C
103
+
104
+ ∫1/(6 - t) dt = -ln |6 - t| + C
105
+
106
+ Therefore, the integral becomes:
107
+
108
+ (1/12) [ ln |t| - ln |6 - t| ] evaluated from 1 to 5.
109
+
110
+ So, evaluating from 1 to 5:
111
+
112
+ (1/12) [ (ln 5 - ln (6 - 5)) - (ln 1 - ln (6 - 1)) ]
113
+
114
+ Simplify:
115
+
116
+ First, at upper limit 5: ln 5 - ln 1 = ln 5 (since 6 - 5 = 1, ln 1 = 0)
117
+
118
+ At lower limit 1: ln 1 - ln 5 = 0 - ln 5 = -ln 5
119
+
120
+ Therefore, the expression becomes:
121
+
122
+ (1/12) [ (ln 5 - 0) - (0 - ln 5) ] = (1/12)(ln 5 + ln 5) = (1/12)(2 ln 5) = (1/6) ln 5
123
+
124
+ Therefore, the value of the integral is (1/6) ln 5. Let me check the steps again to ensure I didn't make any mistakes.
125
+
126
+ First substitution: t = tan x. That changed variables correctly. The limits from π/4 to arccos(1/√26) became 1 to 5. Then, substitution led to integral in terms of t, simplified to 1/(2t(6 - t)) dt. Partial fractions correctly done, coefficients A and B were both 1/6. Integrated to (1/12)(ln t - ln(6 - t)). Evaluated from 1 to 5, giving (1/12)[ln5 - 0 - (0 - ln5)] = (1/12)(2 ln5) = (1/6) ln5. Seems correct.
127
+
128
+ Alternatively, let me check with another substitution. Suppose instead of t = tan x, maybe another substitution. Let me see.
129
+
130
+ Original integral after simplifying sin2x:
131
+
132
+ (1/2) �� [1/(sinx (6 cosx - sinx))] dx. Let me try substitution u = 6 cosx - sinx.
133
+
134
+ Compute du/dx = -6 sinx - cosx. Hmm, not directly present in the integrand. But the integrand is 1/(sinx * u). Let me see:
135
+
136
+ If I write the integrand as 1/(sinx * u). If I have du = (-6 sinx - cosx) dx. Let me rearrange terms:
137
+
138
+ du = - sinx (6 + cotx) dx. Hmm, not sure. Alternatively, express the integrand in terms of u and du. Let me see:
139
+
140
+ But this seems more complicated. The previous substitution t = tan x worked smoothly, leading to a standard partial fraction integral. So I think the answer is (1/6) ln5. Therefore, the definite integral is (ln5)/6.
141
+
142
+ **Final Answer**
143
+ \boxed{\dfrac{\ln 5}{6}}
144
+ </think>
145
+
146
+ To calculate the definite integral:
147
+
148
+ $$
149
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{d x}{(6 - \tan x) \sin 2 x}
150
+ $$
151
+
152
+ we start by rewriting \(\sin 2x\) as \(2 \sin x \cos x\). This transforms the integral into:
153
+
154
+ $$
155
+ \int_{\pi / 4}^{\arccos (1 / \sqrt{26})} \frac{1}{(6 - \tan x) \cdot 2 \sin x \cos x} \, dx
156
+ $$
157
+
158
+ Simplifying the denominator using \(\tan x = \frac{\sin x}{\cos x}\), we get:
159
+
160
+ $$
161
+ \frac{1}{2 \sin x \cos x \left(6 - \frac{\sin x}{\cos x}\right)} = \frac{1}{2 \sin x (6 \cos x - \sin x)}
162
+ $$
163
+
164
+ Next, we use the substitution \(t = \tan x\), which gives \(dt = \sec^2 x \, dx\) or \(dx = \frac{dt}{1 + t^2}\). The limits of integration change from \(x = \pi/4\) (where \(t = 1\)) to \(x = \arccos(1/\sqrt{26})\) (where \(t = 5\)). Rewriting \(\sin 2x\) as \(\frac{2t}{1 + t^2}\), the integral becomes:
165
+
166
+ $$
167
+ \int_{1}^{5} \frac{1}{(6 - t) \cdot \frac{2t}{1 + t^2}} \cdot \frac{dt}{1 + t^2}
168
+ $$
169
+
170
+ Simplifying, we get:
171
+
172
+ $$
173
+ \frac{1}{2} \int_{1}^{5} \frac{1}{t(6 - t)} \, dt
174
+ $$
175
+
176
+ Using partial fractions, we decompose \(\frac{1}{t(6 - t)}\) into \(\frac{1}{6t} + \frac{1}{6(6 - t)}\). The integral then becomes:
177
+
178
+ $$
179
+ \frac{1}{12} \int_{1}^{5} \left(\frac{1}{t} + \frac{1}{6 - t}\right) \, dt
180
+ $$
181
+
182
+ Integrating term by term, we get:
183
+
184
+ $$
185
+ \frac{1}{12} \left[ \ln |t| - \ln |6 - t| \right]_{1}^{5}
186
+ $$
187
+
188
+ Evaluating this from 1 to 5:
189
+
190
+ $$
191
+ \frac{1}{12} \left[ (\ln 5 - \ln 1) - (\ln 1 - \ln 5) \right] = \frac{1}{12} (2 \ln 5) = \frac{1}{6} \ln 5
192
+ $$
193
+
194
+ Thus, the value of the integral is:
195
+
196
+ $$
197
+ \boxed{\dfrac{\ln 5}{6}}
198
+ ```<|endoftext|>
199
+
200
+ 📊 Stats: 80 masked, 2671 trained
201
+
202
+ 📥 Loading student model: Qwen/Qwen2.5-1.5B
203
+ `torch_dtype` is deprecated! Use `dtype` instead!
204
+ ✅ Student model loaded: 151936 vocab size
205
+ [2025-11-14 14:52:27,291] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
206
+  [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
207
+  [WARNING]  async_io: please install the libaio-devel package with yum
208
+  [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
209
+  [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
210
+  [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3
211
+  [WARNING]  using untested triton version (2.3.0), only 1.0.0 is known to be compatible
212
+ [2025-11-14 14:52:28,034] [INFO] [comm.py:637:init_distributed] cdb=None
213
+ [2025-11-14 14:52:28,034] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
214
+
215
+ ============================================================
216
+ 🎓 ENTROPY WEIGHTING ENABLED
217
+ ============================================================
218
+ Teacher Model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
219
+ Teacher Device: 2
220
+ Alpha (α): 2.0
221
+ Beta (β): 0.3
222
+ Top-K: 48
223
+ Teacher dtype: bfloat16
224
+ Formula: w_j = exp(-H_t(j)) * sigmoid(α * (|H_s(j) - H_t(j)| - β))
225
+ ============================================================
226
+ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
227
+
228
+ 🎓 Loading teacher model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
229
+ 📊 Entropy weighting params: α=2.0, β=0.3
230
+ 💾 Entropy computation: top-k=48 (memory-efficient mode)
231
+ 🖥️ Teacher target device(s): cuda:2
232
+ 📡 Loading teacher on single GPU: cuda:2
233
+ Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 47.60it/s]
234
+ ✅ Teacher model loaded and frozen
235
+ ✅ Teacher model loaded and frozen
236
+
237
+ 🔍 Checking teacher tokenizer/model alignment...
238
+ 📌 Teacher tokenizer vocab: 151665
239
+ 📌 Teacher model vocab: 152064
240
+ ⚠️ Teacher tokenizer & model vocab mismatch! Resizing teacher embeddings...
241
+ ✅ Teacher embeddings resized to 151665
242
+
243
+ 📊 Student model vocab size: 151936
244
+ 📊 Teacher model vocab size (after alignment): 151665
245
+
246
+ ⚠️ Student/Teacher vocab size mismatch detected!
247
+ Teacher: 151665
248
+ Student: 151936
249
+ 🔧 Resizing student embeddings to match teacher...
250
+ ✅ Student embeddings resized to 151665
251
+ New student vocab size: 151665
252
+
253
+ ============================================================
254
+ 🎯 Final Vocab Alignment Complete
255
+ 📊 Teacher vocab size: 151665
256
+ 📊 Student vocab size: 151665
257
+ ============================================================
258
+
259
+ 🏋️ Starting training...
260
+ 📊 Total training steps: 5666
261
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
262
+ Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 4. Using DeepSpeed's value.
263
+
264
+ {'loss': 2.5591, 'grad_norm': 10.723748207092285, 'learning_rate': 2.3255813953488374e-07, 'avg_weight': 0.3910901114344597, 'epoch': 0.0}
265
+ {'loss': 2.4874, 'grad_norm': 10.057125091552734, 'learning_rate': 5.232558139534884e-07, 'avg_weight': 0.38843219727277756, 'epoch': 0.0}
266
+ {'loss': 2.4994, 'grad_norm': 8.863786697387695, 'learning_rate': 8.139534883720931e-07, 'avg_weight': 0.38760635405778887, 'epoch': 0.01}
267
+ {'loss': 2.4195, 'grad_norm': 8.823384284973145, 'learning_rate': 1.1046511627906977e-06, 'avg_weight': 0.3880971476435661, 'epoch': 0.01}
268
+ {'loss': 2.4098, 'grad_norm': 7.296765327453613, 'learning_rate': 1.3953488372093025e-06, 'avg_weight': 0.3801196217536926, 'epoch': 0.01}
269
+
270
+ {'eval_loss': 0.5676697492599487, 'eval_runtime': 177.1245, 'eval_samples_per_second': 2.586, 'eval_steps_per_second': 0.649, 'epoch': 0.01}
271
+ {'loss': 2.2295, 'grad_norm': 5.418520450592041, 'learning_rate': 1.686046511627907e-06, 'avg_weight': 0.36370994746685026, 'epoch': 0.01}
272
+ {'loss': 2.1721, 'grad_norm': 4.612456798553467, 'learning_rate': 1.976744186046512e-06, 'avg_weight': 0.35689852982759473, 'epoch': 0.01}
273
+ {'loss': 2.0765, 'grad_norm': 4.605122089385986, 'learning_rate': 2.2674418604651163e-06, 'avg_weight': 0.36093987375497816, 'epoch': 0.01}
274
+ {'loss': 2.0677, 'grad_norm': 4.118704795837402, 'learning_rate': 2.558139534883721e-06, 'avg_weight': 0.3577569782733917, 'epoch': 0.02}
275
+ {'loss': 2.1147, 'grad_norm': 3.7384188175201416, 'learning_rate': 2.848837209302326e-06, 'avg_weight': 0.3483647421002388, 'epoch': 0.02}
276
+ {'eval_loss': 0.48641934990882874, 'eval_runtime': 176.9466, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.02}
277
+ {'loss': 1.9462, 'grad_norm': 4.451317310333252, 'learning_rate': 3.1395348837209307e-06, 'avg_weight': 0.3532564118504524, 'epoch': 0.02}
278
+ {'loss': 1.8641, 'grad_norm': 3.378995180130005, 'learning_rate': 3.430232558139535e-06, 'avg_weight': 0.3521373629570007, 'epoch': 0.02}
279
+ {'loss': 1.878, 'grad_norm': 3.9663772583007812, 'learning_rate': 3.72093023255814e-06, 'avg_weight': 0.3486033886671066, 'epoch': 0.02}
280
+ {'loss': 1.8243, 'grad_norm': 3.9792895317077637, 'learning_rate': 4.011627906976744e-06, 'avg_weight': 0.34984882175922394, 'epoch': 0.02}
281
+ {'loss': 1.8915, 'grad_norm': 3.450552463531494, 'learning_rate': 4.302325581395349e-06, 'avg_weight': 0.34732189774513245, 'epoch': 0.03}
282
+ {'eval_loss': 0.44193005561828613, 'eval_runtime': 176.9698, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.03}
283
+ {'loss': 1.6694, 'grad_norm': 3.710488796234131, 'learning_rate': 4.593023255813954e-06, 'avg_weight': 0.3485285028815269, 'epoch': 0.03}
284
+ {'loss': 1.8175, 'grad_norm': 2.9457366466522217, 'learning_rate': 4.883720930232559e-06, 'avg_weight': 0.34327888786792754, 'epoch': 0.03}
285
+ {'loss': 1.799, 'grad_norm': 3.038442611694336, 'learning_rate': 4.999985296579241e-06, 'avg_weight': 0.3441050469875336, 'epoch': 0.03}
286
+ {'loss': 1.6772, 'grad_norm': 3.322579860687256, 'learning_rate': 4.999895442967599e-06, 'avg_weight': 0.34606429785490034, 'epoch': 0.03}
287
+ {'loss': 1.7592, 'grad_norm': 3.061025381088257, 'learning_rate': 4.9997239072437415e-06, 'avg_weight': 0.3396225184202194, 'epoch': 0.04}
288
+ {'eval_loss': 0.4209730625152588, 'eval_runtime': 176.9333, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.04}
289
+ {'loss': 1.663, 'grad_norm': 3.212209463119507, 'learning_rate': 4.999470695012462e-06, 'avg_weight': 0.3401555925607681, 'epoch': 0.04}
290
+ {'loss': 1.6741, 'grad_norm': 2.88710880279541, 'learning_rate': 4.999135814547269e-06, 'avg_weight': 0.3444334402680397, 'epoch': 0.04}
291
+ {'loss': 1.8002, 'grad_norm': 3.0338072776794434, 'learning_rate': 4.99871927679012e-06, 'avg_weight': 0.3414938300848007, 'epoch': 0.04}
292
+ {'loss': 1.6332, 'grad_norm': 2.6287529468536377, 'learning_rate': 4.998221095351058e-06, 'avg_weight': 0.3433367222547531, 'epoch': 0.04}
293
+ {'loss': 1.6309, 'grad_norm': 3.1718740463256836, 'learning_rate': 4.997641286507766e-06, 'avg_weight': 0.3451731190085411, 'epoch': 0.04}
294
+ {'eval_loss': 0.4092359244823456, 'eval_runtime': 176.9574, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.04}
295
+ {'loss': 1.6804, 'grad_norm': 3.1825239658355713, 'learning_rate': 4.996979869205043e-06, 'avg_weight': 0.3412734940648079, 'epoch': 0.05}
296
+ {'loss': 1.6437, 'grad_norm': 3.1620748043060303, 'learning_rate': 4.996236865054177e-06, 'avg_weight': 0.34043743312358854, 'epoch': 0.05}
297
+ {'loss': 1.6659, 'grad_norm': 3.254448175430298, 'learning_rate': 4.995412298332243e-06, 'avg_weight': 0.3406913295388222, 'epoch': 0.05}
298
+ {'loss': 1.6698, 'grad_norm': 3.2264506816864014, 'learning_rate': 4.994506195981309e-06, 'avg_weight': 0.34027899503707887, 'epoch': 0.05}
299
+ {'loss': 1.5531, 'grad_norm': 3.0454981327056885, 'learning_rate': 4.9935185876075525e-06, 'avg_weight': 0.3385377749800682, 'epoch': 0.05}
300
+ {'eval_loss': 0.4014018177986145, 'eval_runtime': 176.9922, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.05}
301
+ {'loss': 1.5782, 'grad_norm': 2.929764747619629, 'learning_rate': 4.992449505480301e-06, 'avg_weight': 0.34045338034629824, 'epoch': 0.05}
302
+ {'loss': 1.5912, 'grad_norm': 3.0769646167755127, 'learning_rate': 4.991298984530968e-06, 'avg_weight': 0.34139142483472823, 'epoch': 0.06}
303
+ {'loss': 1.6423, 'grad_norm': 2.820838689804077, 'learning_rate': 4.9900670623519185e-06, 'avg_weight': 0.3394546613097191, 'epoch': 0.06}
304
+ {'loss': 1.6039, 'grad_norm': 2.8948822021484375, 'learning_rate': 4.98875377919524e-06, 'avg_weight': 0.33894784599542616, 'epoch': 0.06}
305
+ {'loss': 1.5986, 'grad_norm': 2.98633074760437, 'learning_rate': 4.987359177971422e-06, 'avg_weight': 0.3411692440509796, 'epoch': 0.06}
306
+ {'eval_loss': 0.39528319239616394, 'eval_runtime': 176.9244, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.06}
307
+ {'loss': 1.5965, 'grad_norm': 2.802412986755371, 'learning_rate': 4.985883304247961e-06, 'avg_weight': 0.3383798971772194, 'epoch': 0.06}
308
+ {'loss': 1.5762, 'grad_norm': 3.027007818222046, 'learning_rate': 4.984326206247866e-06, 'avg_weight': 0.33840133994817734, 'epoch': 0.07}
309
+ {'loss': 1.5978, 'grad_norm': 2.9903464317321777, 'learning_rate': 4.982687934848086e-06, 'avg_weight': 0.34024504870176314, 'epoch': 0.07}
310
+ {'loss': 1.6444, 'grad_norm': 2.9462625980377197, 'learning_rate': 4.980968543577849e-06, 'avg_weight': 0.33813936412334444, 'epoch': 0.07}
311
+ {'loss': 1.5458, 'grad_norm': 2.9322121143341064, 'learning_rate': 4.979168088616907e-06, 'avg_weight': 0.3389531776309013, 'epoch': 0.07}
312
+ {'eval_loss': 0.39106011390686035, 'eval_runtime': 176.8862, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.07}
313
+ {'loss': 1.5588, 'grad_norm': 3.813068389892578, 'learning_rate': 4.977286628793707e-06, 'avg_weight': 0.3413571178913116, 'epoch': 0.07}
314
+ {'loss': 1.7041, 'grad_norm': 3.0992965698242188, 'learning_rate': 4.975324225583465e-06, 'avg_weight': 0.3345016598701477, 'epoch': 0.07}
315
+ {'loss': 1.5372, 'grad_norm': 2.912554979324341, 'learning_rate': 4.973280943106158e-06, 'avg_weight': 0.33904497176408765, 'epoch': 0.08}
316
+ {'loss': 1.6412, 'grad_norm': 3.341759443283081, 'learning_rate': 4.971156848124429e-06, 'avg_weight': 0.33656598031520846, 'epoch': 0.08}
317
+ {'loss': 1.6438, 'grad_norm': 2.90909481048584, 'learning_rate': 4.968952010041408e-06, 'avg_weight': 0.3322950556874275, 'epoch': 0.08}
318
+ {'eval_loss': 0.38709789514541626, 'eval_runtime': 176.9433, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.08}
319
+ {'loss': 1.5637, 'grad_norm': 3.0614888668060303, 'learning_rate': 4.96666650089844e-06, 'avg_weight': 0.33701644390821456, 'epoch': 0.08}
320
+ {'loss': 1.4937, 'grad_norm': 3.1825013160705566, 'learning_rate': 4.964300395372733e-06, 'avg_weight': 0.33940560817718507, 'epoch': 0.08}
321
+ {'loss': 1.6914, 'grad_norm': 3.442878007888794, 'learning_rate': 4.961853770774921e-06, 'avg_weight': 0.33430094122886655, 'epoch': 0.08}
322
+ {'loss': 1.5223, 'grad_norm': 3.1587722301483154, 'learning_rate': 4.959326707046532e-06, 'avg_weight': 0.33932786285877226, 'epoch': 0.09}
323
+ {'loss': 1.5314, 'grad_norm': 3.011597156524658, 'learning_rate': 4.956719286757381e-06, 'avg_weight': 0.3337151423096657, 'epoch': 0.09}
324
+ {'eval_loss': 0.38383862376213074, 'eval_runtime': 176.8929, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.09}
325
+ {'loss': 1.6525, 'grad_norm': 3.051734447479248, 'learning_rate': 4.9540315951028695e-06, 'avg_weight': 0.336119769513607, 'epoch': 0.09}
326
+ {'loss': 1.6499, 'grad_norm': 3.2344253063201904, 'learning_rate': 4.951263719901203e-06, 'avg_weight': 0.33603269308805467, 'epoch': 0.09}
327
+ {'loss': 1.6005, 'grad_norm': 2.8890116214752197, 'learning_rate': 4.948415751590521e-06, 'avg_weight': 0.3372333973646164, 'epoch': 0.09}
328
+ {'loss': 1.5957, 'grad_norm': 3.270925998687744, 'learning_rate': 4.945487783225942e-06, 'avg_weight': 0.3381495177745819, 'epoch': 0.1}
329
+ {'loss': 1.509, 'grad_norm': 3.0765645503997803, 'learning_rate': 4.9424799104765245e-06, 'avg_weight': 0.33853849917650225, 'epoch': 0.1}
330
+ {'eval_loss': 0.3811159133911133, 'eval_runtime': 176.9156, 'eval_samples_per_second': 2.589, 'eval_steps_per_second': 0.65, 'epoch': 0.1}
331
+ {'loss': 1.5422, 'grad_norm': 2.94732928276062, 'learning_rate': 4.939392231622136e-06, 'avg_weight': 0.3360708475112915, 'epoch': 0.1}
332
+ {'loss': 1.6281, 'grad_norm': 3.1645963191986084, 'learning_rate': 4.9362248475502515e-06, 'avg_weight': 0.3375927582383156, 'epoch': 0.1}
333
+ {'loss': 1.5221, 'grad_norm': 3.1172220706939697, 'learning_rate': 4.932977861752646e-06, 'avg_weight': 0.3341860607266426, 'epoch': 0.1}
334
+ {'loss': 1.5929, 'grad_norm': 2.895547866821289, 'learning_rate': 4.929651380322019e-06, 'avg_weight': 0.3337728187441826, 'epoch': 0.1}
335
+ {'loss': 1.5591, 'grad_norm': 3.267468214035034, 'learning_rate': 4.9262455119485295e-06, 'avg_weight': 0.3368815451860428, 'epoch': 0.11}
336
+ {'eval_loss': 0.3784671425819397, 'eval_runtime': 176.9466, 'eval_samples_per_second': 2.588, 'eval_steps_per_second': 0.65, 'epoch': 0.11}
train/wandb/run-20251114_145219-w9xre5r3/files/requirements.txt ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mpmath==1.3.0
2
+ typing_extensions==4.15.0
3
+ Jinja2==3.1.6
4
+ multidict==6.7.0
5
+ dill==0.3.8
6
+ charset-normalizer==3.4.4
7
+ yarl==1.22.0
8
+ aiosignal==1.4.0
9
+ accelerate==1.11.0
10
+ omegaconf==2.3.0
11
+ msgpack==1.1.2
12
+ codetiming==1.4.0
13
+ click==8.2.1
14
+ referencing==0.37.0
15
+ opentelemetry-api==1.38.0
16
+ opencensus==0.11.4
17
+ sympy==1.14.0
18
+ nvidia-cuda-runtime-cu12==12.1.105
19
+ nvidia-cusparse-cu12==12.1.0.106
20
+ torch==2.3.0+cu121
21
+ PyYAML==6.0.3
22
+ pyarrow-hotfix==0.7
23
+ exceptiongroup==1.3.0
24
+ multiprocess==0.70.16
25
+ regex==2025.11.3
26
+ markdown-it-py==4.0.0
27
+ py-cpuinfo==9.0.0
28
+ smmap==5.0.2
29
+ sentry-sdk==2.43.0
30
+ rpds-py==0.28.0
31
+ tokenizers==0.22.1
32
+ antlr4-python3-runtime==4.9.3
33
+ pybind11==3.0.1
34
+ Markdown==3.10
35
+ cloudpickle==3.1.2
36
+ pyasn1_modules==0.4.2
37
+ wheel==0.45.1
38
+ urllib3==2.5.0
39
+ tzdata==2025.2
40
+ pyarrow==22.0.0
41
+ certifi==2025.10.5
42
+ typer-slim==0.20.0
43
+ huggingface-hub==0.36.0
44
+ nvidia-ml-py==13.580.82
45
+ pydantic==2.12.4
46
+ deepspeed==0.14.4
47
+ platformdirs==4.5.0
48
+ sentencepiece==0.2.1
49
+ trl==0.25.0
50
+ tensorboard-data-server==0.7.2
51
+ googleapis-common-protos==1.72.0
52
+ hydra-core==1.3.2
53
+ jsonschema-specifications==2025.9.1
54
+ tensordict==0.10.0
55
+ opentelemetry-semantic-conventions==0.59b0
56
+ opentelemetry-exporter-prometheus==0.59b0
57
+ nvidia-cublas-cu12==12.1.3.1
58
+ frozenlist==1.8.0
59
+ sniffio==1.3.1
60
+ packaging==25.0
61
+ h11==0.16.0
62
+ async-timeout==5.0.1
63
+ anyio==4.11.0
64
+ pandas==2.3.3
65
+ httpx==0.28.1
66
+ aiohttp==3.13.2
67
+ typeguard==4.4.4
68
+ Pygments==2.19.2
69
+ docstring_parser==0.17.0
70
+ hjson==3.1.0
71
+ pydantic_core==2.41.5
72
+ ninja==1.13.0
73
+ transformers==4.57.1
74
+ datasets==4.4.1
75
+ zipp==3.23.0
76
+ wrapt==2.0.1
77
+ Werkzeug==3.1.3
78
+ pyvers==0.1.0
79
+ prometheus_client==0.23.1
80
+ grpcio==1.76.0
81
+ cachetools==6.2.2
82
+ smart_open==7.5.0
83
+ rsa==4.9.1
84
+ aiohttp-cors==0.8.1
85
+ opentelemetry-sdk==1.38.0
86
+ nvidia-curand-cu12==10.3.2.106
87
+ fsspec==2024.5.0
88
+ requests==2.32.5
89
+ python-dateutil==2.9.0.post0
90
+ py-spy==0.4.1
91
+ safetensors==0.6.2
92
+ distlib==0.4.0
93
+ psutil==7.1.3
94
+ colorful==0.5.8
95
+ rich==14.2.0
96
+ tyro==0.9.35
97
+ protobuf==6.33.0
98
+ wandb==0.22.3
99
+ pyasn1==0.6.1
100
+ opentelemetry-proto==1.38.0
101
+ torchdata==0.11.0
102
+ pip==25.2
103
+ nvidia-cufft-cu12==11.0.2.54
104
+ nvidia-cuda-nvrtc-cu12==12.1.105
105
+ nvidia-cuda-cupti-cu12==12.1.105
106
+ numpy==1.26.4
107
+ nvidia-cudnn-cu12==8.9.2.26
108
+ nvidia-cusolver-cu12==11.4.5.107
109
+ torchvision==0.18.0+cu121
110
+ torchaudio==2.3.0+cu121
111
+ six==1.17.0
112
+ aiohappyeyeballs==2.6.1
113
+ httpcore==1.0.9
114
+ gitdb==4.0.12
115
+ virtualenv==20.35.4
116
+ orjson==3.11.4
117
+ tensorboard==2.20.0
118
+ google-auth==2.43.0
119
+ verl==0.6.0
120
+ setuptools==80.9.0
121
+ nvidia-nvjitlink-cu12==12.9.86
122
+ nvidia-nccl-cu12==2.20.5
123
+ networkx==3.3
124
+ xxhash==3.6.0
125
+ tqdm==4.67.1
126
+ shellingham==1.5.4
127
+ propcache==0.4.1
128
+ idna==3.11
129
+ hf-xet==1.2.0
130
+ attrs==25.4.0
131
+ shtab==1.7.2
132
+ mdurl==0.1.2
133
+ GitPython==3.1.45
134
+ pylatexenc==2.10
135
+ opencensus-context==0.1.3
136
+ absl-py==2.3.1
137
+ importlib_metadata==8.7.0
138
+ jsonschema==4.25.1
139
+ google-api-core==2.28.1
140
+ ray==2.51.1
141
+ pillow==11.3.0
142
+ nvidia-nvtx-cu12==12.1.105
143
+ MarkupSafe==2.1.5
144
+ filelock==3.19.1
145
+ triton==2.3.0
146
+ pytz==2025.2
147
+ peft==0.12.0
148
+ typing-inspection==0.4.2
149
+ annotated-types==0.7.0
150
+ proto-plus==1.26.1
train/wandb/run-20251114_145219-w9xre5r3/files/wandb-metadata.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28",
3
+ "python": "CPython 3.10.19",
4
+ "startedAt": "2025-11-14T06:52:19.650156Z",
5
+ "args": [
6
+ "--model_name",
7
+ "Qwen/Qwen2.5-1.5B",
8
+ "--dataset_path",
9
+ "./datasets/openr1/Openr1-Math-46k-8192.jsonl",
10
+ "--output_dir",
11
+ "./model_sft_save/Qwen2.5-1.5B-Entropy-solution",
12
+ "--batch_size",
13
+ "2",
14
+ "--grad_accum",
15
+ "4",
16
+ "--learning_rate",
17
+ "5e-6",
18
+ "--epochs",
19
+ "1",
20
+ "--use_entropy_weighting",
21
+ "--teacher_model_path",
22
+ "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
23
+ "--entropy_weight_alpha",
24
+ "2.0",
25
+ "--entropy_weight_beta",
26
+ "0.3",
27
+ "--teacher_dtype",
28
+ "bfloat16",
29
+ "--entropy_top_k",
30
+ "48",
31
+ "--teacher_device_ids",
32
+ "2",
33
+ "--use_deepspeed",
34
+ "--deepspeed_config",
35
+ "deepspeed/dp_stage2.json",
36
+ "--use_wandb",
37
+ "--wandb_project",
38
+ "qwen-math-entropy-sft",
39
+ "--wandb_run_name",
40
+ "qwen2.5-1.5b-46k-entropy-solution"
41
+ ],
42
+ "program": "/public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py",
43
+ "codePath": "train_qwen_46k_weight.py",
44
+ "codePathLocal": "train_qwen_46k_weight.py",
45
+ "email": "yaning1001@gmail.com",
46
+ "root": "/public/home/lshi/yoAI/projects/Online_CL/train",
47
+ "host": "gpu-h100-07",
48
+ "executable": "/public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10",
49
+ "cpu_count": 96,
50
+ "cpu_count_logical": 96,
51
+ "gpu": "NVIDIA H100 80GB HBM3",
52
+ "gpu_count": 8,
53
+ "disk": {
54
+ "/": {
55
+ "total": "469407801344",
56
+ "used": "288248733696"
57
+ }
58
+ },
59
+ "memory": {
60
+ "total": "2164142350336"
61
+ },
62
+ "gpu_nvidia": [
63
+ {
64
+ "name": "NVIDIA H100 80GB HBM3",
65
+ "memoryTotal": "85520809984",
66
+ "cudaCores": 16896,
67
+ "architecture": "Hopper",
68
+ "uuid": "GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89"
69
+ },
70
+ {
71
+ "name": "NVIDIA H100 80GB HBM3",
72
+ "memoryTotal": "85520809984",
73
+ "cudaCores": 16896,
74
+ "architecture": "Hopper",
75
+ "uuid": "GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b"
76
+ },
77
+ {
78
+ "name": "NVIDIA H100 80GB HBM3",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper",
82
+ "uuid": "GPU-0d2164b6-b82a-6774-4914-58672f66b913"
83
+ },
84
+ {
85
+ "name": "NVIDIA H100 80GB HBM3",
86
+ "memoryTotal": "85520809984",
87
+ "cudaCores": 16896,
88
+ "architecture": "Hopper",
89
+ "uuid": "GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd"
90
+ },
91
+ {
92
+ "name": "NVIDIA H100 80GB HBM3",
93
+ "memoryTotal": "85520809984",
94
+ "cudaCores": 16896,
95
+ "architecture": "Hopper",
96
+ "uuid": "GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9"
97
+ },
98
+ {
99
+ "name": "NVIDIA H100 80GB HBM3",
100
+ "memoryTotal": "85520809984",
101
+ "cudaCores": 16896,
102
+ "architecture": "Hopper",
103
+ "uuid": "GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6"
104
+ },
105
+ {
106
+ "name": "NVIDIA H100 80GB HBM3",
107
+ "memoryTotal": "85520809984",
108
+ "cudaCores": 16896,
109
+ "architecture": "Hopper",
110
+ "uuid": "GPU-23628f74-fede-6431-ae15-2764fce29130"
111
+ },
112
+ {
113
+ "name": "NVIDIA H100 80GB HBM3",
114
+ "memoryTotal": "85520809984",
115
+ "cudaCores": 16896,
116
+ "architecture": "Hopper",
117
+ "uuid": "GPU-d18d570f-dd0f-0ff6-3401-561c9e799136"
118
+ }
119
+ ],
120
+ "cudaVersion": "12.4",
121
+ "slurm": {
122
+ "home": "/opt/gridview/slurm",
123
+ "pmix_direct_conn": "true",
124
+ "pmix_direct_conn_early": "false",
125
+ "pmix_direct_conn_ucx": "false",
126
+ "pmix_timeout": "3000"
127
+ },
128
+ "writerId": "4ns8zmo5ar2v4v1bdwhe5waa6417g92a"
129
+ }
train/wandb/run-20251114_145219-w9xre5r3/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/avg_weight":0.3368815451860428,"train/epoch":0.10587612493382742,"eval/samples_per_second":2.588,"eval/steps_per_second":0.65,"_wandb":{"runtime":5180},"train/global_step":300,"train/learning_rate":4.9262455119485295e-06,"train/grad_norm":3.267468214035034,"eval/runtime":176.9466,"_timestamp":1.7631082634970565e+09,"_step":71,"eval/loss":0.3784671425819397,"_runtime":5180,"train/loss":1.5591}
train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-11-14T14:52:19.890537747+08:00","level":"INFO","msg":"stream: starting","core version":"0.22.3"}
2
+ {"time":"2025-11-14T14:52:22.294264931+08:00","level":"INFO","msg":"stream: created new stream","id":"w9xre5r3"}
3
+ {"time":"2025-11-14T14:52:22.294429698+08:00","level":"INFO","msg":"handler: started","stream_id":"w9xre5r3"}
4
+ {"time":"2025-11-14T14:52:22.295381437+08:00","level":"INFO","msg":"stream: started","id":"w9xre5r3"}
5
+ {"time":"2025-11-14T14:52:22.295391392+08:00","level":"INFO","msg":"writer: started","stream_id":"w9xre5r3"}
6
+ {"time":"2025-11-14T14:52:22.295405705+08:00","level":"INFO","msg":"sender: started","stream_id":"w9xre5r3"}
7
+ {"time":"2025-11-14T15:49:09.252958086+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/yaning1001-dartmouth-college/qwen-math-entropy-sft/w9xre5r3/file_stream\": unexpected EOF"}
8
+ {"time":"2025-11-14T16:12:36.32995217+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/yaning1001-dartmouth-college/qwen-math-entropy-sft/w9xre5r3/file_stream\": unexpected EOF"}
9
+ {"time":"2025-11-14T16:18:43.711810258+08:00","level":"INFO","msg":"stream: closing","id":"w9xre5r3"}
train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Current SDK version is 0.22.3
2
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Configure stats pid to 1134562
3
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from /public/home/lshi/.config/wandb/settings
4
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from /public/home/lshi/yoAI/projects/Online_CL/train/wandb/settings
5
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_setup.py:_flush():81] Loading settings from environment variables
6
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:setup_run_log_directory():706] Logging user logs to /public/home/lshi/yoAI/projects/Online_CL/train/wandb/run-20251114_145219-w9xre5r3/logs/debug.log
7
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:setup_run_log_directory():707] Logging internal logs to /public/home/lshi/yoAI/projects/Online_CL/train/wandb/run-20251114_145219-w9xre5r3/logs/debug-internal.log
8
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():833] calling init triggers
9
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():838] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-11-14 14:52:19,676 INFO MainThread:1134562 [wandb_init.py:init():881] starting backend
12
+ 2025-11-14 14:52:19,882 INFO MainThread:1134562 [wandb_init.py:init():884] sending inform_init request
13
+ 2025-11-14 14:52:19,886 INFO MainThread:1134562 [wandb_init.py:init():892] backend started and connected
14
+ 2025-11-14 14:52:19,887 INFO MainThread:1134562 [wandb_init.py:init():962] updated telemetry
15
+ 2025-11-14 14:52:19,888 INFO MainThread:1134562 [wandb_init.py:init():986] communicating run to backend with 90.0 second timeout
16
+ 2025-11-14 14:52:22,838 INFO MainThread:1134562 [wandb_init.py:init():1033] starting run threads in backend
17
+ 2025-11-14 14:52:22,926 INFO MainThread:1134562 [wandb_run.py:_console_start():2506] atexit reg
18
+ 2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2354] redirect: wrap_raw
19
+ 2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2423] Wrapping output streams.
20
+ 2025-11-14 14:52:22,927 INFO MainThread:1134562 [wandb_run.py:_redirect():2446] Redirects installed.
21
+ 2025-11-14 14:52:22,929 INFO MainThread:1134562 [wandb_init.py:init():1073] run started, returning control to user process
22
+ 2025-11-14 14:53:10,780 INFO MainThread:1134562 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 151665, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': True, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': 'bfloat16', 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 8192, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 
'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'Qwen/Qwen2.5-1.5B', 'transformers_version': '4.57.1', 'model_type': 'qwen2', 'use_mrope': False, 'tf_legacy_loss': False, 'use_bfloat16': False, 'output_attentions': False, 'output_dir': './model_sft_save/Qwen2.5-1.5B-Entropy-solution', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './model_sft_save/Qwen2.5-1.5B-Entropy-solution/runs/Nov14_14-52-26_gpu-h100-07', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 50, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 
'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'qwen2.5-1.5b-46k-entropy-solution', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': 'deepspeed/dp_stage2.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': False}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': 
'', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'model_init_kwargs': None, 'chat_template_path': None, 'dataset_text_field': None, 'dataset_kwargs': None, 'dataset_num_proc': None, 'eos_token': '<EOS_TOKEN>', 'pad_token': '<PAD_TOKEN>', 'packing': False, 'packing_strategy': 'bfd', 'padding_free': False, 'pad_to_multiple_of': None, 'eval_packing': None, 'completion_only_loss': None, 'assistant_only_loss': False, 'loss_type': 'nll', 'activation_offloading': False, 'teacher_model_path': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', 'entropy_weight_alpha': 2.0, 'entropy_weight_beta': 0.3, 'use_entropy_weighting': True, 'teacher_dtype': 'bfloat16', 'entropy_top_k': 48, 'teacher_device_ids': '2'}
23
+ 2025-11-14 14:53:10,783 INFO MainThread:1134562 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1543298048 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x15291a429570>>
24
+ 2025-11-14 14:53:10,784 INFO MainThread:1134562 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 1543298048 None
25
+ 2025-11-14 16:18:43,711 INFO wandb-AsyncioManager-main:1134562 [service_client.py:_forward_responses():80] Reached EOF.
26
+ 2025-11-14 16:18:43,711 INFO wandb-AsyncioManager-main:1134562 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
train/wandb/run-20251114_145222-i8zbx8vz/files/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.22.3
4
+ e:
5
+ 2utx15k5rol3shdkavsb8ec17gbbtrpd:
6
+ args:
7
+ - --model_name
8
+ - Qwen/Qwen2.5-1.5B
9
+ - --dataset_path
10
+ - ./datasets/openr1/Openr1-Math-46k-8192.jsonl
11
+ - --output_dir
12
+ - ./model_sft_save/Qwen2.5-1.5B-Entropy-solution
13
+ - --batch_size
14
+ - "2"
15
+ - --grad_accum
16
+ - "4"
17
+ - --learning_rate
18
+ - "5e-6"
19
+ - --epochs
20
+ - "1"
21
+ - --use_entropy_weighting
22
+ - --teacher_model_path
23
+ - deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
24
+ - --entropy_weight_alpha
25
+ - "2.0"
26
+ - --entropy_weight_beta
27
+ - "0.3"
28
+ - --teacher_dtype
29
+ - bfloat16
30
+ - --entropy_top_k
31
+ - "48"
32
+ - --teacher_device_ids
33
+ - "2"
34
+ - --use_deepspeed
35
+ - --deepspeed_config
36
+ - deepspeed/dp_stage2.json
37
+ - --use_wandb
38
+ - --wandb_project
39
+ - qwen-math-entropy-sft
40
+ - --wandb_run_name
41
+ - qwen2.5-1.5b-46k-entropy-solution
42
+ codePath: train_qwen_46k_weight.py
43
+ codePathLocal: train_qwen_46k_weight.py
44
+ cpu_count: 96
45
+ cpu_count_logical: 96
46
+ cudaVersion: "12.4"
47
+ disk:
48
+ /:
49
+ total: "469407801344"
50
+ used: "288248737792"
51
+ email: yaning1001@gmail.com
52
+ executable: /public/home/lshi/miniconda3/envs/sft_qwen/bin/python3.10
53
+ gpu: NVIDIA H100 80GB HBM3
54
+ gpu_count: 8
55
+ gpu_nvidia:
56
+ - architecture: Hopper
57
+ cudaCores: 16896
58
+ memoryTotal: "85520809984"
59
+ name: NVIDIA H100 80GB HBM3
60
+ uuid: GPU-d26f2d36-a358-5f8b-e928-f5ef4f73bc89
61
+ - architecture: Hopper
62
+ cudaCores: 16896
63
+ memoryTotal: "85520809984"
64
+ name: NVIDIA H100 80GB HBM3
65
+ uuid: GPU-379104cc-3e15-4b2c-1b78-4ee1f142e16b
66
+ - architecture: Hopper
67
+ cudaCores: 16896
68
+ memoryTotal: "85520809984"
69
+ name: NVIDIA H100 80GB HBM3
70
+ uuid: GPU-0d2164b6-b82a-6774-4914-58672f66b913
71
+ - architecture: Hopper
72
+ cudaCores: 16896
73
+ memoryTotal: "85520809984"
74
+ name: NVIDIA H100 80GB HBM3
75
+ uuid: GPU-4f4398d2-2978-d7cb-7a33-2995e4efdbfd
76
+ - architecture: Hopper
77
+ cudaCores: 16896
78
+ memoryTotal: "85520809984"
79
+ name: NVIDIA H100 80GB HBM3
80
+ uuid: GPU-0b6b2f40-df37-1563-f0cb-727bd3ac0fd9
81
+ - architecture: Hopper
82
+ cudaCores: 16896
83
+ memoryTotal: "85520809984"
84
+ name: NVIDIA H100 80GB HBM3
85
+ uuid: GPU-8574e86d-a1bb-13dd-1843-407b718ebdf6
86
+ - architecture: Hopper
87
+ cudaCores: 16896
88
+ memoryTotal: "85520809984"
89
+ name: NVIDIA H100 80GB HBM3
90
+ uuid: GPU-23628f74-fede-6431-ae15-2764fce29130
91
+ - architecture: Hopper
92
+ cudaCores: 16896
93
+ memoryTotal: "85520809984"
94
+ name: NVIDIA H100 80GB HBM3
95
+ uuid: GPU-d18d570f-dd0f-0ff6-3401-561c9e799136
96
+ host: gpu-h100-07
97
+ memory:
98
+ total: "2164142350336"
99
+ os: Linux-4.18.0-372.9.1.el8.x86_64-x86_64-with-glibc2.28
100
+ program: /public/home/lshi/yoAI/projects/Online_CL/train/train_qwen_46k_weight.py
101
+ python: CPython 3.10.19
102
+ root: /public/home/lshi/yoAI/projects/Online_CL/train
103
+ slurm:
104
+ home: /opt/gridview/slurm
105
+ pmix_direct_conn: "true"
106
+ pmix_direct_conn_early: "false"
107
+ pmix_direct_conn_ucx: "false"
108
+ pmix_timeout: "3000"
109
+ startedAt: "2025-11-14T06:52:22.127959Z"
110
+ writerId: 2utx15k5rol3shdkavsb8ec17gbbtrpd
111
+ m: []
112
+ python_version: 3.10.19
113
+ t:
114
+ "1":
115
+ - 1
116
+ - 11
117
+ - 41
118
+ - 49
119
+ - 51
120
+ - 71
121
+ - 84
122
+ - 98
123
+ - 105
124
+ "2":
125
+ - 1
126
+ - 11
127
+ - 41
128
+ - 49
129
+ - 51
130
+ - 71
131
+ - 84
132
+ - 98
133
+ - 105
134
+ "3":
135
+ - 13
136
+ "4": 3.10.19
137
+ "5": 0.22.3
138
+ "6": 4.57.1
139
+ "10":
140
+ - 20
141
+ "12": 0.22.3
142
+ "13": linux-x86_64