Linksome committed on
Commit
ef8845f
·
verified ·
1 Parent(s): b67bc80

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. A/.ipynb_checkpoints/RUNME-checkpoint.sh +386 -0
  2. A/.ipynb_checkpoints/runA-checkpoint.py +232 -0
  3. A/.ipynb_checkpoints/trainer_log-checkpoint.jsonl +0 -0
  4. A/logs/A/10k_port8006_gpu0_20251224_032906_batch2.log +0 -0
  5. A/logs/A/10k_port8006_gpu0_20251229_035755_batch2.log +0 -0
  6. A/logs/A/10k_port8006_gpu0_20251229_035755_batch2.log.pid +1 -0
  7. A/logs/A/1k_port8002_gpu0_20251229_060558_batch1.log +0 -0
  8. A/logs/A/1k_port8002_gpu0_20251229_060558_batch1.log.pid +1 -0
  9. A/logs/A/2k_port8003_gpu0_20251229_035755_batch1.log +0 -0
  10. A/logs/A/2k_port8003_gpu0_20251229_035755_batch1.log.pid +1 -0
  11. A/logs/A/2k_port8003_gpu0_20251229_060558_batch1.log +0 -0
  12. A/logs/A/3k_port8004_gpu0_20251224_032906_batch1.log +0 -0
  13. A/logs/A/3k_port8004_gpu0_20251229_035755_batch1.log +0 -0
  14. A/logs/A/3k_port8004_gpu0_20251229_035755_batch1.log.pid +1 -0
  15. A/logs/A/3k_port8004_gpu0_20251229_060558_batch1.log +0 -0
  16. A/logs/A/4k_port8005_gpu0_20251229_035755_batch1.log +0 -0
  17. A/logs/A/5k_port8006_gpu0_20251224_032906_batch1.log +0 -0
  18. A/logs/A/5k_port8006_gpu0_20251229_035755_batch1.log +0 -0
  19. A/logs/A/5k_port8006_gpu0_20251229_035755_batch1.log.pid +1 -0
  20. A/logs/A/6k_port8002_gpu0_20251224_032906_batch2.log +0 -0
  21. A/logs/A/6k_port8002_gpu0_20251229_035755_batch2.log +0 -0
  22. A/logs/A/6k_port8002_gpu0_20251229_035755_batch2.log.pid +1 -0
  23. A/logs/A/7k_port8003_gpu0_20251224_032906_batch2.log +135 -0
  24. A/logs/A/7k_port8003_gpu0_20251229_035755_batch2.log +0 -0
  25. A/logs/A/7k_port8003_gpu0_20251229_035755_batch2.log.pid +1 -0
  26. A/logs/A/8k_port8004_gpu0_20251224_032906_batch2.log +0 -0
  27. A/logs/A/8k_port8004_gpu0_20251229_035755_batch2.log +0 -0
  28. A/logs/A/8k_port8004_gpu0_20251229_035755_batch2.log.pid +1 -0
  29. A/logs/A/9k_port8005_gpu0_20251224_032906_batch2.log +0 -0
  30. A/logs/A/9k_port8005_gpu0_20251224_032906_batch2.log.pid +1 -0
  31. A/logs/A/9k_port8005_gpu0_20251229_035755_batch2.log +0 -0
  32. A/logs/A/9k_port8005_gpu0_20251229_035755_batch2.log.pid +1 -0
  33. C/logs/C/1k_port8002_gpu0_20251223_091224_batch1.log +0 -0
  34. C/logs/C/1k_port8002_gpu0_20251223_091224_batch1.log.pid +1 -0
  35. C/logs/C/1k_port8002_gpu0_20251223_141442_batch1.log +0 -0
  36. C/logs/C/1k_port8002_gpu0_20251223_141442_batch1.log.pid +1 -0
  37. C/logs/C/2k_port8003_gpu0_20251223_091224_batch1.log.pid +1 -0
  38. C/logs/C/2k_port8003_gpu0_20251223_141442_batch1.log +0 -0
  39. C/logs/C/2k_port8003_gpu0_20251223_141442_batch1.log.pid +1 -0
  40. C/logs/C/3k_port8004_gpu0_20251223_091224_batch1.log +0 -0
  41. C/logs/C/3k_port8004_gpu0_20251223_091224_batch1.log.pid +1 -0
  42. C/logs/C/3k_port8004_gpu0_20251223_141442_batch1.log +0 -0
  43. C/logs/C/3k_port8004_gpu0_20251223_141442_batch1.log.pid +1 -0
  44. C/logs/C/4k_port8005_gpu0_20251223_091224_batch1.log +0 -0
  45. C/logs/C/4k_port8005_gpu0_20251223_091224_batch1.log.pid +1 -0
  46. C/logs/C/5k_port8006_gpu0_20251223_091224_batch1.log +0 -0
  47. C/logs/C/5k_port8006_gpu0_20251223_091224_batch1.log.pid +1 -0
  48. C/logs/C/6k_port8002_gpu0_20251223_141442_batch2.log +0 -0
  49. C/logs/C/7k_port8003_gpu0_20251223_141442_batch2.log +0 -0
  50. H.yaml +63 -0
A/.ipynb_checkpoints/RUNME-checkpoint.sh ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# -----------------------------
# User config
# -----------------------------
# Every setting keeps its previous value as the default, but can be overridden
# from the caller's environment, e.g.:
#   config=C BASE_PORT=9002 ./RUNME.sh
config="${config:-A}"
CONFIG_DIR="${CONFIG_DIR:-/workspace/v121rc_exp1/${config}}"

# YAML generation defaults (written verbatim into each per-checkpoint YAML)
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/workspace/meta-llama/Llama-3.1-8B-Instruct}"
TEMPLATE="${TEMPLATE:-llama3}"
FINETUNING_TYPE="${FINETUNING_TYPE:-lora}"
INFER_BACKEND="${INFER_BACKEND:-huggingface}"
TRUST_REMOTE_CODE="${TRUST_REMOTE_CODE:-true}"

# Launch config
BASE_PORT="${BASE_PORT:-8002}"                                # first service port; later services use BASE_PORT+1, +2, ...
SLEEP_BETWEEN_LAUNCHES_SEC="${SLEEP_BETWEEN_LAUNCHES_SEC:-10}"
VRAM_THRESHOLD_PCT="${VRAM_THRESHOLD_PCT:-80}"                # if GPU >= threshold after launch, try next GPU for next ckpt
BATCH_MIN_MODELS="${BATCH_MIN_MODELS:-1}"                     # start eval once at least this many services are up

# Eval config (passed to python through exported environment variables)
PYTHON_EVAL="${PYTHON_EVAL:-/workspace/v121rc_exp1/A/runA.py}"
EVAL_WORKING_DIR="${EVAL_WORKING_DIR:-/workspace/v121rc_exp1/PandaEval12_2/HNO3}"
EVAL_SUBWORD="${EVAL_SUBWORD:-wo_reasoning}"
FORBIDDEN_SUBWORDS_JSON="${FORBIDDEN_SUBWORDS_JSON:-[]}"
PARTICULAR="${PARTICULAR:-}"
SAVE_DIR="${SAVE_DIR:-${CONFIG_DIR}}"

# Always stop services between batches to free VRAM
STOP_SERVICES_BETWEEN_BATCHES="${STOP_SERVICES_BETWEEN_BATCHES:-true}"

# -----------------------------
# Setup logging
# -----------------------------
# One log (and .pid) file per launched service lives under LOG_ROOT/<config>;
# the timestamp is taken once so all files of a run share the same stamp.
LOG_ROOT="${CONFIG_DIR}/logs"
mkdir -p "${LOG_ROOT}/${config}"
timestamp=$(date +"%Y%m%d_%H%M%S")
40
+
41
+ # -----------------------------
42
+ # Helpers
43
+ # -----------------------------
44
# require_cmd NAME
# Abort the whole script with exit status 1 (and a message on stderr) when
# NAME cannot be resolved by `command -v`; otherwise do nothing.
require_cmd() {
  local tool="$1"
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo "ERROR: missing command: ${tool}" >&2
    exit 1
  fi
}
47
# Fail fast on every external tool the script invokes. `llamafactory-cli` and
# `wc` were previously unchecked: a missing `llamafactory-cli` only surfaced
# after the endpoint wait timed out, with an unrelated-looking error.
for _required_cmd in nvidia-smi python curl sort awk wc llamafactory-cli; do
  require_cmd "${_required_cmd}"
done
unset _required_cmd
52
+
53
# num_gpus
# Print the number of lines `nvidia-smi -L` emits (it lists one device per
# line), as a bare integer.
num_gpus() {
  nvidia-smi -L | awk 'END { print NR }'
}
56
+
57
# gpu_mem_pct GPU_INDEX
# Print the used-memory percentage (integer, truncated) of one GPU as reported
# by nvidia-smi.
#
# Always prints exactly one integer so callers can use it in (( ... )):
#   * total memory of 0, or non-numeric fields such as "[N/A]", report 100
#     (treat the GPU as full) instead of making awk divide by zero;
#   * empty nvidia-smi output also reports 100 instead of printing nothing;
#   * only the first output line is considered.
gpu_mem_pct() {
  local gpu="$1"
  nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits -i "${gpu}" \
    | awk -F',' '
        NR == 1 {
          used = $1 + 0      # "+ 0" forces numeric context; "[N/A]" becomes 0
          total = $2 + 0
          if (total <= 0) { print 100 } else { printf("%d\n", (used / total) * 100) }
          seen = 1
        }
        END { if (!seen) print 100 }
      '
}
62
+
63
# launch_service GPU PORT YAML LOG_FILE PID_FILE
# Start `llamafactory-cli api YAML` in the background with API_PORT and
# CUDA_VISIBLE_DEVICES set for that process only, send its stdout/stderr to
# LOG_FILE, and record the background PID in PID_FILE. Does not wait for the
# service to become ready.
launch_service () {
  local device_id="$1" listen_port="$2" service_cfg="$3" out_log="$4" pid_out="$5"

  printf 'Starting (GPU %s) port %s : %s\n' "${device_id}" "${listen_port}" "${service_cfg}"
  printf 'Log: %s\n' "${out_log}"

  API_PORT="${listen_port}" CUDA_VISIBLE_DEVICES="${device_id}" \
    llamafactory-cli api "${service_cfg}" >"${out_log}" 2>&1 &

  # $! is the PID of the job that was just put in the background.
  printf '%s\n' "$!" >"${pid_out}"
}
79
+
80
# wait_for_endpoint PORT [PID_FILE]
# Poll http://localhost:PORT/v1/models every 2 s, up to 120 attempts.
#
# Arguments:
#   PORT      port the service is expected to listen on.
#   PID_FILE  optional; file holding the PID of the launched service. When
#             given, the wait ends as soon as that process no longer exists
#             instead of polling for the full timeout.
# Returns 0 once curl gets a response, 1 on timeout or when the service exited.
wait_for_endpoint () {
  local port="$1"
  local pid_file="${2:-}"
  local url="http://localhost:${port}/v1/models"
  local attempt pid   # keep loop state out of the global namespace

  for attempt in {1..120}; do
    if curl -sS -m 2 "${url}" >/dev/null 2>&1; then
      echo " ready: ${url}"
      return 0
    fi

    # Fail fast: a service that already died will never answer.
    if [[ -n "${pid_file}" && -f "${pid_file}" ]]; then
      pid="$(cat "${pid_file}" 2>/dev/null || true)"
      if [[ -n "${pid}" ]] && ! kill -0 "${pid}" >/dev/null 2>&1; then
        echo "ERROR: Service process ${pid} exited before endpoint became ready: ${url}" >&2
        return 1
      fi
    fi

    sleep 2
  done

  echo "ERROR: Endpoint did not become ready: ${url}" >&2
  return 1
}
95
+
96
# stop_batch_services PID_FILE...
# Terminate every service whose PID is stored in the given files and wait
# until the processes are really gone.
#
# Sending SIGTERM alone is not enough: the next batch reuses the same ports,
# and a service that is still shutting down keeps its port bound ("address
# already in use"). So after SIGTERM this waits up to STOP_GRACE_SEC seconds
# (default 30) in total and then escalates to SIGKILL.
# Missing PID files and non-numeric contents are skipped. Always returns 0.
stop_batch_services () {
  local pidfiles=("$@")
  local pf pid
  local -a live_pids=()
  local grace_sec="${STOP_GRACE_SEC:-30}"

  echo "Stopping batch services: ${#pidfiles[@]} processes"

  # Pass 1: ask every live service to terminate.
  # The ${arr[@]+...} form keeps `set -u` happy on an empty array (bash < 4.4).
  for pf in ${pidfiles[@]+"${pidfiles[@]}"}; do
    [[ -f "${pf}" ]] || continue
    pid="$(cat "${pf}" 2>/dev/null || true)"
    if [[ "${pid}" =~ ^[0-9]+$ ]] && kill -0 "${pid}" >/dev/null 2>&1; then
      kill "${pid}" >/dev/null 2>&1 || true
      live_pids+=( "${pid}" )
    fi
  done

  # Pass 2: wait for them to exit; force-kill whatever outlives the deadline.
  local deadline=$(( SECONDS + grace_sec ))
  for pid in ${live_pids[@]+"${live_pids[@]}"}; do
    while kill -0 "${pid}" >/dev/null 2>&1; do
      if (( SECONDS >= deadline )); then
        echo "Process ${pid} did not exit after SIGTERM; sending SIGKILL" >&2
        kill -9 "${pid}" >/dev/null 2>&1 || true
        break
      fi
      sleep 0.2
    done
  done
  return 0
}
107
+
108
+ # -----------------------------
109
+ # Discover checkpoints
110
+ # -----------------------------
111
# discover_checkpoints_json
# Print a JSON array (e.g. "[1000, 2000]") with the numeric step of every
# "${CONFIG_DIR}/checkpoint-<step>" entry, in `sort -V` order of the paths.
# Entries whose suffix is not all digits are ignored. Exits 1 when the glob
# matches nothing at all.
discover_checkpoints_json () {
  shopt -s nullglob   # an unmatched glob expands to nothing instead of itself
  local found=( "${CONFIG_DIR}"/checkpoint-* )
  if [[ ${#found[@]} -eq 0 ]]; then
    echo "ERROR: No checkpoint-* folders found under: ${CONFIG_DIR}" >&2
    exit 1
  fi

  local ordered=()
  mapfile -t ordered < <(printf "%s\n" "${found[@]}" | sort -V)

  local joined="" path leaf suffix
  for path in "${ordered[@]}"; do
    leaf="${path##*/}"              # last path component
    suffix="${leaf#checkpoint-}"    # what follows the "checkpoint-" prefix
    [[ "${suffix}" =~ ^[0-9]+$ ]] || continue
    joined+="${joined:+, }${suffix}"
  done

  printf '%s\n' "[${joined}]"
}
139
+
140
+ # -----------------------------
141
+ # Compute which checkpoints still need launching (resume-aware)
142
+ # -----------------------------
143
# compute_needed_checkpoints_json ALL_CKPTS_JSON
# Print the JSON sub-list of ALL_CKPTS_JSON whose results are still incomplete.
#
# Reads CONFIG_DIR, SAVE_DIR, EVAL_WORKING_DIR, EVAL_SUBWORD,
# FORBIDDEN_SUBWORDS_JSON and PARTICULAR from the environment (they must be
# exported). A checkpoint counts as complete only if, for every selected eval
# file, the matching "*_results.json" in SAVE_DIR is a list of the same length
# as the input and every entry has a non-blank string at step_<ckpt>.output.
# When no eval file is selected, every checkpoint is reported as needed.
compute_needed_checkpoints_json () {
  local ckpts_json="$1"

  python - "${ckpts_json}" <<'PY'
import json
import os
import sys

save_dir = os.environ.get("SAVE_DIR", os.environ.get("CONFIG_DIR"))
working_dir = os.environ.get("EVAL_WORKING_DIR")
subword = os.environ.get("EVAL_SUBWORD", "")
forbidden = json.loads(os.environ.get("FORBIDDEN_SUBWORDS_JSON", "[]"))
particular = os.environ.get("PARTICULAR", "")

all_steps = json.loads(sys.argv[1])


def is_candidate(name: str) -> bool:
    """Same file filter the evaluator applies to the eval directory."""
    return (
        (not subword or subword in name)
        and not any(bad in name for bad in forbidden)
        and (not particular or particular in name)
        and name.endswith(".json")
    )


candidates = sorted(name for name in os.listdir(working_dir) if is_candidate(name))
if not candidates:
    # Nothing to compare against: report everything as still needed.
    print(json.dumps(all_steps))
    sys.exit(0)


def read_json(path: str):
    with open(path, "r") as handle:
        return json.load(handle)


def has_all_outputs(eval_name: str, step: int) -> bool:
    """True when the results file for eval_name holds an output for every entry at this step."""
    result_path = os.path.join(save_dir, eval_name.replace(".json", "_results.json"))
    if not os.path.exists(result_path):
        return False
    try:
        inputs = read_json(os.path.join(working_dir, eval_name))
        results = read_json(result_path)
    except Exception:
        # Unreadable or corrupt file: treat as incomplete.
        return False

    if not (isinstance(inputs, list) and isinstance(results, list)):
        return False
    if len(results) != len(inputs):
        return False

    field = f"step_{step}"
    for row in results:
        text = (row.get(field) or {}).get("output", "")
        if not isinstance(text, str) or text.strip() == "":
            return False
    return True


pending = [
    step
    for step in all_steps
    if not all(has_all_outputs(name, step) for name in candidates)
]
print(json.dumps(pending))
PY
}
211
+
212
+ # -----------------------------
213
+ # Generate YAML for one checkpoint
214
+ # -----------------------------
215
# write_yaml_for_ckpt STEP
# Write "${CONFIG_DIR}/<name>.yaml" for checkpoint STEP and print its path.
# <name> is "<STEP/1000>k" when STEP is a multiple of 1000, otherwise STEP.
#
# Reads CONFIG_DIR, MODEL_NAME_OR_PATH, TEMPLATE, FINETUNING_TYPE,
# INFER_BACKEND and TRUST_REMOTE_CODE from the environment (all required).
# Fails with a message when "${CONFIG_DIR}/checkpoint-STEP" is not a directory.
write_yaml_for_ckpt () {
  local ckpt_step="$1"

  python - "${ckpt_step}" <<'PY'
import os
import sys

step = int(sys.argv[1])
env = os.environ

config_dir = env["CONFIG_DIR"]
adapter_dir = os.path.join(config_dir, f"checkpoint-{step}")
if not os.path.isdir(adapter_dir):
    sys.exit(f"Missing checkpoint dir: {adapter_dir}")

# Round thousands get the short "Nk" tag; everything else keeps the raw step.
tag = str(step) if step % 1000 else f"{step//1000}k"
yaml_path = os.path.join(config_dir, f"{tag}.yaml")

settings = [
    ("model_name_or_path", env["MODEL_NAME_OR_PATH"]),
    ("adapter_name_or_path", adapter_dir),
    ("template", env["TEMPLATE"]),
    ("finetuning_type", env["FINETUNING_TYPE"]),
    ("infer_backend", env["INFER_BACKEND"]),
    ("trust_remote_code", env["TRUST_REMOTE_CODE"]),
]

with open(yaml_path, "w") as handle:
    handle.writelines(f"{key}: {value}\n" for key, value in settings)

print(yaml_path)
PY
}
248
+
249
+ # -----------------------------
250
+ # Main (batch loop)
251
+ # -----------------------------
252
# The embedded Python helpers and the eval script read their settings from
# the environment, so everything they need is exported here.
export CONFIG_DIR SAVE_DIR EVAL_WORKING_DIR EVAL_SUBWORD FORBIDDEN_SUBWORDS_JSON PARTICULAR
export MODEL_NAME_OR_PATH TEMPLATE FINETUNING_TYPE INFER_BACKEND TRUST_REMOTE_CODE

ALL_CKPTS_JSON="$(discover_checkpoints_json)"
GPU_COUNT="$(num_gpus)"
echo "Detected GPUs: ${GPU_COUNT}"
echo "All checkpoints found: ${ALL_CKPTS_JSON}"

batch_idx=0
batch_pidfiles=()      # pid files of the services started by the current batch
prev_needed_json=""    # "needed" list seen before the previous batch (progress check)

# Stop whatever the current batch started and forget the pid files.
stop_current_batch () {
  if (( ${#batch_pidfiles[@]} > 0 )); then
    stop_batch_services "${batch_pidfiles[@]}" || true
    batch_pidfiles=()
  fi
}

# Exit hook: without it, any abnormal exit (eval failure under `set -e`,
# Ctrl-C, ...) left services running; they kept their ports and VRAM and the
# next run silently talked to them instead of the checkpoint it had launched.
on_exit () {
  if [[ "${STOP_SERVICES_BETWEEN_BATCHES}" == "true" ]]; then
    stop_current_batch
  fi
}
trap on_exit EXIT
trap 'exit 130' INT    # route signals through the EXIT trap
trap 'exit 143' TERM

while true; do
  NEEDED_CKPTS_JSON="$(compute_needed_checkpoints_json "${ALL_CKPTS_JSON}")"
  echo "Still needed checkpoints: ${NEEDED_CKPTS_JSON}"

  if [[ "${NEEDED_CKPTS_JSON}" == "[]" ]]; then
    echo "All checkpoints complete across outputs. Done."
    exit 0
  fi

  # A full batch that completed nothing would otherwise repeat forever
  # (e.g. every request failing leaves all outputs empty).
  if [[ "${NEEDED_CKPTS_JSON}" == "${prev_needed_json}" ]]; then
    echo "ERROR: No progress after batch ${batch_idx}; still needed: ${NEEDED_CKPTS_JSON}" >&2
    echo "Check the service logs under ${LOG_ROOT}/${config} and the eval output." >&2
    exit 1
  fi
  prev_needed_json="${NEEDED_CKPTS_JSON}"

  batch_idx=$((batch_idx + 1))
  echo "=============================="
  echo "Batch ${batch_idx}: launching what fits under VRAM threshold (${VRAM_THRESHOLD_PCT}%)"
  echo "=============================="

  # Parse needed list into bash array
  mapfile -t NEEDED_LIST < <(python - "${NEEDED_CKPTS_JSON}" <<'PY'
import json, sys
for x in json.loads(sys.argv[1]):
    print(int(x))
PY
  )

  MODELS_JSON="{"
  first=1
  launched=0

  # track launched service pidfiles to stop after batch
  batch_pidfiles=()

  port="${BASE_PORT}"
  gpu=0

  for ckpt in "${NEEDED_LIST[@]}"; do
    # Find a GPU with headroom; if none, stop launching more in this batch.
    found_gpu="false"
    for ((try=0; try<GPU_COUNT; try++)); do
      pct="$(gpu_mem_pct "${gpu}")"
      if (( pct < VRAM_THRESHOLD_PCT )); then
        found_gpu="true"
        break
      fi
      gpu=$((gpu + 1))
      if (( gpu >= GPU_COUNT )); then gpu=0; fi
    done

    if [[ "${found_gpu}" != "true" ]]; then
      echo "No GPU under ${VRAM_THRESHOLD_PCT}% VRAM. Stop launching; start eval with current batch."
      break
    fi

    # Refuse to launch onto a port that already answers: the new service could
    # not bind, yet the readiness probe would succeed against the stale one and
    # its answers would be recorded under this checkpoint.
    if curl -sS -m 2 "http://localhost:${port}/v1/models" >/dev/null 2>&1; then
      echo "ERROR: port ${port} is already served by another process; refusing to launch checkpoint ${ckpt} there." >&2
      stop_current_batch
      exit 1
    fi

    yaml_path="$(write_yaml_for_ckpt "${ckpt}")"
    tag="$(basename "${yaml_path}" .yaml)"
    log_file="${LOG_ROOT}/${config}/${tag}_port${port}_gpu${gpu}_${timestamp}_batch${batch_idx}.log"
    pid_file="${log_file}.pid"

    launch_service "${gpu}" "${port}" "${yaml_path}" "${log_file}" "${pid_file}"
    batch_pidfiles+=( "${pid_file}" )

    if ! wait_for_endpoint "${port}" "${pid_file}"; then
      echo "Endpoint failed on port ${port}; stopping batch and exiting."
      stop_current_batch
      exit 1
    fi

    # The port answers, but make sure it is OUR process that is still alive.
    svc_pid="$(cat "${pid_file}" 2>/dev/null || true)"
    if [[ -z "${svc_pid}" ]] || ! kill -0 "${svc_pid}" >/dev/null 2>&1; then
      echo "ERROR: service for checkpoint ${ckpt} exited (see ${log_file}) although port ${port} answers." >&2
      stop_current_batch
      exit 1
    fi

    url="http://localhost:${port}/v1/chat/completions"
    if (( first == 1 )); then
      MODELS_JSON+="\"${url}\": ${ckpt}"
      first=0
    else
      MODELS_JSON+=", \"${url}\": ${ckpt}"
    fi

    launched=$((launched + 1))

    pct_after="$(gpu_mem_pct "${gpu}")"
    echo "GPU ${gpu} VRAM after launch: ${pct_after}%"
    if (( pct_after >= VRAM_THRESHOLD_PCT )); then
      gpu=$((gpu + 1))
      if (( gpu >= GPU_COUNT )); then gpu=0; fi
    fi

    port=$((port + 1))
    echo "Sleeping ${SLEEP_BETWEEN_LAUNCHES_SEC}s to avoid VRAM spikes..."
    sleep "${SLEEP_BETWEEN_LAUNCHES_SEC}"
  done

  MODELS_JSON+="}"
  echo "Launched models in batch ${batch_idx}: ${launched}"
  echo "MODELS_JSON=${MODELS_JSON}"

  if (( launched < BATCH_MIN_MODELS )); then
    echo "ERROR: Could not launch even ${BATCH_MIN_MODELS} model(s) under VRAM threshold."
    echo "Either increase VRAM_THRESHOLD_PCT, reduce model size, or free VRAM."
    exit 1
  fi

  # Run eval for this batch
  export MODELS_JSON
  export CKPTS_JSON="[]" # unused when MODELS_JSON exists, but keep it defined
  export BASE_PORT="${BASE_PORT}"

  echo "Running eval for batch ${batch_idx}: python ${PYTHON_EVAL}"
  python "${PYTHON_EVAL}"

  # Stop services to free VRAM for next batch
  if [[ "${STOP_SERVICES_BETWEEN_BATCHES}" == "true" ]]; then
    stop_current_batch
    echo "Batch ${batch_idx} services stopped."
    # give GPU a moment to release memory
    sleep 5
  else
    echo "Leaving batch services running (not recommended for batch mode)."
    echo "This may prevent future batches from launching due to VRAM saturation."
  fi
done
A/.ipynb_checkpoints/runA-checkpoint.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import hashlib
4
+ from typing import Any, Dict, Tuple, List
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from tqdm import tqdm
8
+ import requests
9
+ from loguru import logger
10
+
11
+
12
def getenv_str(key: str, default: str) -> str:
    """Return the environment variable ``key``, or ``default`` when it is unset.

    An empty value that is set is returned as-is (it does not fall back).
    """
    return os.environ.get(key, default)
15
+
16
+
17
def getenv_int(key: str, default: int) -> int:
    """Return the environment variable ``key`` parsed as an int.

    Falls back to ``default`` when the variable is unset or blank.

    Raises:
        ValueError: if the variable is set to something ``int()`` rejects.
    """
    raw = os.environ.get(key)
    if raw is not None and raw.strip() != "":
        try:
            return int(raw)
        except ValueError:
            raise ValueError(f"Env var {key} must be int, got: {raw!r}")
    return default
25
+
26
+
27
+ # ----------------------------
28
+ # Read config from environment
29
+ # ----------------------------
30
# Directories: where checkpoints live and where result files are written.
CONFIG_DIR = getenv_str("CONFIG_DIR", "/workspace/v121rc_exp1/A")
SAVE_DIR = getenv_str("SAVE_DIR", CONFIG_DIR)

# Eval inputs: directory to scan plus filename filters.
WORKING_DIR = getenv_str("EVAL_WORKING_DIR", "/workspace/v121rc_exp1/EVAL/HNO3")
WORKING_EVAL_SUBWORD = getenv_str("EVAL_SUBWORD", "wo_reasoning")

FORBIDDEN_SUBWORDS: List[str] = json.loads(getenv_str("FORBIDDEN_SUBWORDS_JSON", "[]"))
PARTICULAR = getenv_str("PARTICULAR", "")

BASE_PORT = getenv_int("BASE_PORT", 8002)

# Prefer explicit URL->ckpt mapping from RUNME.sh
MODELS_JSON_ENV = getenv_str("MODELS_JSON", "").strip()
if MODELS_JSON_ENV:
    MODELS: Dict[str, int] = {
        str(url): int(step) for url, step in json.loads(MODELS_JSON_ENV).items()
    }
else:
    # Fallback sequential mapping (rarely used now): the i-th checkpoint is
    # assumed to be served on BASE_PORT + i.
    checkpoints = json.loads(getenv_str("CKPTS_JSON", "[1000]"))
    MODELS = {
        f"http://localhost:{BASE_PORT + offset}/v1/chat/completions": int(step)
        for offset, step in enumerate(checkpoints)
    }

# One worker per model endpoint, capped at 16 and never below 1.
MAX_WORKERS = min(16, max(1, len(MODELS)))
53
+
54
+
55
def thought_generator_with_local_LLM_requests(
    message,
    LLM_model,
    LLM_max_new_tokens=128,
    n=1,
    API_URL="http://localhost:8000/v1/chat/completions",
    timeout_sec=600,
    stream=False,
) -> str | list[Any] | Any:
    """POST a chat-completion request to ``API_URL`` and return the generated text.

    Args:
        message: value sent as the ``messages`` field of the request.
        LLM_model: value sent as the ``model`` field.
        LLM_max_new_tokens: value sent as ``max_tokens``.
        n: number of completions requested.
        API_URL: endpoint to POST to.
        timeout_sec: timeout passed to ``requests.post``.
        stream: accepted for interface compatibility; not used by this function.

    Returns:
        The first choice's message content when ``n == 1``, otherwise a list
        with the message content of every returned choice.

    Raises:
        RuntimeError: if the HTTP status is not 200 (the body is logged first).
    """
    request_headers = {"Content-Type": "application/json", "Authorization": "Bearer 0"}
    request_body = {
        "model": LLM_model,
        "messages": message,
        "n": n,
        "max_tokens": LLM_max_new_tokens,
    }

    reply = requests.post(API_URL, json=request_body, headers=request_headers, timeout=timeout_sec)

    status = reply.status_code
    if status != 200:
        logger.error(f"LLM API error {status}: {reply.text}")
        raise RuntimeError(f"LLM API returned {status}")

    choices = reply.json()["choices"]
    if n != 1:
        return [choice["message"]["content"] for choice in choices]
    return choices[0]["message"]["content"]
87
+
88
+
89
def extract_label(response: str) -> str:
    """Extract a Yes/No label from a model response.

    Returns "Yes" when the response contains the standalone word ``Yes`` and
    not the standalone word ``No``, "No" in the opposite case, and "" when
    neither or both occur. Matching is case-sensitive.

    Words are matched on word boundaries: plain substring checks treated
    "Not", "Note", "None", "Nothing", ... as "No" (and "Yesterday" as "Yes"),
    so an answer such as "Yes. Note that ..." was scored as ambiguous.
    """
    import re  # local import keeps this block self-contained

    has_yes = re.search(r"\bYes\b", response) is not None
    has_no = re.search(r"\bNo\b", response) is not None
    if has_yes and not has_no:
        return "Yes"
    if has_no and not has_yes:
        return "No"
    return ""
97
+
98
+
99
def call_one_model(
    model_url: str,
    ckpt: int,
    msgs,
    gold_label: str,
) -> Tuple[int, Dict[str, Any]]:
    """Query one model endpoint and score its answer against ``gold_label``.

    Args:
        model_url: chat-completions URL of the service to query.
        ckpt: checkpoint step served at ``model_url``; returned unchanged so
            the caller can tell results apart.
        msgs: chat messages forwarded to the endpoint.
        gold_label: expected label for the accuracy comparison.

    Returns:
        ``(ckpt, result)`` where ``result`` has the keys ``label``, ``output``,
        ``full_output`` and ``accuracy`` (1 if the extracted label equals
        ``gold_label``, else 0).

    Never raises for request failures: they are logged and recorded as an
    empty output, which later runs treat as "not done yet".
    """
    try:
        response = thought_generator_with_local_LLM_requests(
            message=msgs,
            LLM_model="custom-model",
            LLM_max_new_tokens=128,
            n=1,
            API_URL=model_url,
            timeout_sec=300,
            stream=False,
        )
    except Exception as e:
        logger.error(f"Error getting response from model at {model_url}: {e}")
        response = ""

    # A reply whose content is not text (e.g. JSON null) used to raise inside
    # extract_label, outside the try above, and abort the whole run through
    # fut.result(). Treat it like a failed request instead.
    if not isinstance(response, str):
        logger.error(f"Non-text response from model at {model_url}: {response!r}")
        response = ""

    label = extract_label(response)
    return ckpt, {
        "label": label,
        "output": response,
        "full_output": response,
        "accuracy": 1 if label == gold_label else 0,
    }
126
+
127
+
128
def entry_uid(system: str, prompt: str, gold_label: str, gold_output: str) -> str:
    """Return a stable identifier for an eval entry.

    The four fields are serialised as compact JSON with sorted keys and
    hashed with SHA-1; the hex digest is returned.
    """
    fields = dict(system=system, prompt=prompt, gold_label=gold_label, gold_output=gold_output)
    canonical = json.dumps(fields, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha1()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
132
+
133
+
134
def load_cache(path: str) -> Dict[str, Dict[str, Any]]:
    """Load earlier results from ``path``, keyed by ``entry_uid``.

    Returns an empty dict when the file does not exist or cannot be loaded
    (the failure is logged as a warning and the caller starts fresh). When
    several entries share a uid, the last one in the file wins.
    """
    if os.path.exists(path):
        try:
            with open(path, "r") as handle:
                previous = json.load(handle)
            by_uid = {
                entry_uid(
                    item.get("system", ""),
                    item.get("prompt", ""),
                    item.get("gold_label", ""),
                    item.get("gold_output", ""),
                ): item
                for item in previous
            }
            logger.info(f"Loaded cache from {path}: {len(by_uid)} entries")
            return by_uid
        except Exception as ex:
            logger.warning(f"Failed to load cache from {path} (starting fresh): {ex}")
    return {}
149
+
150
+
151
def should_run_step(o_entry: Dict[str, Any], ckpt: int) -> bool:
    """Tell whether checkpoint ``ckpt`` still has to be evaluated for this entry.

    Returns False only when ``o_entry["step_<ckpt>"]["output"]`` is a
    non-blank string; a missing key, an empty value, a non-string output or
    a whitespace-only output all mean the step must be (re)run.
    """
    recorded = o_entry.get(f"step_{ckpt}") or {}
    text = recorded.get("output", "")
    if isinstance(text, str) and text.strip() != "":
        return False
    return True
158
+
159
+
160
def atomic_write_json(path: str, obj: Any) -> None:
    """Write ``obj`` as JSON to ``path`` without ever exposing a partial file.

    The data is written to ``path + ".tmp"``, flushed to disk, and then moved
    over ``path`` with ``os.replace``. If anything fails (for example ``obj``
    is not JSON-serialisable) the temporary file is removed, the previous
    content of ``path`` is left untouched, and the exception is re-raised.
    """
    tmp = path + ".tmp"
    try:
        with open(tmp, "w") as f:
            json.dump(obj, f, indent=2, ensure_ascii=False)
            # Make sure the bytes are on disk before the rename publishes them.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
165
+
166
+
167
def should_process_file(filename: str) -> bool:
    """Tell whether ``filename`` is an eval file selected by the current filters.

    A file is selected when it contains WORKING_EVAL_SUBWORD (if set),
    contains none of FORBIDDEN_SUBWORDS, contains PARTICULAR (if set) and
    ends with ".json".
    """
    has_subword = not WORKING_EVAL_SUBWORD or WORKING_EVAL_SUBWORD in filename
    if not has_subword:
        return False
    is_forbidden = any(sub in filename for sub in FORBIDDEN_SUBWORDS)
    if is_forbidden:
        return False
    matches_particular = not PARTICULAR or PARTICULAR in filename
    return matches_particular and filename.endswith(".json")
175
+
176
+
177
if __name__ == "__main__":
    # Entry point: for every selected eval file, query each model in MODELS
    # for every entry that has no output yet for that checkpoint, and write
    # the merged results to SAVE_DIR as "<name>_results.json".
    logger.info(f"WORKING_DIR={WORKING_DIR}")
    logger.info(f"SAVE_DIR={SAVE_DIR}")
    logger.info(f"MODELS={MODELS}")
    logger.info(f"MAX_WORKERS={MAX_WORKERS}")

    if not MODELS:
        print("No models to evaluate (MODELS is empty). Exiting.")
        raise SystemExit(0)

    os.makedirs(SAVE_DIR, exist_ok=True)

    # sorted(): os.listdir order is arbitrary; a fixed order makes runs reproducible.
    for original_eval_log_file in sorted(os.listdir(WORKING_DIR)):
        if not should_process_file(original_eval_log_file):
            continue
        print(f"Working in {original_eval_log_file}")

        original_eval_file = os.path.join(WORKING_DIR, original_eval_log_file)
        output_eval_file = os.path.join(SAVE_DIR, original_eval_log_file.replace(".json", "_results.json"))

        with open(original_eval_file, "r") as f:
            eval_data: list[dict] = json.load(f)

        cache_map = load_cache(output_eval_file)
        output_eval_data = []

        # uid of every entry, computed once; also used to find cached results
        # of entries that have not been reached yet when snapshotting.
        uids = [
            entry_uid(e["system"], e["prompt"], e["gold_label"], e["gold_output"])
            for e in eval_data
        ]

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            for idx, entry in enumerate(tqdm(eval_data)):
                system = entry["system"]
                prompt = entry["prompt"]
                gold_label = entry["gold_label"]
                gold_output = entry["gold_output"]

                uid = uids[idx]
                o_entry = cache_map.get(uid, {})
                o_entry.update({"system": system, "prompt": prompt, "gold_label": gold_label, "gold_output": gold_output})

                msgs = [{"role": "system", "content": system}, {"role": "user", "content": prompt}]

                # Only query the checkpoints that have no usable output yet.
                futures = []
                for model_url, ckpt in MODELS.items():
                    if should_run_step(o_entry, ckpt):
                        futures.append(executor.submit(call_one_model, model_url, ckpt, msgs, gold_label))

                for fut in as_completed(futures):
                    ckpt, result = fut.result()
                    o_entry[f"step_{ckpt}"] = result

                output_eval_data.append(o_entry)

                if (idx + 1) % 50 == 0:
                    # The snapshot replaces the results file. Writing only the
                    # processed prefix would drop cached results of entries
                    # not reached yet, so an interrupted run lost earlier
                    # work. Carry those cached entries along.
                    pending_cached = [cache_map[u] for u in uids[idx + 1:] if u in cache_map]
                    atomic_write_json(output_eval_file, output_eval_data + pending_cached)

        atomic_write_json(output_eval_file, output_eval_data)

    print("Evaluation with checkpoints completed.")
A/.ipynb_checkpoints/trainer_log-checkpoint.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/10k_port8006_gpu0_20251224_032906_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/10k_port8006_gpu0_20251229_035755_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/10k_port8006_gpu0_20251229_035755_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 4828
A/logs/A/1k_port8002_gpu0_20251229_060558_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/1k_port8002_gpu0_20251229_060558_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 263
A/logs/A/2k_port8003_gpu0_20251229_035755_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/2k_port8003_gpu0_20251229_035755_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 738
A/logs/A/2k_port8003_gpu0_20251229_060558_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/3k_port8004_gpu0_20251224_032906_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/3k_port8004_gpu0_20251229_035755_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/3k_port8004_gpu0_20251229_035755_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1123
A/logs/A/3k_port8004_gpu0_20251229_060558_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/4k_port8005_gpu0_20251229_035755_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/5k_port8006_gpu0_20251224_032906_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/5k_port8006_gpu0_20251229_035755_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/5k_port8006_gpu0_20251229_035755_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1994
A/logs/A/6k_port8002_gpu0_20251224_032906_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/6k_port8002_gpu0_20251229_035755_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/6k_port8002_gpu0_20251229_035755_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 3293
A/logs/A/7k_port8003_gpu0_20251224_032906_batch2.log ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,923 >> loading file tokenizer.json
2
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,923 >> loading file tokenizer.model
3
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,923 >> loading file added_tokens.json
4
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,924 >> loading file special_tokens_map.json
5
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,924 >> loading file tokenizer_config.json
6
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:42,924 >> loading file chat_template.jinja
7
+ [INFO|tokenization_utils_base.py:2364] 2025-12-24 03:36:44,109 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
8
+ [INFO|configuration_utils.py:763] 2025-12-24 03:36:44,121 >> loading configuration file /workspace/meta-llama/Llama-3.1-8B-Instruct/config.json
9
+ [INFO|configuration_utils.py:839] 2025-12-24 03:36:44,130 >> Model config LlamaConfig {
10
+ "architectures": [
11
+ "LlamaForCausalLM"
12
+ ],
13
+ "attention_bias": false,
14
+ "attention_dropout": 0.0,
15
+ "bos_token_id": 128000,
16
+ "dtype": "bfloat16",
17
+ "eos_token_id": [
18
+ 128001,
19
+ 128008,
20
+ 128009
21
+ ],
22
+ "head_dim": 128,
23
+ "hidden_act": "silu",
24
+ "hidden_size": 4096,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 14336,
27
+ "max_position_embeddings": 131072,
28
+ "mlp_bias": false,
29
+ "model_type": "llama",
30
+ "num_attention_heads": 32,
31
+ "num_hidden_layers": 32,
32
+ "num_key_value_heads": 8,
33
+ "pretraining_tp": 1,
34
+ "rms_norm_eps": 1e-05,
35
+ "rope_scaling": {
36
+ "factor": 8.0,
37
+ "high_freq_factor": 4.0,
38
+ "low_freq_factor": 1.0,
39
+ "original_max_position_embeddings": 8192,
40
+ "rope_type": "llama3"
41
+ },
42
+ "rope_theta": 500000.0,
43
+ "tie_word_embeddings": false,
44
+ "transformers_version": "4.57.1",
45
+ "use_cache": true,
46
+ "vocab_size": 128256
47
+ }
48
+
49
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,144 >> loading file tokenizer.json
50
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,144 >> loading file tokenizer.model
51
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,144 >> loading file added_tokens.json
52
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,144 >> loading file special_tokens_map.json
53
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,144 >> loading file tokenizer_config.json
54
+ [INFO|tokenization_utils_base.py:2093] 2025-12-24 03:36:44,145 >> loading file chat_template.jinja
55
+ [INFO|tokenization_utils_base.py:2364] 2025-12-24 03:36:45,274 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
56
+ [INFO|2025-12-24 03:36:45] llamafactory.data.template:143 >> Add pad token: <|eot_id|>
57
+ [INFO|2025-12-24 03:36:45] llamafactory.data.template:143 >> Add <|eom_id|> to stop words.
58
+ [INFO|configuration_utils.py:763] 2025-12-24 03:36:45,337 >> loading configuration file /workspace/meta-llama/Llama-3.1-8B-Instruct/config.json
59
+ [INFO|configuration_utils.py:839] 2025-12-24 03:36:45,340 >> Model config LlamaConfig {
60
+ "architectures": [
61
+ "LlamaForCausalLM"
62
+ ],
63
+ "attention_bias": false,
64
+ "attention_dropout": 0.0,
65
+ "bos_token_id": 128000,
66
+ "dtype": "bfloat16",
67
+ "eos_token_id": [
68
+ 128001,
69
+ 128008,
70
+ 128009
71
+ ],
72
+ "head_dim": 128,
73
+ "hidden_act": "silu",
74
+ "hidden_size": 4096,
75
+ "initializer_range": 0.02,
76
+ "intermediate_size": 14336,
77
+ "max_position_embeddings": 131072,
78
+ "mlp_bias": false,
79
+ "model_type": "llama",
80
+ "num_attention_heads": 32,
81
+ "num_hidden_layers": 32,
82
+ "num_key_value_heads": 8,
83
+ "pretraining_tp": 1,
84
+ "rms_norm_eps": 1e-05,
85
+ "rope_scaling": {
86
+ "factor": 8.0,
87
+ "high_freq_factor": 4.0,
88
+ "low_freq_factor": 1.0,
89
+ "original_max_position_embeddings": 8192,
90
+ "rope_type": "llama3"
91
+ },
92
+ "rope_theta": 500000.0,
93
+ "tie_word_embeddings": false,
94
+ "transformers_version": "4.57.1",
95
+ "use_cache": true,
96
+ "vocab_size": 128256
97
+ }
98
+
99
+ [WARNING|logging.py:328] 2025-12-24 03:36:45,340 >> `torch_dtype` is deprecated! Use `dtype` instead!
100
+ [INFO|2025-12-24 03:36:45] llamafactory.model.model_utils.kv_cache:143 >> KV cache is enabled for faster generation.
101
+ [WARNING|logging.py:328] 2025-12-24 03:36:45,845 >> `torch_dtype` is deprecated! Use `dtype` instead!
102
+ [INFO|modeling_utils.py:1169] 2025-12-24 03:36:45,849 >> loading weights file /workspace/meta-llama/Llama-3.1-8B-Instruct/model.safetensors.index.json
103
+ [INFO|modeling_utils.py:2341] 2025-12-24 03:36:45,853 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
104
+ [INFO|configuration_utils.py:986] 2025-12-24 03:36:45,857 >> Generate config GenerationConfig {
105
+ "bos_token_id": 128000,
106
+ "eos_token_id": [
107
+ 128001,
108
+ 128008,
109
+ 128009
110
+ ]
111
+ }
112
+
113
+
114
+ [INFO|configuration_utils.py:939] 2025-12-24 03:36:52,687 >> loading configuration file /workspace/meta-llama/Llama-3.1-8B-Instruct/generation_config.json
115
+ [INFO|configuration_utils.py:986] 2025-12-24 03:36:52,689 >> Generate config GenerationConfig {
116
+ "bos_token_id": 128000,
117
+ "eos_token_id": [
118
+ 128001,
119
+ 128008,
120
+ 128009
121
+ ]
122
+ }
123
+
124
+ [INFO|dynamic_module_utils.py:423] 2025-12-24 03:36:52,691 >> Could not locate the custom_generate/generate.py inside /workspace/meta-llama/Llama-3.1-8B-Instruct.
125
+ [INFO|2025-12-24 03:36:52] llamafactory.model.model_utils.attention:143 >> Using torch SDPA for faster training and inference.
126
+ [INFO|2025-12-24 03:37:19] llamafactory.model.adapter:143 >> Merged 1 adapter(s).
127
+ [INFO|2025-12-24 03:37:19] llamafactory.model.adapter:143 >> Loaded adapter(s): /workspace/v121rc_exp1/A/checkpoint-7000
128
+ [INFO|2025-12-24 03:37:19] llamafactory.model.loader:143 >> all params: 8,030,261,248
129
+ Visit http://localhost:8003/docs for API document.
130
+ INFO: Started server process [6730]
131
+ INFO: Waiting for application startup.
132
+ INFO: Application startup complete.
133
+ ERROR: [Errno 98] error while attempting to bind on address ('0.0.0.0', 8003): address already in use
134
+ INFO: Waiting for application shutdown.
135
+ INFO: Application shutdown complete.
A/logs/A/7k_port8003_gpu0_20251229_035755_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/7k_port8003_gpu0_20251229_035755_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 3677
A/logs/A/8k_port8004_gpu0_20251224_032906_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/8k_port8004_gpu0_20251229_035755_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/8k_port8004_gpu0_20251229_035755_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 4060
A/logs/A/9k_port8005_gpu0_20251224_032906_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/9k_port8005_gpu0_20251224_032906_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 7204
A/logs/A/9k_port8005_gpu0_20251229_035755_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
A/logs/A/9k_port8005_gpu0_20251229_035755_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 4443
C/logs/C/1k_port8002_gpu0_20251223_091224_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/1k_port8002_gpu0_20251223_091224_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 2672
C/logs/C/1k_port8002_gpu0_20251223_141442_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/1k_port8002_gpu0_20251223_141442_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 524
C/logs/C/2k_port8003_gpu0_20251223_091224_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 3287
C/logs/C/2k_port8003_gpu0_20251223_141442_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/2k_port8003_gpu0_20251223_141442_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1362
C/logs/C/3k_port8004_gpu0_20251223_091224_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/3k_port8004_gpu0_20251223_091224_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 3662
C/logs/C/3k_port8004_gpu0_20251223_141442_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/3k_port8004_gpu0_20251223_141442_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1747
C/logs/C/4k_port8005_gpu0_20251223_091224_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/4k_port8005_gpu0_20251223_091224_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 4013
C/logs/C/5k_port8006_gpu0_20251223_091224_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/5k_port8006_gpu0_20251223_091224_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 4367
C/logs/C/6k_port8002_gpu0_20251223_141442_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/7k_port8003_gpu0_20251223_141442_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
H.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bf16: true
2
+ cutoff_len: 128
3
+ # dataset: HNO1_train_wo_reasoning
4
+ dataset: HNO1_train
5
+ # dataset: HNO1_train_fake_reasoning
6
+ # eval_dataset:
7
+ dataset_dir: /workspace/LLaMA-Factory/data
8
+ ddp_timeout: 180000000
9
+ # deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
10
+ do_train: true
11
+ do_eval: false
12
+ enable_thinking: false
13
+ # eval_steps: 100
14
+ # eval_strategy: steps
15
+
16
+ finetuning_type: lora
17
+ lora_alpha: 16
18
+ lora_rank: 8
19
+ lora_dropout: 0.05
20
+ lora_target: all
21
+
22
+
23
+ flash_attn: auto
24
+ gradient_accumulation_steps: 1
25
+ include_num_input_tokens_seen: true
26
+ learning_rate: 5e-5
27
+ logging_steps: 1
28
+ lr_scheduler_type: constant_with_warmup
29
+ max_grad_norm: 2
30
+ max_samples: 100000000
31
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
32
+ num_train_epochs: 100000000
33
+ optim: adamw_torch
34
+ output_dir: /workspace/v121rc_exp1/H
35
+ packing: false
36
+ # per_device_eval_batch_size: 64
37
+ per_device_train_batch_size: 64
38
+ plot_loss: true
39
+ preprocessing_num_workers: 16
40
+ report_to: wandb
41
+ save_steps: 1000
42
+ stage: sft
43
+ template: llama3
44
+ trust_remote_code: true
45
+ #val_size: 0.5
46
+ warmup_steps: 10
47
+ resize_vocab: true
48
+ weight_decay: 1
49
+ adam_beta1: 0.9
50
+ adam_beta2: 0.98
51
+ # eval_on_each_dataset: true
52
+ # compute_accuracy: true
53
+ # accuracy_at_last_token: true
54
+ # accuracy_with_generate: true
55
+
56
+ # predict_with_generate: true
57
+ # do_sample: false
58
+ # temperature: 0.0
59
+ # top_p: 1.0
60
+ # max_new_tokens: 1024
61
+ # group_by_length: false
62
+
63
+ # add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>