Linksome committed on
Commit
facab9d
·
verified ·
1 Parent(s): 97075a6

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. F/10k.yaml +6 -0
  2. F/1k.yaml +6 -0
  3. F/2k.yaml +6 -0
  4. F/3k.yaml +6 -0
  5. F/7k.yaml +6 -0
  6. F/9k.yaml +6 -0
  7. F/PandaEval12_1_results/.ipynb_checkpoints/HNO2_eval_fake_reasoning_P4_results-checkpoint.json +0 -0
  8. F/PandaEval12_1_results/HNO2_eval_fake_reasoning_A2_results.json +0 -0
  9. F/PandaEval12_1_results/HNO2_eval_fake_reasoning_P4_results.json +0 -0
  10. F/PandaEval12_1_results/HNO2_eval_fake_reasoning_R2_results.json +0 -0
  11. F/PandaEval12_1_results/HNO2_eval_fake_reasoning_R3_results.json +0 -0
  12. F/PandaEval12_1_results/HNO2_eval_fake_reasoning_results.json +0 -0
  13. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A1_results.json +0 -0
  14. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A2_results.json +0 -0
  15. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A3_results.json +0 -0
  16. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A4_results.json +0 -0
  17. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P2_results.json +0 -0
  18. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P3_results.json +0 -0
  19. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P4_results.json +0 -0
  20. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P5_results.json +0 -0
  21. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R1_results.json +0 -0
  22. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R2_results.json +0 -0
  23. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R3_results.json +0 -0
  24. F/PandaEval12_2_results/HNO2_eval_fake_reasoning_results.json +0 -0
  25. F/RUNME.sh +386 -0
  26. F/logs/F/10k_port8006_gpu0_20251229_060656_batch2.log.pid +1 -0
  27. F/logs/F/6k_port8002_gpu0_20251229_060656_batch2.log.pid +1 -0
  28. F/logs/F/7k_port8003_gpu0_20251229_060656_batch2.log.pid +1 -0
  29. F/logs/F/8k_port8004_gpu0_20251229_060656_batch2.log.pid +1 -0
  30. F/logs/F/9k_port8005_gpu0_20251229_060656_batch2.log.pid +1 -0
  31. F/runF.py +232 -0
  32. F/trainer_log.jsonl +0 -0
  33. G/checkpoint-10000/adapter_config.json +45 -0
  34. G/checkpoint-10000/trainer_state.json +0 -0
  35. G/logs/G/.ipynb_checkpoints/1k_port8002_gpu0_20251224_032331_batch1-checkpoint.log +0 -0
  36. G/logs/G/10k_port8003_gpu0_20251229_035833_batch3.log +0 -0
  37. G/logs/G/10k_port8003_gpu0_20251229_060759_batch3.log +0 -0
  38. G/logs/G/1k_port8002_gpu0_20251229_035833_batch2.log +0 -0
  39. G/logs/G/1k_port8002_gpu0_20251229_035833_batch2.log.pid +1 -0
  40. G/logs/G/2k_port8003_gpu0_20251229_060759_batch1.log +0 -0
  41. G/logs/G/3k_port8004_gpu0_20251229_035833_batch1.log.pid +1 -0
  42. G/logs/G/3k_port8004_gpu0_20251229_060759_batch1.log +0 -0
  43. G/logs/G/5k_port8006_gpu0_20251229_035833_batch1.log +0 -0
  44. G/logs/G/5k_port8006_gpu0_20251229_035833_batch1.log.pid +1 -0
  45. G/logs/G/6k_port8003_gpu0_20251229_060759_batch2.log +0 -0
  46. G/logs/G/7k_port8004_gpu0_20251229_035833_batch2.log +0 -0
  47. G/logs/G/7k_port8004_gpu0_20251229_060759_batch2.log +0 -0
  48. G/logs/G/8k_port8005_gpu0_20251229_035833_batch2.log +0 -0
  49. G/logs/G/8k_port8005_gpu0_20251229_060759_batch2.log +0 -0
  50. G/logs/G/9k_port8006_gpu0_20251229_035833_batch2.log +0 -0
F/10k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-10000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/1k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-1000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/2k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-2000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/3k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-3000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/7k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-7000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/9k.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
2
+ adapter_name_or_path: /workspace/v121rc_exp1/F/checkpoint-9000
3
+ template: llama3
4
+ finetuning_type: lora
5
+ infer_backend: huggingface
6
+ trust_remote_code: true
F/PandaEval12_1_results/.ipynb_checkpoints/HNO2_eval_fake_reasoning_P4_results-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_1_results/HNO2_eval_fake_reasoning_A2_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_1_results/HNO2_eval_fake_reasoning_P4_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_1_results/HNO2_eval_fake_reasoning_R2_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_1_results/HNO2_eval_fake_reasoning_R3_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_1_results/HNO2_eval_fake_reasoning_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A1_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A2_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A3_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_A4_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P2_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P3_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P4_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_P5_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R1_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R2_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_R3_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/PandaEval12_2_results/HNO2_eval_fake_reasoning_results.json ADDED
The diff for this file is too large to render. See raw diff
 
F/RUNME.sh ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # -----------------------------
5
+ # User config
6
+ # -----------------------------
7
+ config="F"
8
+ CONFIG_DIR="/workspace/v121rc_exp1/${config}"
9
+
10
+ # YAML generation defaults
11
+ MODEL_NAME_OR_PATH="/workspace/meta-llama/Llama-3.1-8B-Instruct"
12
+ TEMPLATE="llama3"
13
+ FINETUNING_TYPE="lora"
14
+ INFER_BACKEND="huggingface"
15
+ TRUST_REMOTE_CODE="true"
16
+
17
+ # Launch config
18
+ BASE_PORT=8002
19
+ SLEEP_BETWEEN_LAUNCHES_SEC=10
20
+ VRAM_THRESHOLD_PCT=80 # if GPU >= threshold after launch, try next GPU for next ckpt
21
+ BATCH_MIN_MODELS=1 # start eval once at least this many services are up
22
+
23
+ # Eval config (passed to python)
24
+ PYTHON_EVAL="/workspace/v121rc_exp1/F/runF.py"
25
+ EVAL_WORKING_DIR="/workspace/v121rc_exp1/PandaEval12_2/HNO2"
26
+ EVAL_SUBWORD="fake_reasoning"
27
+ FORBIDDEN_SUBWORDS_JSON="[]"
28
+ PARTICULAR=""
29
+ SAVE_DIR="${CONFIG_DIR}"
30
+
31
+ # Always stop services between batches to free VRAM
32
+ STOP_SERVICES_BETWEEN_BATCHES="true"
33
+
34
+ # -----------------------------
35
+ # Setup logging
36
+ # -----------------------------
37
+ LOG_ROOT="${CONFIG_DIR}/logs"
38
+ mkdir -p "${LOG_ROOT}/${config}"
39
+ timestamp=$(date +"%Y%m%d_%H%M%S")
40
+
41
+ # -----------------------------
42
+ # Helpers
43
+ # -----------------------------
44
# Abort the script with an error on stderr when the executable named by $1
# cannot be resolved by `command -v`.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || { echo "ERROR: missing command: $1" >&2; exit 1; }
}
require_cmd nvidia-smi
require_cmd python
require_cmd curl
require_cmd sort
require_cmd awk
# launch_service starts this CLI in the background; checking it up front fails
# fast instead of waiting for an endpoint that can never come up.
require_cmd llamafactory-cli
52
+
53
# Print the number of GPUs, i.e. the number of lines emitted by `nvidia-smi -L`.
num_gpus() {
  nvidia-smi -L | awk 'END { print NR }'
}
56
+
57
# Print the used-memory percentage (truncated to an integer) of GPU index $1.
# A reported total of 0 is printed as 100 so the GPU is treated as full.
gpu_mem_pct() {
  local device_index="$1"
  nvidia-smi -i "${device_index}" \
      --query-gpu=memory.used,memory.total \
      --format=csv,noheader,nounits \
    | awk -F',' '{
        used = $1
        total = $2
        if (total == 0) {
          print 100
        } else {
          printf("%d\n", (used / total) * 100)
        }
      }'
}
62
+
63
# Start one `llamafactory-cli api` service in the background.
#   $1 GPU index (exported to the service as CUDA_VISIBLE_DEVICES)
#   $2 port (exported to the service as API_PORT)
#   $3 YAML config path handed to the CLI
#   $4 log file receiving the service's stdout and stderr
#   $5 file that receives the PID of the background process
launch_service () {
  local gpu_id="$1" listen_port="$2" config_yaml="$3" log_path="$4" pid_path="$5"

  printf 'Starting (GPU %s) port %s : %s\n' "${gpu_id}" "${listen_port}" "${config_yaml}"
  printf 'Log: %s\n' "${log_path}"

  API_PORT="${listen_port}" CUDA_VISIBLE_DEVICES="${gpu_id}" \
    llamafactory-cli api "${config_yaml}" > "${log_path}" 2>&1 &

  # $! is the PID of the job that was just put in the background.
  echo $! > "${pid_path}"
}
79
+
80
# Poll http://localhost:<port>/v1/models until it answers successfully.
#   $1 port of the service to probe
# Makes up to 120 probes (2 s curl timeout each) with a 2 s pause in between.
# Returns 0 as soon as a probe succeeds, 1 after the last probe fails.
wait_for_endpoint () {
  local port="$1"
  local url="http://localhost:${port}/v1/models"
  local attempt

  for attempt in {1..120}; do
    # -f makes curl fail on HTTP error statuses, so a server that only answers
    # with an error page is not reported as ready.
    if curl -fsS -m 2 "${url}" >/dev/null 2>&1; then
      echo " ready: ${url}"
      return 0
    fi
    sleep 2
  done

  echo "ERROR: Endpoint did not become ready: ${url}" >&2
  return 1
}
95
+
96
# Stop every service whose PID file is passed as an argument.
# Each live PID gets SIGTERM, is given up to 30 s to exit, and is then sent
# SIGKILL if it is still running. The PID file is removed afterwards so a later
# call cannot signal an unrelated process that reused the PID.
stop_batch_services () {
  local pf pid waited
  local grace_sec=30

  echo "Stopping batch services: $# processes"
  # Iterating "$@" directly is safe under `set -u` even with zero arguments.
  for pf in "$@"; do
    [[ -f "${pf}" ]] || continue
    pid="$(cat "${pf}" || true)"
    if [[ "${pid}" =~ ^[0-9]+$ ]] && kill -0 "${pid}" >/dev/null 2>&1; then
      kill "${pid}" || true

      # Wait for the process to actually exit; returning right after SIGTERM
      # can leave GPU memory allocated when the next batch starts.
      waited=0
      while kill -0 "${pid}" >/dev/null 2>&1 && (( waited < grace_sec )); do
        sleep 1
        waited=$((waited + 1))
      done

      if kill -0 "${pid}" >/dev/null 2>&1; then
        echo "PID ${pid} still alive ${grace_sec}s after SIGTERM; sending SIGKILL" >&2
        kill -9 "${pid}" >/dev/null 2>&1 || true
      fi

      # Reap the job if it is a child of this shell; ignore errors otherwise.
      wait "${pid}" 2>/dev/null || true
    fi
    rm -f "${pf}"
  done
}
107
+
108
+ # -----------------------------
109
+ # Discover checkpoints
110
+ # -----------------------------
111
# Print a JSON array (e.g. "[1000, 2000]") with the numeric step of every
# checkpoint-<step> directory under CONFIG_DIR, in version-sorted order.
# Exits with status 1 when no checkpoint-* entry exists at all.
discover_checkpoints_json () {
  shopt -s nullglob
  local found_dirs=( "${CONFIG_DIR}"/checkpoint-* )
  if [[ ${#found_dirs[@]} -eq 0 ]]; then
    echo "ERROR: No checkpoint-* folders found under: ${CONFIG_DIR}" >&2
    exit 1
  fi

  mapfile -t found_dirs < <(printf "%s\n" "${found_dirs[@]}" | sort -V)

  # Keep only the entries whose suffix after "checkpoint-" is purely numeric.
  local steps=()
  local dir_path dir_name suffix
  for dir_path in "${found_dirs[@]}"; do
    dir_name="$(basename "${dir_path}")"
    suffix="${dir_name#checkpoint-}"
    if [[ "${suffix}" =~ ^[0-9]+$ ]]; then
      steps+=( "${suffix}" )
    fi
  done

  # Join the steps with ", " between consecutive elements.
  local joined=""
  local idx
  for idx in "${!steps[@]}"; do
    if (( idx > 0 )); then
      joined+=", "
    fi
    joined+="${steps[$idx]}"
  done
  echo "[${joined}]"
}
139
+
140
+ # -----------------------------
141
+ # Compute which checkpoints still need launching (resume-aware)
142
+ # -----------------------------
143
# Print a JSON array with the checkpoints that still lack complete results.
#   $1 JSON array of all checkpoint steps
# Reads CONFIG_DIR, SAVE_DIR, EVAL_WORKING_DIR, EVAL_SUBWORD,
# FORBIDDEN_SUBWORDS_JSON and PARTICULAR from the environment.
# A checkpoint is complete when, for every selected eval file, the matching
# *_results.json has as many entries as the input and every entry carries a
# non-blank "output" string under "step_<ckpt>".
# Exits non-zero when the eval directory is missing or no eval file matches:
# reporting "everything is needed" in that case would make the caller relaunch
# services forever, because the eval could never produce any result.
compute_needed_checkpoints_json () {
  local all_ckpts_json="$1"

  python - "${all_ckpts_json}" <<'PY'
import os, json, sys

CONFIG_DIR = os.environ.get("CONFIG_DIR")
SAVE_DIR = os.environ.get("SAVE_DIR", CONFIG_DIR)
WORKING_DIR = os.environ.get("EVAL_WORKING_DIR")
SUBWORD = os.environ.get("EVAL_SUBWORD", "")
FORBIDDEN = json.loads(os.environ.get("FORBIDDEN_SUBWORDS_JSON", "[]"))
PARTICULAR = os.environ.get("PARTICULAR", "")

all_ckpts = json.loads(sys.argv[1])

# os.listdir(None) would silently list the current directory, so reject an
# unset or wrong EVAL_WORKING_DIR explicitly.
if not WORKING_DIR or not os.path.isdir(WORKING_DIR):
    sys.exit(f"ERROR: EVAL_WORKING_DIR is not a directory: {WORKING_DIR!r}")


def should_process(fn: str) -> bool:
    """Apply the same file-name filters the eval script uses."""
    if SUBWORD and SUBWORD not in fn:
        return False
    if any(s in fn for s in FORBIDDEN):
        return False
    if PARTICULAR and PARTICULAR not in fn:
        return False
    return fn.endswith(".json")


eval_files = sorted(fn for fn in os.listdir(WORKING_DIR) if should_process(fn))
if not eval_files:
    sys.exit(f"ERROR: no eval files in {WORKING_DIR} match the configured filters")


def load_results(eval_file: str):
    """Return the results list for eval_file, or None when it is unusable.

    Unusable means: results file missing, either file unreadable or not a
    JSON list, or the two lists differing in length.
    """
    in_path = os.path.join(WORKING_DIR, eval_file)
    out_path = os.path.join(SAVE_DIR, eval_file.replace(".json", "_results.json"))
    if not os.path.exists(out_path):
        return None
    try:
        with open(in_path, "r") as f:
            in_data = json.load(f)
        with open(out_path, "r") as f:
            out_data = json.load(f)
    except Exception:
        return None

    if not isinstance(in_data, list) or not isinstance(out_data, list):
        return None
    if len(out_data) != len(in_data):
        return None
    return out_data


# Parse every file pair once instead of once per checkpoint.
results_by_file = {ef: load_results(ef) for ef in eval_files}


def file_complete_for_ckpt(eval_file: str, ckpt: int) -> bool:
    out_data = results_by_file[eval_file]
    if out_data is None:
        return False

    key = f"step_{ckpt}"
    for e in out_data:
        # Malformed entries count as "not done" instead of crashing.
        v = (e.get(key) if isinstance(e, dict) else None) or {}
        out = v.get("output", "") if isinstance(v, dict) else ""
        if not isinstance(out, str) or out.strip() == "":
            return False
    return True


needed = [
    ckpt
    for ckpt in all_ckpts
    if not all(file_complete_for_ckpt(ef, ckpt) for ef in eval_files)
]

print(json.dumps(needed))
PY
}
211
+
212
+ # -----------------------------
213
+ # Generate YAML for one checkpoint
214
+ # -----------------------------
215
# Write the inference YAML for one checkpoint and print the YAML path.
#   $1 checkpoint step (integer)
# The file is named "<step/1000>k.yaml" for multiples of 1000 and
# "<step>.yaml" otherwise, and is created inside CONFIG_DIR. Fails when
# CONFIG_DIR/checkpoint-<step> is not a directory.
write_yaml_for_ckpt () {
  local step="$1"

  python - "${step}" <<'PY'
import os, sys

step = int(sys.argv[1])

# Read every required variable first so a missing one fails before any I/O.
(
    config_dir,
    model,
    template,
    finetuning_type,
    infer_backend,
    trust_remote_code,
) = (
    os.environ[var]
    for var in (
        "CONFIG_DIR",
        "MODEL_NAME_OR_PATH",
        "TEMPLATE",
        "FINETUNING_TYPE",
        "INFER_BACKEND",
        "TRUST_REMOTE_CODE",
    )
)

adapter_dir = os.path.join(config_dir, f"checkpoint-{step}")
if not os.path.isdir(adapter_dir):
    raise SystemExit(f"Missing checkpoint dir: {adapter_dir}")

if step % 1000 == 0:
    name = f"{step//1000}k"
else:
    name = str(step)
yaml_path = os.path.join(config_dir, f"{name}.yaml")

settings = (
    ("model_name_or_path", model),
    ("adapter_name_or_path", adapter_dir),
    ("template", template),
    ("finetuning_type", finetuning_type),
    ("infer_backend", infer_backend),
    ("trust_remote_code", trust_remote_code),
)
with open(yaml_path, "w") as f:
    f.writelines(f"{field}: {value}\n" for field, value in settings)
print(yaml_path)
PY
}
248
+
249
+ # -----------------------------
250
+ # Main (batch loop)
251
+ # -----------------------------
252
# Main batch loop.
# Each iteration: work out which checkpoints still lack results, launch as many
# services as fit under the VRAM threshold, run the eval against them, then stop
# the services. The loop ends when nothing is needed any more, or aborts when
# several consecutive batches leave the needed list unchanged.

# Exported for the eval script and the inline Python helpers.
export CONFIG_DIR
export SAVE_DIR
export EVAL_WORKING_DIR
export EVAL_SUBWORD
export FORBIDDEN_SUBWORDS_JSON
export PARTICULAR

export MODEL_NAME_OR_PATH
export TEMPLATE
export FINETUNING_TYPE
export INFER_BACKEND
export TRUST_REMOTE_CODE

ALL_CKPTS_JSON="$(discover_checkpoints_json)"
GPU_COUNT="$(num_gpus)"
echo "Detected GPUs: ${GPU_COUNT}"
echo "All checkpoints found: ${ALL_CKPTS_JSON}"

batch_idx=0

# PID files of the services started by the current batch.
batch_pidfiles=()

# Stall detection: abort after this many consecutive batches that did not
# change the list of needed checkpoints (overridable from the environment).
MAX_STALLED_BATCHES="${MAX_STALLED_BATCHES:-3}"
prev_needed_json=""
stalled_batches=0

# Stop services that are still tracked when the script exits for any reason
# (eval failure under `set -e`, endpoint failure, ...), so no model server is
# left holding GPU memory.
cleanup_services_on_exit () {
  if (( ${#batch_pidfiles[@]} > 0 )); then
    stop_batch_services "${batch_pidfiles[@]}"
  fi
}
trap cleanup_services_on_exit EXIT

while true; do
  NEEDED_CKPTS_JSON="$(compute_needed_checkpoints_json "${ALL_CKPTS_JSON}")"
  echo "Still needed checkpoints: ${NEEDED_CKPTS_JSON}"

  if [[ "${NEEDED_CKPTS_JSON}" == "[]" ]]; then
    echo "All checkpoints complete across outputs. Done."
    exit 0
  fi

  # An unchanged needed list means the previous batch completed no checkpoint.
  if [[ "${NEEDED_CKPTS_JSON}" == "${prev_needed_json}" ]]; then
    stalled_batches=$((stalled_batches + 1))
    if (( stalled_batches >= MAX_STALLED_BATCHES )); then
      echo "ERROR: ${stalled_batches} consecutive batches completed no checkpoint; aborting to avoid an endless loop." >&2
      echo "Check the service logs under ${LOG_ROOT}/${config} and the eval outputs in ${SAVE_DIR}." >&2
      exit 1
    fi
  else
    stalled_batches=0
  fi
  prev_needed_json="${NEEDED_CKPTS_JSON}"

  batch_idx=$((batch_idx + 1))
  echo "=============================="
  echo "Batch ${batch_idx}: launching what fits under VRAM threshold (${VRAM_THRESHOLD_PCT}%)"
  echo "=============================="

  # Parse needed list into bash array
  mapfile -t NEEDED_LIST < <(python - "${NEEDED_CKPTS_JSON}" <<'PY'
import json, sys
for x in json.loads(sys.argv[1]):
    print(int(x))
PY
  )

  MODELS_JSON="{"
  first=1
  launched=0

  # track launched service pidfiles to stop after batch
  batch_pidfiles=()

  port="${BASE_PORT}"
  gpu=0

  for ckpt in "${NEEDED_LIST[@]}"; do
    # Find a GPU with headroom; if none, stop launching more in this batch.
    found_gpu="false"
    for ((try=0; try<GPU_COUNT; try++)); do
      pct="$(gpu_mem_pct "${gpu}")"
      if (( pct < VRAM_THRESHOLD_PCT )); then
        found_gpu="true"
        break
      fi
      gpu=$((gpu + 1))
      if (( gpu >= GPU_COUNT )); then gpu=0; fi
    done

    if [[ "${found_gpu}" != "true" ]]; then
      echo "No GPU under ${VRAM_THRESHOLD_PCT}% VRAM. Stop launching; start eval with current batch."
      break
    fi

    yaml_path="$(write_yaml_for_ckpt "${ckpt}")"
    tag="$(basename "${yaml_path}" .yaml)"
    log_file="${LOG_ROOT}/${config}/${tag}_port${port}_gpu${gpu}_${timestamp}_batch${batch_idx}.log"
    pid_file="${log_file}.pid"

    launch_service "${gpu}" "${port}" "${yaml_path}" "${log_file}" "${pid_file}"
    batch_pidfiles+=( "${pid_file}" )

    if ! wait_for_endpoint "${port}"; then
      echo "Endpoint failed on port ${port}; stopping batch and exiting."
      # The EXIT trap stops every service recorded in batch_pidfiles.
      exit 1
    fi

    url="http://localhost:${port}/v1/chat/completions"
    if (( first == 1 )); then
      MODELS_JSON+="\"${url}\": ${ckpt}"
      first=0
    else
      MODELS_JSON+=", \"${url}\": ${ckpt}"
    fi

    launched=$((launched + 1))

    pct_after="$(gpu_mem_pct "${gpu}")"
    echo "GPU ${gpu} VRAM after launch: ${pct_after}%"
    if (( pct_after >= VRAM_THRESHOLD_PCT )); then
      gpu=$((gpu + 1))
      if (( gpu >= GPU_COUNT )); then gpu=0; fi
    fi

    port=$((port + 1))
    echo "Sleeping ${SLEEP_BETWEEN_LAUNCHES_SEC}s to avoid VRAM spikes..."
    sleep "${SLEEP_BETWEEN_LAUNCHES_SEC}"
  done

  MODELS_JSON+="}"
  echo "Launched models in batch ${batch_idx}: ${launched}"
  echo "MODELS_JSON=${MODELS_JSON}"

  if (( launched < BATCH_MIN_MODELS )); then
    echo "ERROR: Could not launch even ${BATCH_MIN_MODELS} model(s) under VRAM threshold."
    echo "Either increase VRAM_THRESHOLD_PCT, reduce model size, or free VRAM."
    exit 1
  fi

  # Run eval for this batch
  export MODELS_JSON
  export CKPTS_JSON="[]" # unused when MODELS_JSON exists, but keep it defined
  export BASE_PORT="${BASE_PORT}"

  echo "Running eval for batch ${batch_idx}: python ${PYTHON_EVAL}"
  python "${PYTHON_EVAL}"

  # Stop services to free VRAM for next batch
  if [[ "${STOP_SERVICES_BETWEEN_BATCHES}" == "true" ]]; then
    stop_batch_services "${batch_pidfiles[@]}"
    batch_pidfiles=()
    echo "Batch ${batch_idx} services stopped."
    # give GPU a moment to release memory
    sleep 5
  else
    echo "Leaving batch services running (not recommended for batch mode)."
    echo "This may prevent future batches from launching due to VRAM saturation."
    # Services were left running on purpose; keep the EXIT trap from stopping them.
    batch_pidfiles=()
  fi
done
F/logs/F/10k_port8006_gpu0_20251229_060656_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 65625
F/logs/F/6k_port8002_gpu0_20251229_060656_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 63689
F/logs/F/7k_port8003_gpu0_20251229_060656_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 64298
F/logs/F/8k_port8004_gpu0_20251229_060656_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 64781
F/logs/F/9k_port8005_gpu0_20251229_060656_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 65142
F/runF.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import hashlib
4
+ from typing import Any, Dict, Tuple, List
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from tqdm import tqdm
8
+ import requests
9
+ from loguru import logger
10
+
11
+
12
def getenv_str(key: str, default: str) -> str:
    """Return the environment variable ``key``, or ``default`` when it is unset.

    A variable that is set to the empty string is returned as ``""``.
    """
    return os.environ.get(key, default)
15
+
16
+
17
def getenv_int(key: str, default: int) -> int:
    """Return the environment variable ``key`` parsed as an integer.

    Returns ``default`` when the variable is unset or contains only
    whitespace.

    Raises:
        ValueError: if the variable holds text that ``int()`` rejects.
    """
    raw = os.environ.get(key)
    if raw is not None and raw.strip() != "":
        try:
            return int(raw)
        except ValueError:
            raise ValueError(f"Env var {key} must be int, got: {raw!r}")
    return default
25
+
26
+
27
+ # ----------------------------
28
+ # Read config from environment
29
+ # ----------------------------
30
# Directories: where the experiment lives and where result files are written.
CONFIG_DIR = getenv_str("CONFIG_DIR", "/workspace/v121rc_exp1/F")
SAVE_DIR = getenv_str("SAVE_DIR", CONFIG_DIR)

# Eval input directory and the substring an eval file name must contain.
WORKING_DIR = getenv_str("EVAL_WORKING_DIR", "/workspace/v121rc_exp1/EVAL/HNO2")
WORKING_EVAL_SUBWORD = getenv_str("EVAL_SUBWORD", "fake_reasoning")

# Further file-name filters: excluded substrings and one optional required one.
FORBIDDEN_SUBWORDS: List[str] = json.loads(getenv_str("FORBIDDEN_SUBWORDS_JSON", "[]"))
PARTICULAR = getenv_str("PARTICULAR", "")

BASE_PORT = getenv_int("BASE_PORT", 8002)

# MODELS maps a chat-completions URL to the checkpoint step served there.
# An explicit mapping in MODELS_JSON (as exported by RUNME.sh) wins.
MODELS_JSON_ENV = getenv_str("MODELS_JSON", "").strip()
if MODELS_JSON_ENV:
    MODELS: Dict[str, int] = {
        str(url): int(step) for url, step in json.loads(MODELS_JSON_ENV).items()
    }
else:
    # Fallback: the i-th checkpoint in CKPTS_JSON is assumed on BASE_PORT + i.
    checkpoints = json.loads(getenv_str("CKPTS_JSON", "[1000]"))
    MODELS = {
        f"http://localhost:{BASE_PORT + offset}/v1/chat/completions": int(step)
        for offset, step in enumerate(checkpoints)
    }

# One worker per model, at least 1 and at most 16.
MAX_WORKERS = min(16, max(1, len(MODELS)))
53
+
54
+
55
def thought_generator_with_local_LLM_requests(
    message,
    LLM_model,
    LLM_max_new_tokens=128,
    n=1,
    API_URL="http://localhost:8000/v1/chat/completions",
    timeout_sec=600,
    stream=False,
) -> str | list[Any] | Any:
    """POST one chat-completion request and return the generated text.

    Args:
        message: value sent as the ``messages`` field of the request.
        LLM_model: value sent as the ``model`` field.
        LLM_max_new_tokens: value sent as ``max_tokens``.
        n: number of completions requested.
        API_URL: endpoint the request is posted to.
        timeout_sec: timeout passed to ``requests.post``.
        stream: accepted for call compatibility; not used by this function.

    Returns:
        The content of the first choice when ``n == 1``, otherwise a list
        with the content of every returned choice.

    Raises:
        RuntimeError: if the HTTP status code is not 200.
    """
    request_body = dict(
        model=LLM_model,
        messages=message,
        n=n,
        max_tokens=LLM_max_new_tokens,
    )
    request_headers = {"Content-Type": "application/json", "Authorization": "Bearer 0"}

    r = requests.post(API_URL, json=request_body, headers=request_headers, timeout=timeout_sec)

    if r.status_code != 200:
        logger.error(f"LLM API error {r.status_code}: {r.text}")
        raise RuntimeError(f"LLM API returned {r.status_code}")

    choices = r.json()["choices"]
    if n != 1:
        return [choice["message"]["content"] for choice in choices]
    return choices[0]["message"]["content"]
87
+
88
+
89
def extract_label(response: str) -> str:
    """Extract an unambiguous ``"Yes"`` / ``"No"`` label from a model response.

    Only whole, capitalised words count. Plain substring matching would treat
    words such as "Not", "Nothing", "None" or "Now" as "No", and "Yesterday"
    as "Yes".

    Returns:
        ``"Yes"`` or ``"No"`` when exactly one of the two words occurs in
        ``response``; ``""`` when neither or both occur.
    """
    import re  # local import: the module's import block is left untouched

    found = set(re.findall(r"\b(Yes|No)\b", response))
    if len(found) == 1:
        return found.pop()
    return ""
97
+
98
+
99
def call_one_model(
    model_url: str,
    ckpt: int,
    msgs,
    gold_label: str,
) -> Tuple[int, Dict[str, Any]]:
    """Query one model endpoint and score its answer against ``gold_label``.

    Any exception raised by the request is logged and treated as an empty
    response, so one failing endpoint does not abort the evaluation.

    Returns:
        ``(ckpt, result)`` where ``result`` holds the extracted ``label``,
        the raw response under both ``output`` and ``full_output``, and
        ``accuracy`` (1 when the label equals ``gold_label``, else 0).
    """
    request_kwargs = dict(
        message=msgs,
        LLM_model="custom-model",
        LLM_max_new_tokens=128,
        n=1,
        API_URL=model_url,
        timeout_sec=300,
        stream=False,
    )
    try:
        response = thought_generator_with_local_LLM_requests(**request_kwargs)
    except Exception as e:
        logger.error(f"Error getting response from model at {model_url}: {e}")
        response = ""

    label = extract_label(response)
    is_correct = label == gold_label
    result = {
        "label": label,
        "output": response,
        "full_output": response,
        "accuracy": 1 if is_correct else 0,
    }
    return ckpt, result
126
+
127
+
128
def entry_uid(system: str, prompt: str, gold_label: str, gold_output: str) -> str:
    """Return a stable identifier for one eval entry.

    The four fields are serialised as compact JSON with sorted keys and the
    SHA-1 hex digest of the UTF-8 encoded text is returned.
    """
    fields = dict(system=system, prompt=prompt, gold_label=gold_label, gold_output=gold_output)
    canonical = json.dumps(fields, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()
132
+
133
+
134
def load_cache(path: str) -> Dict[str, Dict[str, Any]]:
    """Load a previously written results file, keyed by entry uid.

    Returns an empty dict when ``path`` does not exist or when reading or
    indexing it raises any exception (a warning is logged in that case).
    When several entries share a uid, the last one in the file wins.
    """
    if os.path.exists(path):
        try:
            with open(path, "r") as f:
                stored_entries = json.load(f)
            cache = {
                entry_uid(
                    item.get("system", ""),
                    item.get("prompt", ""),
                    item.get("gold_label", ""),
                    item.get("gold_output", ""),
                ): item
                for item in stored_entries
            }
            logger.info(f"Loaded cache from {path}: {len(cache)} entries")
            return cache
        except Exception as ex:
            logger.warning(f"Failed to load cache from {path} (starting fresh): {ex}")
    return {}
149
+
150
+
151
def should_run_step(o_entry: Dict[str, Any], ckpt: int) -> bool:
    """Tell whether checkpoint ``ckpt`` still has to be evaluated for an entry.

    Returns ``True`` when ``o_entry`` has no usable ``step_<ckpt>`` record:
    the key is absent, its value is falsy, or its ``output`` is not a string
    or is blank.
    """
    recorded = o_entry.get(f"step_{ckpt}") or {}
    text = recorded.get("output", "")
    if isinstance(text, str):
        return text.strip() == ""
    return True
158
+
159
+
160
def atomic_write_json(path: str, obj: Any) -> None:
    """Write ``obj`` as indented JSON to ``path`` via a temporary file.

    The data is first written to ``path + ".tmp"`` and then moved over
    ``path`` with ``os.replace``, so ``path`` is only replaced once the
    write has finished.
    """
    staging_path = path + ".tmp"
    handle = open(staging_path, "w")
    try:
        json.dump(obj, handle, indent=2, ensure_ascii=False)
    finally:
        handle.close()
    os.replace(staging_path, path)
165
+
166
+
167
def should_process_file(filename: str) -> bool:
    """Decide whether ``filename`` is an eval file selected by the config.

    A file qualifies when its name ends with ``.json``, contains
    ``WORKING_EVAL_SUBWORD`` and ``PARTICULAR`` (each only checked when
    non-empty), and contains none of ``FORBIDDEN_SUBWORDS``.
    """
    lacks_subword = bool(WORKING_EVAL_SUBWORD) and WORKING_EVAL_SUBWORD not in filename
    if lacks_subword:
        return False
    for banned in FORBIDDEN_SUBWORDS:
        if banned in filename:
            return False
    lacks_particular = bool(PARTICULAR) and PARTICULAR not in filename
    if lacks_particular:
        return False
    return filename.endswith(".json")
175
+
176
+
177
if __name__ == "__main__":
    # Evaluate every selected eval file against every model in MODELS and
    # write one <name>_results.json per eval file into SAVE_DIR. Results from
    # earlier runs are reused through the uid-keyed cache, so only missing
    # "step_<ckpt>" records are requested.
    logger.info(f"WORKING_DIR={WORKING_DIR}")
    logger.info(f"SAVE_DIR={SAVE_DIR}")
    logger.info(f"MODELS={MODELS}")
    logger.info(f"MAX_WORKERS={MAX_WORKERS}")

    if not MODELS:
        print("No models to evaluate (MODELS is empty). Exiting.")
        raise SystemExit(0)

    os.makedirs(SAVE_DIR, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    for original_eval_log_file in sorted(os.listdir(WORKING_DIR)):
        if not should_process_file(original_eval_log_file):
            continue
        print(f"Working in {original_eval_log_file}")

        original_eval_file = os.path.join(WORKING_DIR, original_eval_log_file)
        output_eval_file = os.path.join(SAVE_DIR, original_eval_log_file.replace(".json", "_results.json"))

        with open(original_eval_file, "r") as f:
            eval_data: list[dict] = json.load(f)

        cache_map = load_cache(output_eval_file)

        # Build the complete output list up front, seeded from the cache.
        # Every periodic save below then writes ALL entries. Saving only the
        # entries processed so far would replace a complete results file with
        # a truncated one, and an interruption at that point would lose the
        # cached results of every entry not yet reached.
        output_eval_data = []
        for entry in eval_data:
            system = entry["system"]
            prompt = entry["prompt"]
            gold_label = entry["gold_label"]
            gold_output = entry["gold_output"]

            uid = entry_uid(system, prompt, gold_label, gold_output)
            o_entry = cache_map.get(uid, {})
            o_entry.update({"system": system, "prompt": prompt, "gold_label": gold_label, "gold_output": gold_output})
            output_eval_data.append(o_entry)

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            for idx, o_entry in enumerate(tqdm(output_eval_data)):
                gold_label = o_entry["gold_label"]
                msgs = [
                    {"role": "system", "content": o_entry["system"]},
                    {"role": "user", "content": o_entry["prompt"]},
                ]

                # Only query the models whose result for this entry is missing.
                futures = [
                    executor.submit(call_one_model, model_url, ckpt, msgs, gold_label)
                    for model_url, ckpt in MODELS.items()
                    if should_run_step(o_entry, ckpt)
                ]

                for fut in as_completed(futures):
                    ckpt, result = fut.result()
                    o_entry[f"step_{ckpt}"] = result

                # Periodic checkpoint of the full list (see note above).
                if (idx + 1) % 50 == 0:
                    atomic_write_json(output_eval_file, output_eval_data)

        atomic_write_json(output_eval_file, output_eval_data)

    print("Evaluation with checkpoints completed.")
F/trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
G/checkpoint-10000/adapter_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/workspace/meta-llama/Llama-3.1-8B-Instruct",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": [
22
+ "lm_head",
23
+ "embed_tokens"
24
+ ],
25
+ "peft_type": "LORA",
26
+ "qalora_group_size": 16,
27
+ "r": 8,
28
+ "rank_pattern": {},
29
+ "revision": null,
30
+ "target_modules": [
31
+ "gate_proj",
32
+ "k_proj",
33
+ "o_proj",
34
+ "v_proj",
35
+ "q_proj",
36
+ "up_proj",
37
+ "down_proj"
38
+ ],
39
+ "target_parameters": null,
40
+ "task_type": "CAUSAL_LM",
41
+ "trainable_token_indices": null,
42
+ "use_dora": false,
43
+ "use_qalora": false,
44
+ "use_rslora": false
45
+ }
G/checkpoint-10000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/.ipynb_checkpoints/1k_port8002_gpu0_20251224_032331_batch1-checkpoint.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/10k_port8003_gpu0_20251229_035833_batch3.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/10k_port8003_gpu0_20251229_060759_batch3.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/1k_port8002_gpu0_20251229_035833_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/1k_port8002_gpu0_20251229_035833_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 3569
G/logs/G/2k_port8003_gpu0_20251229_060759_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/3k_port8004_gpu0_20251229_035833_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1331
G/logs/G/3k_port8004_gpu0_20251229_060759_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/5k_port8006_gpu0_20251229_035833_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/5k_port8006_gpu0_20251229_035833_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 2105
G/logs/G/6k_port8003_gpu0_20251229_060759_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/7k_port8004_gpu0_20251229_035833_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/7k_port8004_gpu0_20251229_060759_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/8k_port8005_gpu0_20251229_035833_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/8k_port8005_gpu0_20251229_060759_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
G/logs/G/9k_port8006_gpu0_20251229_035833_batch2.log ADDED
The diff for this file is too large to render. See raw diff