Linksome committed on
Commit
48084a0
·
verified ·
1 Parent(s): 1598960

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. .ipynb_checkpoints/A-checkpoint.yaml +63 -0
  2. .ipynb_checkpoints/D-checkpoint.yaml +63 -0
  3. .ipynb_checkpoints/G-checkpoint.yaml +63 -0
  4. .ipynb_checkpoints/H-checkpoint.yaml +63 -0
  5. .ipynb_checkpoints/I-checkpoint.yaml +63 -0
  6. B/logs/B/10k_port8006_gpu0_20251223_141414_batch2.log +0 -0
  7. B/logs/B/1k_port8002_gpu0_20251223_083422_batch1.log +0 -0
  8. B/logs/B/1k_port8002_gpu0_20251223_083422_batch1.log.pid +1 -0
  9. B/logs/B/1k_port8002_gpu0_20251223_141414_batch1.log +0 -0
  10. B/logs/B/1k_port8002_gpu0_20251223_141414_batch1.log.pid +1 -0
  11. B/logs/B/1k_port8002_gpu0_20251224_034005_batch1.log.pid +1 -0
  12. B/logs/B/1k_port8002_gpu0_20251224_034126_batch1.log +0 -0
  13. B/logs/B/2k_port8003_gpu0_20251223_083422_batch1.log.pid +1 -0
  14. B/logs/B/2k_port8003_gpu0_20251223_141414_batch1.log +0 -0
  15. B/logs/B/2k_port8003_gpu0_20251223_141414_batch1.log.pid +1 -0
  16. B/logs/B/3k_port8004_gpu0_20251223_083422_batch1.log +0 -0
  17. B/logs/B/3k_port8004_gpu0_20251223_083422_batch1.log.pid +1 -0
  18. B/logs/B/3k_port8004_gpu0_20251223_141414_batch1.log +0 -0
  19. B/logs/B/3k_port8004_gpu0_20251223_141414_batch1.log.pid +1 -0
  20. B/logs/B/3k_port8004_gpu0_20251224_034126_batch1.log +0 -0
  21. B/logs/B/4k_port8005_gpu0_20251223_083422_batch1.log.pid +1 -0
  22. B/logs/B/4k_port8005_gpu0_20251223_141414_batch1.log +0 -0
  23. B/logs/B/4k_port8005_gpu0_20251223_141414_batch1.log.pid +1 -0
  24. B/logs/B/5k_port8006_gpu0_20251223_083422_batch1.log.pid +1 -0
  25. B/logs/B/5k_port8006_gpu0_20251223_141414_batch1.log +0 -0
  26. B/logs/B/5k_port8006_gpu0_20251223_141414_batch1.log.pid +1 -0
  27. B/logs/B/6k_port8002_gpu0_20251223_141414_batch2.log +0 -0
  28. B/logs/B/6k_port8002_gpu0_20251223_141414_batch2.log.pid +1 -0
  29. B/logs/B/7k_port8003_gpu0_20251223_141414_batch2.log +0 -0
  30. B/logs/B/7k_port8003_gpu0_20251223_141414_batch2.log.pid +1 -0
  31. B/logs/B/8k_port8004_gpu0_20251223_141414_batch2.log +0 -0
  32. B/logs/B/8k_port8004_gpu0_20251223_141414_batch2.log.pid +1 -0
  33. B/logs/B/9k_port8005_gpu0_20251223_141414_batch2.log.pid +1 -0
  34. C/.ipynb_checkpoints/RUNME-checkpoint.sh +386 -0
  35. C/.ipynb_checkpoints/runC-checkpoint.py +232 -0
  36. C/.ipynb_checkpoints/trainer_log-checkpoint.jsonl +0 -0
  37. C/logs/C/10k_port8006_gpu0_20251229_035809_batch2.log +0 -0
  38. C/logs/C/10k_port8006_gpu0_20251229_035809_batch2.log.pid +1 -0
  39. C/logs/C/10k_port8006_gpu0_20251229_060615_batch2.log +0 -0
  40. C/logs/C/1k_port8002_gpu0_20251229_060615_batch1.log.pid +1 -0
  41. C/logs/C/2k_port8003_gpu0_20251229_060615_batch1.log.pid +1 -0
  42. C/logs/C/3k_port8004_gpu0_20251229_060615_batch1.log +0 -0
  43. C/logs/C/4k_port8005_gpu0_20251229_060615_batch1.log +0 -0
  44. C/logs/C/5k_port8006_gpu0_20251229_060615_batch1.log.pid +1 -0
  45. C/logs/C/7k_port8003_gpu0_20251229_035809_batch2.log +0 -0
  46. C/logs/C/7k_port8003_gpu0_20251229_035809_batch2.log.pid +1 -0
  47. C/logs/C/8k_port8004_gpu0_20251229_035809_batch2.log +0 -0
  48. C/logs/C/9k_port8005_gpu0_20251229_035809_batch2.log +0 -0
  49. C/logs/C/9k_port8005_gpu0_20251229_035809_batch2.log.pid +1 -0
  50. C/logs/C/9k_port8005_gpu0_20251229_060615_batch2.log +0 -0
.ipynb_checkpoints/A-checkpoint.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bf16: true
2
+ cutoff_len: 128
3
+ dataset: HNO3_train_wo_reasoning
4
+ # dataset: HNO3_train
5
+ # dataset: HNO3_train_fake_reasoning
6
+ # eval_dataset:
7
+ dataset_dir: /workspace/LLaMA-Factory/data
8
+ ddp_timeout: 180000000
9
+ # deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
10
+ do_train: true
11
+ do_eval: false
12
+ enable_thinking: false
13
+ # eval_steps: 100
14
+ # eval_strategy: steps
15
+
16
+ finetuning_type: lora
17
+ lora_alpha: 16
18
+ lora_rank: 8
19
+ lora_dropout: 0.05
20
+ lora_target: all
21
+
22
+
23
+ flash_attn: auto
24
+ gradient_accumulation_steps: 1
25
+ include_num_input_tokens_seen: true
26
+ learning_rate: 5e-5
27
+ logging_steps: 1
28
+ lr_scheduler_type: constant_with_warmup
29
+ max_grad_norm: 2
30
+ max_samples: 100000000
31
+ model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
32
+ num_train_epochs: 100000000
33
+ optim: adamw_torch
34
+ output_dir: /workspace/v121rc_exp1/A
35
+ packing: false
36
+ # per_device_eval_batch_size: 64
37
+ per_device_train_batch_size: 64
38
+ plot_loss: true
39
+ preprocessing_num_workers: 16
40
+ report_to: wandb
41
+ save_steps: 1000
42
+ stage: sft
43
+ template: llama3
44
+ trust_remote_code: true
45
+ #val_size: 0.5
46
+ warmup_steps: 10
47
+ resize_vocab: true
48
+ weight_decay: 1
49
+ adam_beta1: 0.9
50
+ adam_beta2: 0.98
51
+ # eval_on_each_dataset: true
52
+ # compute_accuracy: true
53
+ # accuracy_at_last_token: true
54
+ # accuracy_with_generate: true
55
+
56
+ # predict_with_generate: true
57
+ # do_sample: false
58
+ # temperature: 0.0
59
+ # top_p: 1.0
60
+ # max_new_tokens: 1024
61
+ # group_by_length: false
62
+
63
+ # add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>
.ipynb_checkpoints/D-checkpoint.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLaMA-Factory SFT config (LoRA) for run "D":
# Llama-3.1-8B-Instruct on dataset HNO2_train_wo_reasoning -> /workspace/v121rc_exp1/D.
# NOTE(review): num_train_epochs/max_samples are set astronomically high, so the
# run presumably continues until stopped manually; checkpoints land every save_steps.
bf16: true
cutoff_len: 128
dataset: HNO2_train_wo_reasoning
# dataset: HNO2_train
# dataset: HNO2_train_fake_reasoning
# eval_dataset:
dataset_dir: /workspace/LLaMA-Factory/data
ddp_timeout: 180000000
# deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
do_train: true
do_eval: false
enable_thinking: false
# eval_steps: 100
# eval_strategy: steps

# LoRA adapter hyperparameters
finetuning_type: lora
lora_alpha: 16
lora_rank: 8
lora_dropout: 0.05
lora_target: all


flash_attn: auto
gradient_accumulation_steps: 1
include_num_input_tokens_seen: true
learning_rate: 5e-5
logging_steps: 1
lr_scheduler_type: constant_with_warmup
max_grad_norm: 2
max_samples: 100000000
model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
num_train_epochs: 100000000
optim: adamw_torch
output_dir: /workspace/v121rc_exp1/D
packing: false
# per_device_eval_batch_size: 64
per_device_train_batch_size: 64
plot_loss: true
preprocessing_num_workers: 16
report_to: wandb
save_steps: 1000
stage: sft
template: llama3
trust_remote_code: true
#val_size: 0.5
warmup_steps: 10
resize_vocab: true
weight_decay: 1
adam_beta1: 0.9
adam_beta2: 0.98
# eval_on_each_dataset: true
# compute_accuracy: true
# accuracy_at_last_token: true
# accuracy_with_generate: true

# predict_with_generate: true
# do_sample: false
# temperature: 0.0
# top_p: 1.0
# max_new_tokens: 1024
# group_by_length: false

# add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>
.ipynb_checkpoints/G-checkpoint.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLaMA-Factory SFT config (LoRA) for run "G":
# Llama-3.1-8B-Instruct on dataset HNO1_train_wo_reasoning -> /workspace/v121rc_exp1/G.
# NOTE(review): num_train_epochs/max_samples are set astronomically high, so the
# run presumably continues until stopped manually; checkpoints land every save_steps.
bf16: true
cutoff_len: 128
dataset: HNO1_train_wo_reasoning
# dataset: HNO1_train
# dataset: HNO1_train_fake_reasoning
# eval_dataset:
dataset_dir: /workspace/LLaMA-Factory/data
ddp_timeout: 180000000
# deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
do_train: true
do_eval: false
enable_thinking: false
# eval_steps: 100
# eval_strategy: steps

# LoRA adapter hyperparameters
finetuning_type: lora
lora_alpha: 16
lora_rank: 8
lora_dropout: 0.05
lora_target: all


flash_attn: auto
gradient_accumulation_steps: 1
include_num_input_tokens_seen: true
learning_rate: 5e-5
logging_steps: 1
lr_scheduler_type: constant_with_warmup
max_grad_norm: 2
max_samples: 100000000
model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
num_train_epochs: 100000000
optim: adamw_torch
output_dir: /workspace/v121rc_exp1/G
packing: false
# per_device_eval_batch_size: 64
per_device_train_batch_size: 64
plot_loss: true
preprocessing_num_workers: 16
report_to: wandb
save_steps: 1000
stage: sft
template: llama3
trust_remote_code: true
#val_size: 0.5
warmup_steps: 10
resize_vocab: true
weight_decay: 1
adam_beta1: 0.9
adam_beta2: 0.98
# eval_on_each_dataset: true
# compute_accuracy: true
# accuracy_at_last_token: true
# accuracy_with_generate: true

# predict_with_generate: true
# do_sample: false
# temperature: 0.0
# top_p: 1.0
# max_new_tokens: 1024
# group_by_length: false

# add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>
.ipynb_checkpoints/H-checkpoint.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLaMA-Factory SFT config (LoRA) for run "H":
# Llama-3.1-8B-Instruct on dataset HNO1_train (with reasoning) -> /workspace/v121rc_exp1/H.
# NOTE(review): num_train_epochs/max_samples are set astronomically high, so the
# run presumably continues until stopped manually; checkpoints land every save_steps.
bf16: true
cutoff_len: 128
# dataset: HNO1_train_wo_reasoning
dataset: HNO1_train
# dataset: HNO1_train_fake_reasoning
# eval_dataset:
dataset_dir: /workspace/LLaMA-Factory/data
ddp_timeout: 180000000
# deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
do_train: true
do_eval: false
enable_thinking: false
# eval_steps: 100
# eval_strategy: steps

# LoRA adapter hyperparameters
finetuning_type: lora
lora_alpha: 16
lora_rank: 8
lora_dropout: 0.05
lora_target: all


flash_attn: auto
gradient_accumulation_steps: 1
include_num_input_tokens_seen: true
learning_rate: 5e-5
logging_steps: 1
lr_scheduler_type: constant_with_warmup
max_grad_norm: 2
max_samples: 100000000
model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
num_train_epochs: 100000000
optim: adamw_torch
output_dir: /workspace/v121rc_exp1/H
packing: false
# per_device_eval_batch_size: 64
per_device_train_batch_size: 64
plot_loss: true
preprocessing_num_workers: 16
report_to: wandb
save_steps: 1000
stage: sft
template: llama3
trust_remote_code: true
#val_size: 0.5
warmup_steps: 10
resize_vocab: true
weight_decay: 1
adam_beta1: 0.9
adam_beta2: 0.98
# eval_on_each_dataset: true
# compute_accuracy: true
# accuracy_at_last_token: true
# accuracy_with_generate: true

# predict_with_generate: true
# do_sample: false
# temperature: 0.0
# top_p: 1.0
# max_new_tokens: 1024
# group_by_length: false

# add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>
.ipynb_checkpoints/I-checkpoint.yaml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# LLaMA-Factory SFT config (LoRA) for run "I":
# Llama-3.1-8B-Instruct on dataset HNO1_train_fake_reasoning -> /workspace/v121rc_exp1/I.
# NOTE(review): num_train_epochs/max_samples are set astronomically high, so the
# run presumably continues until stopped manually; checkpoints land every save_steps.
bf16: true
cutoff_len: 128
# dataset: HNO1_train_wo_reasoning
# dataset: HNO1_train
dataset: HNO1_train_fake_reasoning
# eval_dataset:
dataset_dir: /workspace/LLaMA-Factory/data
ddp_timeout: 180000000
# deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z3_config.json
do_train: true
do_eval: false
enable_thinking: false
# eval_steps: 100
# eval_strategy: steps

# LoRA adapter hyperparameters
finetuning_type: lora
lora_alpha: 16
lora_rank: 8
lora_dropout: 0.05
lora_target: all


flash_attn: auto
gradient_accumulation_steps: 1
include_num_input_tokens_seen: true
learning_rate: 5e-5
logging_steps: 1
lr_scheduler_type: constant_with_warmup
max_grad_norm: 2
max_samples: 100000000
model_name_or_path: /workspace/meta-llama/Llama-3.1-8B-Instruct
num_train_epochs: 100000000
optim: adamw_torch
output_dir: /workspace/v121rc_exp1/I
packing: false
# per_device_eval_batch_size: 64
per_device_train_batch_size: 64
plot_loss: true
preprocessing_num_workers: 16
report_to: wandb
save_steps: 1000
stage: sft
template: llama3
trust_remote_code: true
#val_size: 0.5
warmup_steps: 10
resize_vocab: true
weight_decay: 1
adam_beta1: 0.9
adam_beta2: 0.98
# eval_on_each_dataset: true
# compute_accuracy: true
# accuracy_at_last_token: true
# accuracy_with_generate: true

# predict_with_generate: true
# do_sample: false
# temperature: 0.0
# top_p: 1.0
# max_new_tokens: 1024
# group_by_length: false

# add_tokens: <MILLFIELD>,<Yes>,<No>,<think>,</think>
B/logs/B/10k_port8006_gpu0_20251223_141414_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/1k_port8002_gpu0_20251223_083422_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/1k_port8002_gpu0_20251223_083422_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 42535
B/logs/B/1k_port8002_gpu0_20251223_141414_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/1k_port8002_gpu0_20251223_141414_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 260
B/logs/B/1k_port8002_gpu0_20251224_034005_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 9333
B/logs/B/1k_port8002_gpu0_20251224_034126_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/2k_port8003_gpu0_20251223_083422_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 43563
B/logs/B/2k_port8003_gpu0_20251223_141414_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/2k_port8003_gpu0_20251223_141414_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1131
B/logs/B/3k_port8004_gpu0_20251223_083422_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/3k_port8004_gpu0_20251223_083422_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 44181
B/logs/B/3k_port8004_gpu0_20251223_141414_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/3k_port8004_gpu0_20251223_141414_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1530
B/logs/B/3k_port8004_gpu0_20251224_034126_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/4k_port8005_gpu0_20251223_083422_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 44677
B/logs/B/4k_port8005_gpu0_20251223_141414_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/4k_port8005_gpu0_20251223_141414_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1935
B/logs/B/5k_port8006_gpu0_20251223_083422_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 45300
B/logs/B/5k_port8006_gpu0_20251223_141414_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/5k_port8006_gpu0_20251223_141414_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 2340
B/logs/B/6k_port8002_gpu0_20251223_141414_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/6k_port8002_gpu0_20251223_141414_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 9295
B/logs/B/7k_port8003_gpu0_20251223_141414_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/7k_port8003_gpu0_20251223_141414_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 9692
B/logs/B/8k_port8004_gpu0_20251223_141414_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
B/logs/B/8k_port8004_gpu0_20251223_141414_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 10083
B/logs/B/9k_port8005_gpu0_20251223_141414_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 10454
C/.ipynb_checkpoints/RUNME-checkpoint.sh ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Batch-evaluate all LoRA checkpoints under CONFIG_DIR: launch as many
# llamafactory-cli API servers as fit under the VRAM threshold, run the python
# eval against them, stop the servers, and repeat until every checkpoint is done.
set -euo pipefail

# -----------------------------
# User config
# -----------------------------
config="C"
CONFIG_DIR="/workspace/v121rc_exp1/${config}"

# YAML generation defaults (written into the per-checkpoint inference YAMLs)
MODEL_NAME_OR_PATH="/workspace/meta-llama/Llama-3.1-8B-Instruct"
TEMPLATE="llama3"
FINETUNING_TYPE="lora"
INFER_BACKEND="huggingface"
TRUST_REMOTE_CODE="true"

# Launch config
BASE_PORT=8002
SLEEP_BETWEEN_LAUNCHES_SEC=10
VRAM_THRESHOLD_PCT=80 # if GPU >= threshold after launch, try next GPU for next ckpt
BATCH_MIN_MODELS=1 # start eval once at least this many services are up

# Eval config (passed to python via the environment)
PYTHON_EVAL="/workspace/v121rc_exp1/C/runC.py"
EVAL_WORKING_DIR="/workspace/v121rc_exp1/PandaEval12_2/HNO3"
EVAL_SUBWORD="fake_reasoning"
FORBIDDEN_SUBWORDS_JSON="[]"
PARTICULAR=""
SAVE_DIR="${CONFIG_DIR}"

# Always stop services between batches to free VRAM
STOP_SERVICES_BETWEEN_BATCHES="true"

# -----------------------------
# Setup logging
# -----------------------------
LOG_ROOT="${CONFIG_DIR}/logs"
mkdir -p "${LOG_ROOT}/${config}"
# One timestamp for the whole run; reused in every log-file name.
timestamp=$(date +"%Y%m%d_%H%M%S")
40
+
41
+ # -----------------------------
42
+ # Helpers
43
+ # -----------------------------
44
# Abort the script with a clear error if a required executable is not on PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "ERROR: missing command: $1" >&2
    exit 1
  fi
}
47
# Fail fast if any external tool used below is missing.
require_cmd nvidia-smi
require_cmd python
require_cmd curl
require_cmd sort
require_cmd awk
52
+
53
# Print the number of GPUs reported by `nvidia-smi -L` (one per line).
num_gpus() {
  nvidia-smi -L | awk 'END { print NR }'
}
56
+
57
# Print the integer percentage of VRAM in use on GPU index $1.
# Reports 100 when the total is 0 so a broken query counts as "full".
gpu_mem_pct() {
  local gpu_index="$1"
  nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits -i "${gpu_index}" |
    awk -F',' '{ if ($2 == 0) { print 100 } else { printf("%d\n", ($1 / $2) * 100) } }'
}
62
+
63
# Launch one llamafactory-cli API server in the background.
#   $1 gpu index   $2 API port   $3 yaml config   $4 log file   $5 pid file
# The server's stdout/stderr go to the log file; its PID is written to $5.
launch_service () {
  local gpu="$1" api_port="$2" yaml_path="$3" log_file="$4" pid_file="$5"

  echo "Starting (GPU ${gpu}) port ${api_port} : ${yaml_path}"
  echo "Log: ${log_file}"

  API_PORT="${api_port}" CUDA_VISIBLE_DEVICES="${gpu}" \
    llamafactory-cli api "${yaml_path}" >"${log_file}" 2>&1 &

  echo $! > "${pid_file}"
}
79
+
80
# Poll http://localhost:$1/v1/models every 2s until it responds.
# Returns 0 when ready, 1 after 120 attempts (~4 minutes) without success.
wait_for_endpoint () {
  local port="$1"
  local url="http://localhost:${port}/v1/models"
  local attempt

  for (( attempt = 1; attempt <= 120; attempt++ )); do
    if curl -sS -m 2 "${url}" >/dev/null 2>&1; then
      echo "  ready: ${url}"
      return 0
    fi
    sleep 2
  done

  echo "ERROR: Endpoint did not become ready: ${url}" >&2
  return 1
}
95
+
96
# Terminate the background services whose pidfiles are passed as arguments.
# Missing pidfiles, empty pidfiles, and already-dead PIDs are skipped quietly;
# kill failures are tolerated so cleanup never aborts the script (set -e).
stop_batch_services () {
  local pidfiles=("$@")
  # BUGFIX: pf/pid were implicitly global and leaked out of the function;
  # declare them local so cleanup does not clobber caller variables.
  local pf pid
  echo "Stopping batch services: ${#pidfiles[@]} processes"
  for pf in "${pidfiles[@]}"; do
    [[ -f "${pf}" ]] || continue
    pid="$(cat "${pf}" || true)"
    if [[ -n "${pid}" ]] && kill -0 "${pid}" >/dev/null 2>&1; then
      kill "${pid}" || true
    fi
  done
}
107
+
108
+ # -----------------------------
109
+ # Discover checkpoints
110
+ # -----------------------------
111
# Print a JSON array of the numeric steps of all checkpoint-<step> directories
# under CONFIG_DIR, in version-sorted order. Exits 1 if none are found.
discover_checkpoints_json () {
  shopt -s nullglob
  local ckpt_dirs=( "${CONFIG_DIR}"/checkpoint-* )
  if (( ${#ckpt_dirs[@]} == 0 )); then
    echo "ERROR: No checkpoint-* folders found under: ${CONFIG_DIR}" >&2
    exit 1
  fi

  # Version sort so checkpoint-2000 comes before checkpoint-10000.
  mapfile -t ckpt_dirs < <(printf "%s\n" "${ckpt_dirs[@]}" | sort -V)

  # Keep only directories whose suffix is purely numeric.
  local dir base step steps=()
  for dir in "${ckpt_dirs[@]}"; do
    base="$(basename "${dir}")"
    step="${base#checkpoint-}"
    if [[ "${step}" =~ ^[0-9]+$ ]]; then
      steps+=( "${step}" )
    fi
  done

  # Assemble "[a, b, c]" with a running separator instead of index checks.
  local json="[" sep=""
  for step in "${steps[@]}"; do
    json+="${sep}${step}"
    sep=", "
  done
  json+="]"
  echo "${json}"
}
139
+
140
+ # -----------------------------
141
+ # Compute which checkpoints still need launching (resume-aware)
142
+ # -----------------------------
143
# Given a JSON array of all checkpoint steps ($1), print the JSON array of the
# steps that are NOT yet fully evaluated. A step counts as done when every
# matching eval file in EVAL_WORKING_DIR has a *_results.json in SAVE_DIR of the
# same length whose entries all carry a non-empty "output" under "step_<ckpt>".
# Config is read from the environment exported by the main script.
compute_needed_checkpoints_json () {
  local all_ckpts_json="$1"

  python - "${all_ckpts_json}" <<'PY'
import os, json, sys

CONFIG_DIR = os.environ.get("CONFIG_DIR")
SAVE_DIR = os.environ.get("SAVE_DIR", CONFIG_DIR)
WORKING_DIR = os.environ.get("EVAL_WORKING_DIR")
SUBWORD = os.environ.get("EVAL_SUBWORD", "")
FORBIDDEN = json.loads(os.environ.get("FORBIDDEN_SUBWORDS_JSON", "[]"))
PARTICULAR = os.environ.get("PARTICULAR", "")

all_ckpts = json.loads(sys.argv[1])

def should_process(fn: str) -> bool:
    # Filename filter: must contain SUBWORD and PARTICULAR (when set),
    # contain no forbidden substring, and be a .json file.
    if SUBWORD and SUBWORD not in fn:
        return False
    if any(s in fn for s in FORBIDDEN):
        return False
    if PARTICULAR and PARTICULAR not in fn:
        return False
    return fn.endswith(".json")

eval_files = sorted([fn for fn in os.listdir(WORKING_DIR) if should_process(fn)])
if not eval_files:
    # No eval inputs match: conservatively report every checkpoint as needed.
    print(json.dumps(all_ckpts))
    raise SystemExit(0)

def file_complete_for_ckpt(eval_file: str, ckpt: int) -> bool:
    # True only when the results file exists, parses, mirrors the input
    # list 1:1, and every entry has a non-empty output for this step.
    in_path = os.path.join(WORKING_DIR, eval_file)
    out_path = os.path.join(SAVE_DIR, eval_file.replace(".json", "_results.json"))
    if not os.path.exists(out_path):
        return False
    try:
        with open(in_path, "r") as f:
            in_data = json.load(f)
        with open(out_path, "r") as f:
            out_data = json.load(f)
    except Exception:
        # Unreadable/corrupt files mean "not complete", never a crash.
        return False

    if not isinstance(in_data, list) or not isinstance(out_data, list):
        return False
    if len(out_data) != len(in_data):
        return False

    key = f"step_{ckpt}"
    for e in out_data:
        v = e.get(key) or {}
        out = v.get("output", "")
        if not isinstance(out, str) or out.strip() == "":
            return False
    return True

# A checkpoint is still needed unless it is complete for EVERY eval file.
needed = []
for ckpt in all_ckpts:
    done_everywhere = True
    for ef in eval_files:
        if not file_complete_for_ckpt(ef, ckpt):
            done_everywhere = False
            break
    if not done_everywhere:
        needed.append(ckpt)

print(json.dumps(needed))
PY
}
211
+
212
+ # -----------------------------
213
+ # Generate YAML for one checkpoint
214
+ # -----------------------------
215
# Generate the inference YAML for checkpoint step $1 and print its path.
# The file is named "<step//1000>k.yaml" for round-thousand steps (e.g. 3000 ->
# 3k.yaml), "<step>.yaml" otherwise, and points the base model at the LoRA
# adapter directory checkpoint-<step>. Exits non-zero if that dir is missing.
write_yaml_for_ckpt () {
  local step="$1"

  python - "${step}" <<'PY'
import os, sys
step = int(sys.argv[1])

# All generation defaults are exported by the main script.
CONFIG_DIR = os.environ["CONFIG_DIR"]
MODEL = os.environ["MODEL_NAME_OR_PATH"]
TEMPLATE = os.environ["TEMPLATE"]
FINETUNING_TYPE = os.environ["FINETUNING_TYPE"]
INFER_BACKEND = os.environ["INFER_BACKEND"]
TRUST_REMOTE_CODE = os.environ["TRUST_REMOTE_CODE"]

ckpt_dir = os.path.join(CONFIG_DIR, f"checkpoint-{step}")
if not os.path.isdir(ckpt_dir):
    raise SystemExit(f"Missing checkpoint dir: {ckpt_dir}")

# Short tag used for the yaml filename and downstream log names.
name = f"{step//1000}k" if step % 1000 == 0 else str(step)
yaml_path = os.path.join(CONFIG_DIR, f"{name}.yaml")

with open(yaml_path, "w") as f:
    f.write(
        f"model_name_or_path: {MODEL}\n"
        f"adapter_name_or_path: {ckpt_dir}\n"
        f"template: {TEMPLATE}\n"
        f"finetuning_type: {FINETUNING_TYPE}\n"
        f"infer_backend: {INFER_BACKEND}\n"
        f"trust_remote_code: {TRUST_REMOTE_CODE}\n"
    )
# Printed path is captured by the caller via command substitution.
print(yaml_path)
PY
}
248
+
249
+ # -----------------------------
250
+ # Main (batch loop)
251
+ # -----------------------------
252
+ export CONFIG_DIR
253
+ export SAVE_DIR
254
+ export EVAL_WORKING_DIR
255
+ export EVAL_SUBWORD
256
+ export FORBIDDEN_SUBWORDS_JSON
257
+ export PARTICULAR
258
+
259
+ export MODEL_NAME_OR_PATH
260
+ export TEMPLATE
261
+ export FINETUNING_TYPE
262
+ export INFER_BACKEND
263
+ export TRUST_REMOTE_CODE
264
+
265
+ ALL_CKPTS_JSON="$(discover_checkpoints_json)"
266
+ GPU_COUNT="$(num_gpus)"
267
+ echo "Detected GPUs: ${GPU_COUNT}"
268
+ echo "All checkpoints found: ${ALL_CKPTS_JSON}"
269
+
270
+ batch_idx=0
271
+
272
+ while true; do
273
+ NEEDED_CKPTS_JSON="$(compute_needed_checkpoints_json "${ALL_CKPTS_JSON}")"
274
+ echo "Still needed checkpoints: ${NEEDED_CKPTS_JSON}"
275
+
276
+ if [[ "${NEEDED_CKPTS_JSON}" == "[]" ]]; then
277
+ echo "All checkpoints complete across outputs. Done."
278
+ exit 0
279
+ fi
280
+
281
+ batch_idx=$((batch_idx + 1))
282
+ echo "=============================="
283
+ echo "Batch ${batch_idx}: launching what fits under VRAM threshold (${VRAM_THRESHOLD_PCT}%)"
284
+ echo "=============================="
285
+
286
+ # Parse needed list into bash array
287
+ mapfile -t NEEDED_LIST < <(python - "${NEEDED_CKPTS_JSON}" <<'PY'
288
+ import json, sys
289
+ for x in json.loads(sys.argv[1]):
290
+ print(int(x))
291
+ PY
292
+ )
293
+
294
+ MODELS_JSON="{"
295
+ first=1
296
+ launched=0
297
+
298
+ # track launched service pidfiles to stop after batch
299
+ batch_pidfiles=()
300
+
301
+ port="${BASE_PORT}"
302
+ gpu=0
303
+
304
+ for ckpt in "${NEEDED_LIST[@]}"; do
305
+ # Find a GPU with headroom; if none, stop launching more in this batch.
306
+ found_gpu="false"
307
+ for ((try=0; try<GPU_COUNT; try++)); do
308
+ pct="$(gpu_mem_pct "${gpu}")"
309
+ if (( pct < VRAM_THRESHOLD_PCT )); then
310
+ found_gpu="true"
311
+ break
312
+ fi
313
+ gpu=$((gpu + 1))
314
+ if (( gpu >= GPU_COUNT )); then gpu=0; fi
315
+ done
316
+
317
+ if [[ "${found_gpu}" != "true" ]]; then
318
+ echo "No GPU under ${VRAM_THRESHOLD_PCT}% VRAM. Stop launching; start eval with current batch."
319
+ break
320
+ fi
321
+
322
+ yaml_path="$(write_yaml_for_ckpt "${ckpt}")"
323
+ tag="$(basename "${yaml_path}" .yaml)"
324
+ log_file="${LOG_ROOT}/${config}/${tag}_port${port}_gpu${gpu}_${timestamp}_batch${batch_idx}.log"
325
+ pid_file="${log_file}.pid"
326
+
327
+ launch_service "${gpu}" "${port}" "${yaml_path}" "${log_file}" "${pid_file}"
328
+ batch_pidfiles+=( "${pid_file}" )
329
+
330
+ if ! wait_for_endpoint "${port}"; then
331
+ echo "Endpoint failed on port ${port}; stopping batch and exiting."
332
+ stop_batch_services "${batch_pidfiles[@]}"
333
+ exit 1
334
+ fi
335
+
336
+ url="http://localhost:${port}/v1/chat/completions"
337
+ if (( first == 1 )); then
338
+ MODELS_JSON+="\"${url}\": ${ckpt}"
339
+ first=0
340
+ else
341
+ MODELS_JSON+=", \"${url}\": ${ckpt}"
342
+ fi
343
+
344
+ launched=$((launched + 1))
345
+
346
+ pct_after="$(gpu_mem_pct "${gpu}")"
347
+ echo "GPU ${gpu} VRAM after launch: ${pct_after}%"
348
+ if (( pct_after >= VRAM_THRESHOLD_PCT )); then
349
+ gpu=$((gpu + 1))
350
+ if (( gpu >= GPU_COUNT )); then gpu=0; fi
351
+ fi
352
+
353
+ port=$((port + 1))
354
+ echo "Sleeping ${SLEEP_BETWEEN_LAUNCHES_SEC}s to avoid VRAM spikes..."
355
+ sleep "${SLEEP_BETWEEN_LAUNCHES_SEC}"
356
+ done
357
+
358
+ MODELS_JSON+="}"
359
+ echo "Launched models in batch ${batch_idx}: ${launched}"
360
+ echo "MODELS_JSON=${MODELS_JSON}"
361
+
362
+ if (( launched < BATCH_MIN_MODELS )); then
363
+ echo "ERROR: Could not launch even ${BATCH_MIN_MODELS} model(s) under VRAM threshold."
364
+ echo "Either increase VRAM_THRESHOLD_PCT, reduce model size, or free VRAM."
365
+ exit 1
366
+ fi
367
+
368
+ # Run eval for this batch
369
+ export MODELS_JSON
370
+ export CKPTS_JSON="[]" # unused when MODELS_JSON exists, but keep it defined
371
+ export BASE_PORT="${BASE_PORT}"
372
+
373
+ echo "Running eval for batch ${batch_idx}: python ${PYTHON_EVAL}"
374
+ python "${PYTHON_EVAL}"
375
+
376
+ # Stop services to free VRAM for next batch
377
+ if [[ "${STOP_SERVICES_BETWEEN_BATCHES}" == "true" ]]; then
378
+ stop_batch_services "${batch_pidfiles[@]}"
379
+ echo "Batch ${batch_idx} services stopped."
380
+ # give GPU a moment to release memory
381
+ sleep 5
382
+ else
383
+ echo "Leaving batch services running (not recommended for batch mode)."
384
+ echo "This may prevent future batches from launching due to VRAM saturation."
385
+ fi
386
+ done
C/.ipynb_checkpoints/runC-checkpoint.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import hashlib
4
+ from typing import Any, Dict, Tuple, List
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from tqdm import tqdm
8
+ import requests
9
+ from loguru import logger
10
+
11
+
12
def getenv_str(key: str, default: str) -> str:
    """Return env var *key*, or *default* only when the variable is unset.

    An empty-but-set variable is returned as "" (not replaced by the default).
    """
    value = os.environ.get(key)
    if value is None:
        return default
    return value
15
+
16
+
17
def getenv_int(key: str, default: int) -> int:
    """Return env var *key* parsed as an int.

    Falls back to *default* when the variable is unset or blank; raises
    ValueError when it is set but not a valid integer literal.
    """
    raw = os.environ.get(key)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(f"Env var {key} must be int, got: {raw!r}")
25
+
26
+
27
# ----------------------------
# Read config from environment
# ----------------------------
# NOTE: these run at import time; RUNME.sh exports all of them before invoking
# this script, the literals below are only local fallbacks.
CONFIG_DIR = getenv_str("CONFIG_DIR", "/workspace/v121rc_exp1/C")
SAVE_DIR = getenv_str("SAVE_DIR", CONFIG_DIR)

WORKING_DIR = getenv_str("EVAL_WORKING_DIR", "/workspace/v121rc_exp1/EVAL/HNO3")
WORKING_EVAL_SUBWORD = getenv_str("EVAL_SUBWORD", "fake_reasoning")

FORBIDDEN_SUBWORDS: List[str] = json.loads(getenv_str("FORBIDDEN_SUBWORDS_JSON", "[]"))
PARTICULAR = getenv_str("PARTICULAR", "")

BASE_PORT = getenv_int("BASE_PORT", 8002)

# Prefer explicit URL->ckpt mapping from RUNME.sh
MODELS_JSON_ENV = getenv_str("MODELS_JSON", "").strip()
if MODELS_JSON_ENV:
    # Normalize: JSON keys are already strings, but values may arrive as str.
    MODELS: Dict[str, int] = json.loads(MODELS_JSON_ENV)
    MODELS = {str(k): int(v) for k, v in MODELS.items()}
else:
    # Fallback sequential mapping (rarely used now)
    checkpoints = json.loads(getenv_str("CKPTS_JSON", "[1000]"))
    MODELS = {f"http://localhost:{BASE_PORT + i}/v1/chat/completions": int(checkpoints[i])
              for i in range(len(checkpoints))}

# One worker per model endpoint, capped at 16.
MAX_WORKERS = min(16, max(1, len(MODELS)))
53
+
54
+
55
def thought_generator_with_local_LLM_requests(
    message,
    LLM_model,
    LLM_max_new_tokens=128,
    n=1,
    API_URL="http://localhost:8000/v1/chat/completions",
    timeout_sec=600,
    stream=False,
) -> str | list[Any] | Any:
    """POST a chat-completion request to a local OpenAI-compatible endpoint.

    Returns the single completion string when n == 1, otherwise the list of
    completion strings. Raises RuntimeError on any non-200 response.
    NOTE(review): `stream` is accepted for interface compatibility but unused.
    """
    body = {
        "model": LLM_model,
        "messages": message,
        "n": n,
        "max_tokens": LLM_max_new_tokens,
    }
    headers = {"Content-Type": "application/json", "Authorization": "Bearer 0"}

    resp = requests.post(API_URL, json=body, headers=headers, timeout=timeout_sec)

    if resp.status_code != 200:
        logger.error(f"LLM API error {resp.status_code}: {resp.text}")
        raise RuntimeError(f"LLM API returned {resp.status_code}")

    choices = resp.json()["choices"]
    if n == 1:
        return choices[0]["message"]["content"]
    return [c["message"]["content"] for c in choices]
87
+
88
+
89
def extract_label(response: str) -> str:
    """Return "Yes" or "No" when exactly one of them occurs in *response*.

    Case-sensitive substring match; returns "" when neither or both appear.
    """
    hits = [label for label in ("Yes", "No") if label in response]
    return hits[0] if len(hits) == 1 else ""
97
+
98
+
99
def call_one_model(
    model_url: str,
    ckpt: int,
    msgs,
    gold_label: str,
) -> Tuple[int, Dict[str, Any]]:
    """Query one served checkpoint and grade its Yes/No answer.

    Returns (ckpt, result_dict) where result_dict holds the extracted
    label, the raw model output, and a 0/1 accuracy vs. *gold_label*.
    Any request failure is logged and scored as an empty (wrong) answer.
    """
    try:
        raw_output = thought_generator_with_local_LLM_requests(
            message=msgs,
            LLM_model="custom-model",
            LLM_max_new_tokens=128,
            n=1,
            API_URL=model_url,
            timeout_sec=300,
            stream=False,
        )
    except Exception as e:
        # Best-effort: a dead or slow server must not abort the whole eval.
        logger.error(f"Error getting response from model at {model_url}: {e}")
        raw_output = ""

    predicted = extract_label(raw_output)
    result = {
        "label": predicted,
        "output": raw_output,
        "full_output": raw_output,
        "accuracy": 1 if predicted == gold_label else 0,
    }
    return ckpt, result
126
+
127
+
128
def entry_uid(system: str, prompt: str, gold_label: str, gold_output: str) -> str:
    """Stable SHA-1 identifier for one eval entry.

    Canonicalizes the four fields as compact, key-sorted JSON so the same
    entry always hashes to the same 40-char hex digest across runs.
    """
    canonical = json.dumps(
        {"system": system, "prompt": prompt, "gold_label": gold_label, "gold_output": gold_output},
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha1(canonical.encode("utf-8")).hexdigest()
132
+
133
+
134
def load_cache(path: str) -> Dict[str, Dict[str, Any]]:
    """Index a previous results file by entry UID for resumable runs.

    Returns {} when the file does not exist or cannot be parsed; a bad
    cache only forces recomputation, never a crash (deliberate best-effort).
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r") as fh:
            entries = json.load(fh)
        # Later duplicates overwrite earlier ones, same as a plain loop would.
        by_uid = {
            entry_uid(
                e.get("system", ""),
                e.get("prompt", ""),
                e.get("gold_label", ""),
                e.get("gold_output", ""),
            ): e
            for e in entries
        }
        logger.info(f"Loaded cache from {path}: {len(by_uid)} entries")
        return by_uid
    except Exception as ex:
        logger.warning(f"Failed to load cache from {path} (starting fresh): {ex}")
        return {}
149
+
150
+
151
def should_run_step(o_entry: Dict[str, Any], ckpt: int) -> bool:
    """True when checkpoint *ckpt* still needs to be queried for this entry.

    A step is done only if the cached record exists and carries a
    non-empty string under "output"; anything else means re-run.
    """
    key = f"step_{ckpt}"
    try:
        cached = o_entry[key] or {}
    except KeyError:
        return True
    out = cached.get("output", "")
    return not (isinstance(out, str) and out.strip())
158
+
159
+
160
def atomic_write_json(path: str, obj: Any) -> None:
    """Write *obj* as pretty-printed JSON via temp-file + rename.

    os.replace is atomic on POSIX, so concurrent readers of *path* never
    observe a partially written file.
    """
    scratch = f"{path}.tmp"
    with open(scratch, "w") as fh:
        json.dump(obj, fh, indent=2, ensure_ascii=False)
    os.replace(scratch, path)
165
+
166
+
167
def should_process_file(filename: str) -> bool:
    """Apply the env-configured include/exclude filters to one filename.

    Keeps only .json files that contain WORKING_EVAL_SUBWORD (if set) and
    PARTICULAR (if set), and contain none of FORBIDDEN_SUBWORDS.
    """
    if not filename.endswith(".json"):
        return False
    if WORKING_EVAL_SUBWORD and WORKING_EVAL_SUBWORD not in filename:
        return False
    if PARTICULAR and PARTICULAR not in filename:
        return False
    return not any(bad in filename for bad in FORBIDDEN_SUBWORDS)
175
+
176
+
177
if __name__ == "__main__":
    # Log the effective configuration before doing any work.
    logger.info(f"WORKING_DIR={WORKING_DIR}")
    logger.info(f"SAVE_DIR={SAVE_DIR}")
    logger.info(f"MODELS={MODELS}")
    logger.info(f"MAX_WORKERS={MAX_WORKERS}")

    if not MODELS:
        print("No models to evaluate (MODELS is empty). Exiting.")
        raise SystemExit(0)

    os.makedirs(SAVE_DIR, exist_ok=True)

    # One pass per eval input file that survives the env-configured filters.
    for original_eval_log_file in os.listdir(WORKING_DIR):
        if not should_process_file(original_eval_log_file):
            continue
        print(f"Working in {original_eval_log_file}")

        original_eval_file = os.path.join(WORKING_DIR, original_eval_log_file)
        # Output sits in SAVE_DIR with a "_results" suffix on the same name.
        output_eval_file = os.path.join(SAVE_DIR, original_eval_log_file.replace(".json", "_results.json"))

        with open(original_eval_file, "r") as f:
            # Expected schema per entry: system / prompt / gold_label /
            # gold_output keys (indexed directly below).
            eval_data: list[dict] = json.load(f)

        # Previous results (if any) keyed by entry UID — enables resume.
        cache_map = load_cache(output_eval_file)
        output_eval_data = []

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            for idx, entry in enumerate(tqdm(eval_data)):
                system = entry["system"]
                prompt = entry["prompt"]
                gold_label = entry["gold_label"]
                gold_output = entry["gold_output"]

                # Start from the cached record so completed steps are kept,
                # then refresh the identifying fields.
                uid = entry_uid(system, prompt, gold_label, gold_output)
                o_entry = cache_map.get(uid, {})
                o_entry.update({"system": system, "prompt": prompt, "gold_label": gold_label, "gold_output": gold_output})

                msgs = [{"role": "system", "content": system}, {"role": "user", "content": prompt}]

                # Fan out one request per model, skipping steps whose cached
                # output is already a non-empty string.
                futures = []
                for model_url, ckpt in MODELS.items():
                    if should_run_step(o_entry, ckpt):
                        futures.append(executor.submit(call_one_model, model_url, ckpt, msgs, gold_label))

                # Collect in completion order; each result lands under its
                # own step_<ckpt> key, so ordering does not matter.
                for fut in as_completed(futures):
                    ckpt, result = fut.result()
                    o_entry[f"step_{ckpt}"] = result

                output_eval_data.append(o_entry)

                # Periodic checkpoint of partial results (atomic write).
                if (idx + 1) % 50 == 0:
                    atomic_write_json(output_eval_file, output_eval_data)

        # Final flush for this input file.
        atomic_write_json(output_eval_file, output_eval_data)

    print("Evaluation with checkpoints completed.")
C/.ipynb_checkpoints/trainer_log-checkpoint.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/10k_port8006_gpu0_20251229_035809_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/10k_port8006_gpu0_20251229_035809_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 8738
C/logs/C/10k_port8006_gpu0_20251229_060615_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/1k_port8002_gpu0_20251229_060615_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 262
C/logs/C/2k_port8003_gpu0_20251229_060615_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 626
C/logs/C/3k_port8004_gpu0_20251229_060615_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/4k_port8005_gpu0_20251229_060615_batch1.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/5k_port8006_gpu0_20251229_060615_batch1.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 1893
C/logs/C/7k_port8003_gpu0_20251229_035809_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/7k_port8003_gpu0_20251229_035809_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 7655
C/logs/C/8k_port8004_gpu0_20251229_035809_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/9k_port8005_gpu0_20251229_035809_batch2.log ADDED
The diff for this file is too large to render. See raw diff
 
C/logs/C/9k_port8005_gpu0_20251229_035809_batch2.log.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 8377
C/logs/C/9k_port8005_gpu0_20251229_060615_batch2.log ADDED
The diff for this file is too large to render. See raw diff