iboero16 committed on
Commit
0b57d90
·
verified ·
1 Parent(s): 59dedf9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
38
  wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text
 
 
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
38
  wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text
39
+ wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb filter=lfs diff=lfs merge=lfs -text
adapter_config.json CHANGED
@@ -23,18 +23,18 @@
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
25
  "peft_type": "LORA",
26
- "r": 16,
27
  "rank_pattern": {},
28
  "revision": null,
29
  "target_modules": [
 
 
30
  "down_proj",
31
  "lm_head",
32
- "gate_proj",
33
- "v_proj",
34
- "k_proj",
35
- "up_proj",
36
  "o_proj",
37
- "q_proj"
 
 
38
  ],
39
  "task_type": null,
40
  "trainable_token_indices": null,
 
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
25
  "peft_type": "LORA",
26
+ "r": 4,
27
  "rank_pattern": {},
28
  "revision": null,
29
  "target_modules": [
30
+ "v_proj",
31
+ "gate_proj",
32
  "down_proj",
33
  "lm_head",
 
 
 
 
34
  "o_proj",
35
+ "up_proj",
36
+ "q_proj",
37
+ "k_proj"
38
  ],
39
  "task_type": null,
40
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fbfeadb937f621c6832145b91c7d94404fe8ee099eb497211fb6951fdde1b03
3
- size 1138873304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ceb99c3a3c514adbaf1adaf7b3d1cfecc7c2f0b42aa5a3066b47d9b60c54536
3
+ size 1072781800
arguments.json CHANGED
@@ -2,7 +2,7 @@
2
  "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
3
  "recompute_baseline": false,
4
  "cache_dir": "/home/panda/pda-llm/cache/sft-tools",
5
- "max_length": 2048,
6
  "trust_remote_code": true,
7
  "train_datasets": [
8
  [
@@ -13,10 +13,10 @@
13
  ]
14
  ],
15
  "eval_datasets": null,
16
- "safety_ratio_tol": 100.0,
17
  "important_sft": true,
18
  "resilient_coeff": 1.0,
19
- "epochs": 4,
20
  "per_device_train_batch_size": 1,
21
  "per_device_eval_batch_size": 1,
22
  "gradient_accumulation_steps": 48,
@@ -29,18 +29,18 @@
29
  "fp16": false,
30
  "bf16": true,
31
  "tf32": false,
32
- "lora_r": 16,
33
  "lora_alpha": 32,
34
  "lora_dropout": 0.05,
35
  "eval_strategy": "epoch",
36
  "eval_interval": 1000000,
37
  "need_eval": true,
38
  "eval_split_ratio": null,
39
- "output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-100",
40
  "log_type": "wandb",
41
- "log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-100",
42
  "log_project": "TOOLS-SFT",
43
- "log_run_name": "tools-sft-2025-05-09-04-39-40",
44
  "save_16bit": false,
45
  "save_interval": 1000000,
46
  "local_rank": 0,
@@ -55,6 +55,6 @@
55
  "type": "torch.device",
56
  "repr": "device(type='cuda', index=0)"
57
  },
58
- "num_update_steps_per_epoch": 118,
59
- "total_training_steps": 472
60
  }
 
2
  "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
3
  "recompute_baseline": false,
4
  "cache_dir": "/home/panda/pda-llm/cache/sft-tools",
5
+ "max_length": 4096,
6
  "trust_remote_code": true,
7
  "train_datasets": [
8
  [
 
13
  ]
14
  ],
15
  "eval_datasets": null,
16
+ "safety_ratio_tol": 50.0,
17
  "important_sft": true,
18
  "resilient_coeff": 1.0,
19
+ "epochs": 3,
20
  "per_device_train_batch_size": 1,
21
  "per_device_eval_batch_size": 1,
22
  "gradient_accumulation_steps": 48,
 
29
  "fp16": false,
30
  "bf16": true,
31
  "tf32": false,
32
+ "lora_r": 4,
33
  "lora_alpha": 32,
34
  "lora_dropout": 0.05,
35
  "eval_strategy": "epoch",
36
  "eval_interval": 1000000,
37
  "need_eval": true,
38
  "eval_split_ratio": null,
39
+ "output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
40
  "log_type": "wandb",
41
+ "log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
42
  "log_project": "TOOLS-SFT",
43
+ "log_run_name": "tools-sft-2025-05-11-14-32-03",
44
  "save_16bit": false,
45
  "save_interval": 1000000,
46
  "local_rank": 0,
 
55
  "type": "torch.device",
56
  "repr": "device(type='cuda', index=0)"
57
  },
58
+ "num_update_steps_per_epoch": 112,
59
+ "total_training_steps": 336
60
  }
arguments.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2d1b5c0aceee6d5d6b2fff2845cdd64294c50293b9d08c3ccd0ff23a70879ef
3
- size 1225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d44fd7cf7fb68b61cf52e153fc0789f3d95aa166ebbb56b1832e0b98a1536149
3
+ size 1231
config.json CHANGED
@@ -20,6 +20,21 @@
20
  "num_key_value_heads": 8,
21
  "pad_token_id": 128256,
22
  "pretraining_tp": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "rms_norm_eps": 1e-05,
24
  "rope_scaling": {
25
  "factor": 8.0,
 
20
  "num_key_value_heads": 8,
21
  "pad_token_id": 128256,
22
  "pretraining_tp": 1,
23
+ "quantization_config": {
24
+ "_load_in_4bit": true,
25
+ "_load_in_8bit": false,
26
+ "bnb_4bit_compute_dtype": "bfloat16",
27
+ "bnb_4bit_quant_storage": "uint8",
28
+ "bnb_4bit_quant_type": "nf4",
29
+ "bnb_4bit_use_double_quant": true,
30
+ "llm_int8_enable_fp32_cpu_offload": false,
31
+ "llm_int8_has_fp16_weight": false,
32
+ "llm_int8_skip_modules": null,
33
+ "llm_int8_threshold": 6.0,
34
+ "load_in_4bit": true,
35
+ "load_in_8bit": false,
36
+ "quant_method": "bitsandbytes"
37
+ },
38
  "rms_norm_eps": 1e-05,
39
  "rope_scaling": {
40
  "factor": 8.0,
environ.txt CHANGED
@@ -1,4 +1,5 @@
1
- BROWSER=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/bin/helpers/browser.sh
 
2
  COLORTERM=truecolor
3
  CONDA_DEFAULT_ENV=pda
4
  CONDA_EXE=/home/panda/miniconda3/bin/conda
@@ -6,13 +7,14 @@ CONDA_PREFIX=/home/panda/miniconda3/envs/pda
6
  CONDA_PREFIX_1=/home/panda/miniconda3
7
  CONDA_PROMPT_MODIFIER=(pda)
8
  CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
 
9
  CONDA_SHLVL=2
10
  CROSS_RANK=0
11
  CROSS_SIZE=1
12
  CUDA_MODULE_LOADING=LAZY
13
  CUDA_VISIBLE_DEVICES=0,1
14
  DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
15
- GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/extensions/git/dist/askpass.sh
16
  HOME=/home/panda
17
  LANG=en_US.UTF-8
18
  LOCAL_RANK=0
@@ -20,39 +22,41 @@ LOCAL_SIZE=2
20
  LOGLEVEL=WARNING
21
  LOGNAME=panda
22
  MASTER_ADDR=127.0.0.1
23
- MASTER_PORT=58384
24
  MOTD_SHOWN=pam
25
- OLDPWD=/home/panda
26
- PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/panda/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
27
  PWD=/home/panda/pda-llm/scripts
 
28
  PYTHONHASHSEED=42
29
  PYTHONPATH=/home/panda/pda-llm
30
  RANK=0
31
  SHELL=/bin/sh
32
  SHLVL=4
33
- SSH_CLIENT=10.100.129.163 58321 22
34
- SSH_CONNECTION=10.100.129.163 58321 158.130.110.127 22
35
  SSL_CERT_DIR=/usr/lib/ssl/certs
36
  SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
37
  TERM=screen
38
  TERM_PROGRAM=vscode
39
- TERM_PROGRAM_VERSION=1.99.2
40
- TMUX=/tmp//tmux-1008/default,750869,0
41
  TMUX_PANE=%0
42
  USER=panda
 
43
  VSCODE_GIT_ASKPASS_EXTRA_ARGS=
44
- VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/extensions/git/dist/askpass-main.js
45
- VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/node
46
  VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
47
- VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-6dd97596-0fde-4c4b-bbcb-61896b21f983.sock
48
  WANDB_API_KEY=<REDACTED>
49
  WANDB_ENTITY=alelab
50
  WANDB_MODE=online
51
- WANDB_SERVICE=2-1843342-tcp-localhost-45547
52
  WORLD_SIZE=2
53
  XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
54
  XDG_RUNTIME_DIR=/run/user/1008
55
  XDG_SESSION_CLASS=user
56
- XDG_SESSION_ID=1945
57
  XDG_SESSION_TYPE=tty
58
  _=/home/panda/miniconda3/envs/pda/bin/deepspeed
 
1
+ BROWSER=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/helpers/browser.sh
2
+ BUNDLED_DEBUGPY_PATH=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy
3
  COLORTERM=truecolor
4
  CONDA_DEFAULT_ENV=pda
5
  CONDA_EXE=/home/panda/miniconda3/bin/conda
 
7
  CONDA_PREFIX_1=/home/panda/miniconda3
8
  CONDA_PROMPT_MODIFIER=(pda)
9
  CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
10
+ CONDA_ROOT=/home/panda/miniconda3
11
  CONDA_SHLVL=2
12
  CROSS_RANK=0
13
  CROSS_SIZE=1
14
  CUDA_MODULE_LOADING=LAZY
15
  CUDA_VISIBLE_DEVICES=0,1
16
  DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
17
+ GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass.sh
18
  HOME=/home/panda
19
  LANG=en_US.UTF-8
20
  LOCAL_RANK=0
 
22
  LOGLEVEL=WARNING
23
  LOGNAME=panda
24
  MASTER_ADDR=127.0.0.1
25
+ MASTER_PORT=33558
26
  MOTD_SHOWN=pam
27
+ OLDPWD=/home/panda/pda-llm
28
+ PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/scripts/noConfigScripts
29
  PWD=/home/panda/pda-llm/scripts
30
+ PYDEVD_DISABLE_FILE_VALIDATION=1
31
  PYTHONHASHSEED=42
32
  PYTHONPATH=/home/panda/pda-llm
33
  RANK=0
34
  SHELL=/bin/sh
35
  SHLVL=4
36
+ SSH_CLIENT=10.103.69.12 60984 22
37
+ SSH_CONNECTION=10.103.69.12 60984 158.130.110.127 22
38
  SSL_CERT_DIR=/usr/lib/ssl/certs
39
  SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
40
  TERM=screen
41
  TERM_PROGRAM=vscode
42
+ TERM_PROGRAM_VERSION=1.96.4
43
+ TMUX=/tmp/tmux-1008/default,1150618,0
44
  TMUX_PANE=%0
45
  USER=panda
46
+ VSCODE_DEBUGPY_ADAPTER_ENDPOINTS=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/.noConfigDebugAdapterEndpoints/endpoint-c660b802f00341f2.txt
47
  VSCODE_GIT_ASKPASS_EXTRA_ARGS=
48
+ VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass-main.js
49
+ VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/node
50
  VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
51
+ VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-4fc226f5-2ffc-4d5b-b737-82848f81acd2.sock
52
  WANDB_API_KEY=<REDACTED>
53
  WANDB_ENTITY=alelab
54
  WANDB_MODE=online
55
+ WANDB_SERVICE=2-1151596-tcp-localhost-35011
56
  WORLD_SIZE=2
57
  XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
58
  XDG_RUNTIME_DIR=/run/user/1008
59
  XDG_SESSION_CLASS=user
60
+ XDG_SESSION_ID=3272
61
  XDG_SESSION_TYPE=tty
62
  _=/home/panda/miniconda3/envs/pda/bin/deepspeed
script.sh CHANGED
@@ -40,6 +40,8 @@ SAFETY_RATIO_TOL=10
40
  RESILIENT_COEFF=1
41
  LEARNING_RATE=1e-4
42
  EPOCHS=3
 
 
43
  while [[ "$#" -gt 0 ]]; do
44
  arg="$1"
45
  shift
@@ -107,6 +109,20 @@ while [[ "$#" -gt 0 ]]; do
107
  --epochs=*)
108
  EPOCHS="${arg#*=}"
109
  ;;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  --important_sft)
111
  IMPORTANT_SFT="$1"
112
  shift
@@ -121,7 +137,7 @@ while [[ "$#" -gt 0 ]]; do
121
  esac
122
  done
123
 
124
- OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}"
125
  mkdir -p "${OUTPUT_DIR}"
126
  OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
127
  if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
@@ -170,7 +186,7 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
170
  --model_name_or_path "${MODEL_NAME_OR_PATH}" \
171
  --cache_dir "${ROOT_DIR}/cache/sft-tools" \
172
  --important_sft "${IMPORTANT_SFT}" \
173
- --max_length 2048 \
174
  --trust_remote_code True \
175
  --epochs "${EPOCHS}" \
176
  --per_device_train_batch_size 1 \
@@ -189,9 +205,10 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
189
  --offload "${OFFLOAD}" \
190
  --safety_ratio_tol "${SAFETY_RATIO_TOL}" \
191
  --resilient_coeff "${RESILIENT_COEFF}" \
192
- --lora_r "16" \
193
  --lora_alpha "32" \
194
  --lora_dropout "0.05" \
 
195
  --bf16 True \
196
  --fp16 False \
197
  --tf32 False
 
40
  RESILIENT_COEFF=1
41
  LEARNING_RATE=1e-4
42
  EPOCHS=3
43
+ LORA_R=4
44
+ MAX_LENGTH=4096
45
  while [[ "$#" -gt 0 ]]; do
46
  arg="$1"
47
  shift
 
109
  --epochs=*)
110
  EPOCHS="${arg#*=}"
111
  ;;
112
+ --lora_r)
113
+ LORA_R="$1"
114
+ shift
115
+ ;;
116
+ --lora_r=*)
117
+ LORA_R="${arg#*=}"
118
+ ;;
119
+ --max_length)
120
+ MAX_LENGTH="$1"
121
+ shift
122
+ ;;
123
+ --max_length=*)
124
+ MAX_LENGTH="${arg#*=}"
125
+ ;;
126
  --important_sft)
127
  IMPORTANT_SFT="$1"
128
  shift
 
137
  esac
138
  done
139
 
140
+ OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}-${LORA_R}-${MAX_LENGTH}"
141
  mkdir -p "${OUTPUT_DIR}"
142
  OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
143
  if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
 
186
  --model_name_or_path "${MODEL_NAME_OR_PATH}" \
187
  --cache_dir "${ROOT_DIR}/cache/sft-tools" \
188
  --important_sft "${IMPORTANT_SFT}" \
189
+ --max_length "${MAX_LENGTH}" \
190
  --trust_remote_code True \
191
  --epochs "${EPOCHS}" \
192
  --per_device_train_batch_size 1 \
 
205
  --offload "${OFFLOAD}" \
206
  --safety_ratio_tol "${SAFETY_RATIO_TOL}" \
207
  --resilient_coeff "${RESILIENT_COEFF}" \
208
+ --lora_r "${LORA_R}" \
209
  --lora_alpha "32" \
210
  --lora_dropout "0.05" \
211
+ --gradient_checkpointing \
212
  --bf16 True \
213
  --fp16 False \
214
  --tf32 False
stderr.log CHANGED
The diff for this file is too large to render. See raw diff
 
stdout.log CHANGED
@@ -7,44 +7,44 @@ python version : 3.11.11
7
  CONDA_PREFIX : /home/panda/miniconda3/envs/pda
8
  deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed
9
  --------------------------------------------
10
- [2025-05-09 04:38:56,329] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
11
- [2025-05-09 04:38:58,379] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
12
  Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
13
- [2025-05-09 04:38:58,379] [INFO] [runner.py:605:main] cmd = /home/panda/miniconda3/envs/pda/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=58384 --module --enable_each_rank_log=None safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft true --max_length 2048 --trust_remote_code True --epochs 4 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-true-1-100 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 100 --resilient_coeff 1 --lora_r 16 --lora_alpha 32 --lora_dropout 0.05 --bf16 True --fp16 False --tf32 False
14
- [2025-05-09 04:38:59,543] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
15
- [2025-05-09 04:39:01,538] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
16
- [2025-05-09 04:39:01,538] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
17
- [2025-05-09 04:39:01,539] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
18
- [2025-05-09 04:39:01,539] [INFO] [launch.py:164:main] dist_world_size=2
19
- [2025-05-09 04:39:01,539] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
20
- [2025-05-09 04:39:01,539] [INFO] [launch.py:256:main] process 1843342 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=0', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '4', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-100', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '100', '--resilient_coeff', '1', '--lora_r', '16', '--lora_alpha', '32', '--lora_dropout', '0.05', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
21
- [2025-05-09 04:39:01,540] [INFO] [launch.py:256:main] process 1843343 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=1', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '4', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-100', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '100', '--resilient_coeff', '1', '--lora_r', '16', '--lora_alpha', '32', '--lora_dropout', '0.05', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
22
- [2025-05-09 04:39:02,715] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
- [2025-05-09 04:39:02,731] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
- [2025-05-09 04:39:05,820] [INFO] [comm.py:669:init_distributed] cdb=None
25
- [2025-05-09 04:39:05,848] [INFO] [comm.py:669:init_distributed] cdb=None
26
- [2025-05-09 04:39:05,848] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
27
  Set logger level to WARNING.
28
  calculating baseline ...
29
  calculating baseline ...
30
- Loading cached baseline logprobs from /home/panda/pda-llm/cache/sft-tools/cached_baseline_logprobs.pt
31
- Loaded cached baseline logprobs successfully
32
  ninja: no work to do.
33
- Time to load fused_adam op: 0.03549528121948242 seconds
34
- Time to load fused_adam op: 0.10125446319580078 seconds
 
 
 
35
  ***** Running training *****
36
 
37
  ***** Evaluating at the beginning *****
38
 
39
- ***** Evaluating at epoch 1/4 *****
40
-
41
- ***** Evaluating at epoch 2/4 *****
42
 
43
- ***** Evaluating at epoch 3/4 *****
44
 
45
- ***** Evaluating at epoch 4/4 *****
46
- Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-100" ...
47
  Saving Hugging Face Checkpoints...
 
48
  Model saved!
49
- [2025-05-09 08:48:57,409] [INFO] [launch.py:351:main] Process 1843343 exits successfully.
50
- [2025-05-09 08:49:01,410] [INFO] [launch.py:351:main] Process 1843342 exits successfully.
 
7
  CONDA_PREFIX : /home/panda/miniconda3/envs/pda
8
  deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed
9
  --------------------------------------------
10
+ [2025-05-11 13:53:40,343] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
11
+ [2025-05-11 13:53:42,372] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
12
  Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
13
+ [2025-05-11 13:53:42,373] [INFO] [runner.py:605:main] cmd = /home/panda/miniconda3/envs/pda/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=33558 --module --enable_each_rank_log=None safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft true --max_length 4096 --trust_remote_code True --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 50 --resilient_coeff 1 --lora_r 4 --lora_alpha 32 --lora_dropout 0.05 --gradient_checkpointing --bf16 True --fp16 False --tf32 False
14
+ [2025-05-11 13:53:43,535] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
15
+ [2025-05-11 13:53:45,544] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
16
+ [2025-05-11 13:53:45,544] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
17
+ [2025-05-11 13:53:45,544] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
18
+ [2025-05-11 13:53:45,544] [INFO] [launch.py:164:main] dist_world_size=2
19
+ [2025-05-11 13:53:45,544] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
20
+ [2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151596 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=0', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
21
+ [2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151597 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=1', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
22
+ [2025-05-11 13:53:46,697] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
+ [2025-05-11 13:53:46,717] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ [2025-05-11 13:53:49,799] [INFO] [comm.py:669:init_distributed] cdb=None
25
+ [2025-05-11 13:53:49,817] [INFO] [comm.py:669:init_distributed] cdb=None
26
+ [2025-05-11 13:53:49,817] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
27
  Set logger level to WARNING.
28
  calculating baseline ...
29
  calculating baseline ...
30
+ Computing baseline logprobs...
 
31
  ninja: no work to do.
32
+ Time to load fused_adam op: 0.034948110580444336 seconds
33
+ Saving computed baseline logprobs to /home/panda/pda-llm/cache/sft-tools/cached_baseline_logprobs.pt
34
+ Saved baseline logprobs successfully
35
+ ninja: no work to do.
36
+ Time to load fused_adam op: 0.0341794490814209 seconds
37
  ***** Running training *****
38
 
39
  ***** Evaluating at the beginning *****
40
 
41
+ ***** Evaluating at epoch 1/3 *****
 
 
42
 
43
+ ***** Evaluating at epoch 2/3 *****
44
 
45
+ ***** Evaluating at epoch 3/3 *****
46
+ Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
47
  Saving Hugging Face Checkpoints...
48
+ [2025-05-11 17:31:35,204] [INFO] [launch.py:351:main] Process 1151597 exits successfully.
49
  Model saved!
50
+ [2025-05-11 17:31:39,205] [INFO] [launch.py:351:main] Process 1151596 exits successfully.
 
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65b66ccdde0ce45c83b06f31e9f11272cade6aff26ec3cc9d6c49ede82b3ee2d
3
  size 17210383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:089fcc22ecea628694d8dcd8b57815f68d90d070e4012c9964a622d4473b14db
3
  size 17210383
tokenizer_config.json CHANGED
@@ -2074,7 +2074,7 @@
2074
  "input_ids",
2075
  "attention_mask"
2076
  ],
2077
- "model_max_length": 2048,
2078
  "pad_token": "<pad>",
2079
  "padding_side": "right",
2080
  "tokenizer_class": "PreTrainedTokenizer",
 
2074
  "input_ids",
2075
  "attention_mask"
2076
  ],
2077
+ "model_max_length": 4096,
2078
  "pad_token": "<pad>",
2079
  "padding_side": "right",
2080
  "tokenizer_class": "PreTrainedTokenizer",
wandb/debug-internal.log CHANGED
@@ -1,18 +1,18 @@
1
- {"time":"2025-05-09T04:39:40.712021704-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug-core.log"}
2
- {"time":"2025-05-09T04:39:40.918633322-04:00","level":"INFO","msg":"created new stream","id":"6qw6u685"}
3
- {"time":"2025-05-09T04:39:40.918678813-04:00","level":"INFO","msg":"stream: started","id":"6qw6u685"}
4
- {"time":"2025-05-09T04:39:40.918742684-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"6qw6u685"}
5
- {"time":"2025-05-09T04:39:40.918770585-04:00","level":"INFO","msg":"handler: started","stream_id":"6qw6u685"}
6
- {"time":"2025-05-09T04:39:40.918834586-04:00","level":"INFO","msg":"sender: started","stream_id":"6qw6u685"}
7
- {"time":"2025-05-09T04:39:41.131260096-04:00","level":"INFO","msg":"Starting system monitor"}
8
- {"time":"2025-05-09T04:39:41.131298987-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
9
- {"time":"2025-05-09T04:39:41.131358468-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
10
- {"time":"2025-05-09T08:48:57.268396663-04:00","level":"INFO","msg":"Stopping system monitor"}
11
- {"time":"2025-05-09T08:48:57.268495575-04:00","level":"INFO","msg":"Stopped system monitor"}
12
- {"time":"2025-05-09T08:48:57.766542844-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
- {"time":"2025-05-09T08:48:57.905789479-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
14
- {"time":"2025-05-09T08:48:57.924367836-04:00","level":"INFO","msg":"stream: closing","id":"6qw6u685"}
15
- {"time":"2025-05-09T08:48:57.924388046-04:00","level":"INFO","msg":"handler: closed","stream_id":"6qw6u685"}
16
- {"time":"2025-05-09T08:48:57.924400406-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6qw6u685"}
17
- {"time":"2025-05-09T08:48:57.924434197-04:00","level":"INFO","msg":"sender: closed","stream_id":"6qw6u685"}
18
- {"time":"2025-05-09T08:48:57.924510528-04:00","level":"INFO","msg":"stream: closed","id":"6qw6u685"}
 
1
+ {"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
2
+ {"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
3
+ {"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
4
+ {"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
5
+ {"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
6
+ {"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
7
+ {"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
9
+ {"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
10
+ {"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
14
+ {"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
15
+ {"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
16
+ {"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
17
+ {"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
18
+ {"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}
wandb/debug.log CHANGED
@@ -1,29 +1,29 @@
1
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
2
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_setup.py:_flush():68] Configure stats pid to 1843342
3
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
4
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
5
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug.log
7
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug-internal.log
8
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:init():852] calling init triggers
9
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
- config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 2048, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 100.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 4, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-100', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-100', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-09-04-39-40', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 118, 'total_training_steps': 472, '_wandb': {}}
11
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:init():893] starting backend
12
- 2025-05-09 04:39:40,707 INFO MainThread:1843342 [wandb_init.py:init():897] sending inform_init request
13
- 2025-05-09 04:39:40,709 INFO MainThread:1843342 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
- 2025-05-09 04:39:40,709 INFO MainThread:1843342 [wandb_init.py:init():907] backend started and connected
15
- 2025-05-09 04:39:40,711 INFO MainThread:1843342 [wandb_init.py:init():1002] updated telemetry
16
- 2025-05-09 04:39:40,717 INFO MainThread:1843342 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
17
- 2025-05-09 04:39:41,128 INFO MainThread:1843342 [wandb_init.py:init():1101] starting run threads in backend
18
- 2025-05-09 04:39:41,193 INFO MainThread:1843342 [wandb_run.py:_console_start():2566] atexit reg
19
- 2025-05-09 04:39:41,193 INFO MainThread:1843342 [wandb_run.py:_redirect():2414] redirect: wrap_raw
20
- 2025-05-09 04:39:41,193 INFO MainThread:1843342 [wandb_run.py:_redirect():2483] Wrapping output streams.
21
- 2025-05-09 04:39:41,193 INFO MainThread:1843342 [wandb_run.py:_redirect():2506] Redirects installed.
22
- 2025-05-09 04:39:41,194 INFO MainThread:1843342 [wandb_init.py:init():1147] run started, returning control to user process
23
- 2025-05-09 08:48:57,266 INFO MainThread:1843342 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/6qw6u685
24
- 2025-05-09 08:48:57,267 INFO MainThread:1843342 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
25
- 2025-05-09 08:48:57,267 INFO MainThread:1843342 [wandb_run.py:_restore():2513] restore
26
- 2025-05-09 08:48:57,267 INFO MainThread:1843342 [wandb_run.py:_restore():2519] restore done
27
- 2025-05-09 08:48:57,911 INFO MainThread:1843342 [wandb_run.py:_footer_history_summary_info():4160] rendering history
28
- 2025-05-09 08:48:57,914 INFO MainThread:1843342 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
29
- 2025-05-09 08:48:57,923 INFO MainThread:1843342 [wandb_run.py:_footer_sync_info():4121] logging synced files
 
1
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
2
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
3
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
4
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
5
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
7
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
8
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():852] calling init triggers
9
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
11
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():893] starting backend
12
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-05-11 14:32:04,250 INFO MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-05-11 14:32:04,250 INFO MainThread:1151596 [wandb_init.py:init():907] backend started and connected
15
+ 2025-05-11 14:32:04,252 INFO MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
16
+ 2025-05-11 14:32:04,257 INFO MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
17
+ 2025-05-11 14:32:04,568 INFO MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
18
+ 2025-05-11 14:32:04,636 INFO MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
19
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
20
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
21
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
22
+ 2025-05-11 14:32:04,638 INFO MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
23
+ 2025-05-11 17:31:35,325 INFO MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
24
+ 2025-05-11 17:31:35,326 INFO MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
25
+ 2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2513] restore
26
+ 2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2519] restore done
27
+ 2025-05-11 17:31:35,957 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
28
+ 2025-05-11 17:31:35,964 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
29
+ 2025-05-11 17:31:35,968 INFO MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files
wandb/run-20250511_143204-ws6emydu/files/config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.10
4
+ m: []
5
+ python_version: 3.11.11
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 11
10
+ - 49
11
+ - 51
12
+ - 55
13
+ - 71
14
+ - 98
15
+ "2":
16
+ - 1
17
+ - 11
18
+ - 49
19
+ - 51
20
+ - 55
21
+ - 71
22
+ - 98
23
+ "3":
24
+ - 2
25
+ - 13
26
+ - 16
27
+ - 23
28
+ - 55
29
+ - 61
30
+ "4": 3.11.11
31
+ "5": 0.19.10
32
+ "6": 4.51.3
33
+ "8":
34
+ - 5
35
+ "12": 0.19.10
36
+ "13": linux-x86_64
37
+ bf16:
38
+ value: true
39
+ cache_dir:
40
+ value: /home/panda/pda-llm/cache/sft-tools
41
+ deepscale:
42
+ value: false
43
+ deepscale_config:
44
+ value: null
45
+ deepspeed:
46
+ value: false
47
+ deepspeed_config:
48
+ value: null
49
+ device:
50
+ value: cuda:0
51
+ epochs:
52
+ value: 3
53
+ eval_datasets:
54
+ value: null
55
+ eval_interval:
56
+ value: 1000000
57
+ eval_split_ratio:
58
+ value: null
59
+ eval_strategy:
60
+ value: epoch
61
+ fp16:
62
+ value: false
63
+ global_rank:
64
+ value: 0
65
+ gradient_accumulation_steps:
66
+ value: 48
67
+ gradient_checkpointing:
68
+ value: true
69
+ important_sft:
70
+ value: true
71
+ local_rank:
72
+ value: 0
73
+ log_dir:
74
+ value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
75
+ log_project:
76
+ value: TOOLS-SFT
77
+ log_run_name:
78
+ value: tools-sft-2025-05-11-14-32-03
79
+ log_type:
80
+ value: wandb
81
+ lora_alpha:
82
+ value: 32
83
+ lora_dropout:
84
+ value: 0.05
85
+ lora_r:
86
+ value: 4
87
+ lr:
88
+ value: 0.0001
89
+ lr_scheduler_type:
90
+ value: COSINE
91
+ lr_warmup_ratio:
92
+ value: 0.1
93
+ max_length:
94
+ value: 4096
95
+ model_name_or_path:
96
+ value: meta-llama/Llama-3.1-8B-Instruct
97
+ need_eval:
98
+ value: true
99
+ num_update_steps_per_epoch:
100
+ value: 112
101
+ offload:
102
+ value: none
103
+ output_dir:
104
+ value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
105
+ per_device_eval_batch_size:
106
+ value: 1
107
+ per_device_train_batch_size:
108
+ value: 1
109
+ recompute_baseline:
110
+ value: false
111
+ resilient_coeff:
112
+ value: 1
113
+ safety_ratio_tol:
114
+ value: 50
115
+ save_16bit:
116
+ value: false
117
+ save_interval:
118
+ value: 1000000
119
+ seed:
120
+ value: 42
121
+ tf32:
122
+ value: false
123
+ total_training_steps:
124
+ value: 336
125
+ train_datasets:
126
+ value:
127
+ - - tools
128
+ - proportion: 1
129
+ trust_remote_code:
130
+ value: true
131
+ weight_decay:
132
+ value: 0
133
+ zero_stage:
134
+ value: 0
wandb/run-20250511_143204-ws6emydu/files/output.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Training 3/3 epoch (loss 0.4680): 100%|██████████| 16050/16050 [2:59:28<00:00, 1.49it/s]
3
+
4
+ ***** Evaluating at the beginning *****
5
+
6
+ ***** Evaluating at epoch 1/3 *****
7
+
8
+ ***** Evaluating at epoch 2/3 *****
9
+
10
+ ***** Evaluating at epoch 3/3 *****
11
+ Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
12
+ Saving Hugging Face Checkpoints...
13
+ /home/panda/miniconda3/envs/pda/lib/python3.11/site-packages/peft/utils/save_and_load.py:220: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
14
+ warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.")
15
+ Model saved!
wandb/run-20250511_143204-ws6emydu/files/requirements.txt ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PySocks==1.7.1
2
+ pip==25.1.1
3
+ certifi==2025.4.26
4
+ parso==0.8.4
5
+ wcwidth==0.2.13
6
+ nvidia-ml-py==12.535.133
7
+ mkl_fft==1.3.11
8
+ urllib3==2.3.0
9
+ charset-normalizer==3.3.2
10
+ transformers==4.51.3
11
+ smmap==4.0.0
12
+ xxhash==3.5.0
13
+ etils==1.12.2
14
+ platformdirs==4.3.8
15
+ tzdata==2025.2
16
+ ipython==9.2.0
17
+ deepspeed==0.16.7
18
+ gmpy2==2.2.1
19
+ importlib_resources==6.5.2
20
+ pydantic==2.10.3
21
+ tqdm==4.67.1
22
+ pyparsing==3.2.0
23
+ exceptiongroup==1.2.2
24
+ tokenizers==0.21.1
25
+ Markdown==3.8
26
+ matplotlib-inline==0.1.7
27
+ dm_control==1.0.30
28
+ jupyter_core==5.7.2
29
+ pydantic_core==2.27.1
30
+ hjson==3.1.0
31
+ mkl_random==1.2.8
32
+ ipykernel==6.29.5
33
+ PyOpenGL==3.1.9
34
+ sentry-sdk==2.18.0
35
+ wandb==0.19.10
36
+ einops==0.8.1
37
+ prompt_toolkit==3.0.51
38
+ Bottleneck==1.4.2
39
+ msgpack==1.1.0
40
+ fsspec==2024.12.0
41
+ labmaze==1.0.6
42
+ mdurl==0.1.0
43
+ executing==2.2.0
44
+ wheel==0.45.1
45
+ bitsandbytes==0.45.5
46
+ networkx==3.4.2
47
+ comm==0.2.2
48
+ asttokens==3.0.0
49
+ nvitop==1.5.0
50
+ sympy==1.13.1
51
+ MarkupSafe==3.0.2
52
+ numexpr==2.10.1
53
+ tensorboard_data_server==0.7.0
54
+ markdown-it-py==2.2.0
55
+ wrapt==1.17.2
56
+ six==1.17.0
57
+ idna==3.7
58
+ docker-pycreds==0.4.0
59
+ jedi==0.19.2
60
+ GitPython==3.1.43
61
+ dm-env==1.6
62
+ pure_eval==0.2.3
63
+ ninja==1.11.1.4
64
+ setuptools==80.1.0
65
+ fonttools==4.55.3
66
+ annotated-types==0.6.0
67
+ psutil==7.0.0
68
+ cycler==0.11.0
69
+ py-cpuinfo==9.0.0
70
+ accelerate==1.6.0
71
+ dm-tree==0.1.9
72
+ filelock==3.17.0
73
+ pytz==2024.1
74
+ rich==13.9.4
75
+ appdirs==1.4.4
76
+ click==8.1.8
77
+ Jinja2==3.1.6
78
+ unicodedata2==15.1.0
79
+ pillow==11.1.0
80
+ glfw==2.9.0
81
+ traitlets==5.14.3
82
+ stack_data==0.6.3
83
+ peft==0.15.2
84
+ python-dateutil==2.9.0.post0
85
+ requests==2.32.3
86
+ attrs==25.3.0
87
+ Werkzeug==3.1.3
88
+ gitdb==4.0.7
89
+ lxml==5.4.0
90
+ torch==2.5.1
91
+ scipy==1.15.2
92
+ sentencepiece==0.2.0
93
+ huggingface-hub==0.31.1
94
+ dill==0.3.8
95
+ pexpect==4.9.0
96
+ pickleshare==0.7.5
97
+ ptyprocess==0.7.0
98
+ contourpy==1.3.1
99
+ grpcio==1.71.0
100
+ optree==0.14.1
101
+ safetensors==0.5.3
102
+ mpmath==1.3.0
103
+ nest_asyncio==1.6.0
104
+ pyarrow==19.0.0
105
+ PyYAML==6.0.2
106
+ decorator==5.2.1
107
+ pandas==2.2.3
108
+ tensorboard==2.19.0
109
+ zipp==3.21.0
110
+ mujoco==3.3.2
111
+ regex==2024.11.6
112
+ Brotli==1.0.9
113
+ mkl-service==2.4.0
114
+ matplotlib==3.10.0
115
+ Pygments==2.19.1
116
+ absl-py==2.1.0
117
+ hf-xet==1.1.0
118
+ packaging==25.0
119
+ jupyter_client==8.6.3
120
+ triton==3.1.0
121
+ multiprocess==0.70.15
122
+ debugpy==1.8.14
123
+ numpy==2.0.1
124
+ protobuf==5.29.3
125
+ tornado==6.4.2
126
+ datasets==3.6.0
127
+ eval_type_backport==0.2.2
128
+ typing_extensions==4.12.2
129
+ kiwisolver==1.4.8
130
+ pyzmq==26.4.0
131
+ ipython_pygments_lexers==1.1.1
132
+ setproctitle==1.2.2
133
+ importlib_metadata==8.6.1
134
+ jaraco.text==3.12.1
135
+ backports.tarfile==1.2.0
136
+ importlib_metadata==8.0.0
137
+ jaraco.context==5.3.0
138
+ more-itertools==10.3.0
139
+ wheel==0.45.1
140
+ jaraco.collections==5.1.0
141
+ platformdirs==4.2.2
142
+ autocommand==2.2.2
143
+ zipp==3.19.2
144
+ tomli==2.0.1
145
+ typeguard==4.3.0
146
+ packaging==24.2
147
+ inflect==7.3.1
148
+ typing_extensions==4.12.2
149
+ jaraco.functools==4.0.1
wandb/run-20250511_143204-ws6emydu/files/wandb-metadata.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-138-generic-x86_64-with-glibc2.31",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-05-11T18:32:04.251126Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--train_datasets",
8
+ "tools",
9
+ "--model_name_or_path",
10
+ "meta-llama/Llama-3.1-8B-Instruct",
11
+ "--cache_dir",
12
+ "/home/panda/pda-llm/cache/sft-tools",
13
+ "--important_sft",
14
+ "true",
15
+ "--max_length",
16
+ "4096",
17
+ "--trust_remote_code",
18
+ "True",
19
+ "--epochs",
20
+ "3",
21
+ "--per_device_train_batch_size",
22
+ "1",
23
+ "--per_device_eval_batch_size",
24
+ "1",
25
+ "--gradient_accumulation_steps",
26
+ "48",
27
+ "--gradient_checkpointing",
28
+ "--learning_rate",
29
+ "1e-4",
30
+ "--lr_scheduler_type",
31
+ "cosine",
32
+ "--lr_warmup_ratio",
33
+ "0.1",
34
+ "--weight_decay",
35
+ "0.0",
36
+ "--seed",
37
+ "42",
38
+ "--output_dir",
39
+ "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
40
+ "--log_type",
41
+ "wandb",
42
+ "--log_project",
43
+ "TOOLS-SFT",
44
+ "--zero_stage",
45
+ "0",
46
+ "--offload",
47
+ "none",
48
+ "--safety_ratio_tol",
49
+ "50",
50
+ "--resilient_coeff",
51
+ "1",
52
+ "--lora_r",
53
+ "4",
54
+ "--lora_alpha",
55
+ "32",
56
+ "--lora_dropout",
57
+ "0.05",
58
+ "--gradient_checkpointing",
59
+ "--bf16",
60
+ "True",
61
+ "--fp16",
62
+ "False",
63
+ "--tf32",
64
+ "False"
65
+ ],
66
+ "program": "-m safe_rlhf.algorithms.tools_ft.__main__",
67
+ "git": {
68
+ "remote": "git@github.com:IgnacioBoero/pda-llm.git",
69
+ "commit": "f4c70e99280e869cd565464446c9942daaf22425"
70
+ },
71
+ "email": "iboero@upenn.edu",
72
+ "root": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
73
+ "host": "plaza",
74
+ "executable": "/home/panda/miniconda3/envs/pda/bin/python",
75
+ "cpu_count": 24,
76
+ "cpu_count_logical": 48,
77
+ "gpu": "NVIDIA RTX A6000",
78
+ "gpu_count": 2,
79
+ "disk": {
80
+ "/": {
81
+ "total": "982820896768",
82
+ "used": "915338317824"
83
+ }
84
+ },
85
+ "memory": {
86
+ "total": "270090010624"
87
+ },
88
+ "cpu": {
89
+ "count": 24,
90
+ "countLogical": 48
91
+ },
92
+ "gpu_nvidia": [
93
+ {
94
+ "name": "NVIDIA RTX A6000",
95
+ "memoryTotal": "51527024640",
96
+ "cudaCores": 10752,
97
+ "architecture": "Ampere"
98
+ },
99
+ {
100
+ "name": "NVIDIA RTX A6000",
101
+ "memoryTotal": "51527024640",
102
+ "cudaCores": 10752,
103
+ "architecture": "Ampere"
104
+ }
105
+ ],
106
+ "cudaVersion": "12.4"
107
+ }
wandb/run-20250511_143204-ws6emydu/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":10771.076681318,"_step":16050,"train/step":16050,"eval/hist_log_ratio":{"values":[1,1,2,0,2,0,1,2,0,3,4,5,10,4,2,7,5,6,2,3,6,9,6,6,13,5,1,2,8,5,6,6,4,5,4,6,3,2,4,4,3,0,0,4,1,2,4,2,0,0,3,0,3,2,4,0,3,1,0,0,1,1,0,1],"bins":[77.5,81.5,85.5,89.5,93.5,97.5,101.5625,105.5625,109.5625,113.5625,117.5625,121.625,125.625,129.625,133.625,137.625,141.625,145.625,149.625,153.625,157.625,161.75,165.75,169.75,173.75,177.75,181.75,185.75,189.75,193.75,197.75,201.75,205.75,209.75,213.75,217.75,221.75,225.75,229.75,233.75,237.75,241.875,245.875,249.875,253.875,258,262,266,270,274,278,282,286,290,294,298,302,306,310,314,318,322,326,330,334],"_type":"histogram"},"train/loss":0.4680471122264862,"eval/step":16050,"eval/min_log_ratio":77.5,"eval/max_log_ratio":334,"_timestamp":1.7469990929702783e+09,"_wandb":{"runtime":10771},"train/lr":1.0749763901607423e-08,"eval/mean_important_log_ratio":188,"train/epoch":3}
wandb/run-20250511_143204-ws6emydu/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-05-11T14:32:03.899110462-04:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmplwnn6mri/port-1151596.txt","pid":1151596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-05-11T14:32:03.905019894-04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1151596}
3
+ {"time":"2025-05-11T14:32:03.904995063-04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35011,"Zone":""}}
4
+ {"time":"2025-05-11T14:32:04.083172335-04:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37156"}
5
+ {"time":"2025-05-11T14:32:04.25319672-04:00","level":"INFO","msg":"handleInformInit: received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
6
+ {"time":"2025-05-11T14:32:04.439955459-04:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"ws6emydu","id":"127.0.0.1:37156"}
7
+ {"time":"2025-05-11T17:31:35.968564433-04:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
8
+ {"time":"2025-05-11T17:31:35.968735567-04:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"ws6emydu","id":"127.0.0.1:37156"}
9
+ {"time":"2025-05-11T17:31:36.95829108-04:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:37156"}
10
+ {"time":"2025-05-11T17:31:36.958316571-04:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:37156"}
11
+ {"time":"2025-05-11T17:31:36.958328081-04:00","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-05-11T17:31:36.958329541-04:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:37156"}
13
+ {"time":"2025-05-11T17:31:36.958416902-04:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:37156"}
14
+ {"time":"2025-05-11T17:31:36.958428293-04:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:37156"}
15
+ {"time":"2025-05-11T17:31:36.958438183-04:00","level":"INFO","msg":"server is closed"}
wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
2
+ {"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
3
+ {"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
4
+ {"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
5
+ {"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
6
+ {"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
7
+ {"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
9
+ {"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
10
+ {"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
11
+ {"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
12
+ {"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
13
+ {"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
14
+ {"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
15
+ {"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
16
+ {"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
17
+ {"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
18
+ {"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}
wandb/run-20250511_143204-ws6emydu/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
2
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
3
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
4
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
5
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
6
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
7
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
8
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():852] calling init triggers
9
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
10
+ config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
11
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():893] starting backend
12
+ 2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
13
+ 2025-05-11 14:32:04,250 INFO MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-05-11 14:32:04,250 INFO MainThread:1151596 [wandb_init.py:init():907] backend started and connected
15
+ 2025-05-11 14:32:04,252 INFO MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
16
+ 2025-05-11 14:32:04,257 INFO MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
17
+ 2025-05-11 14:32:04,568 INFO MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
18
+ 2025-05-11 14:32:04,636 INFO MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
19
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
20
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
21
+ 2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
22
+ 2025-05-11 14:32:04,638 INFO MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
23
+ 2025-05-11 17:31:35,325 INFO MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
24
+ 2025-05-11 17:31:35,326 INFO MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
25
+ 2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2513] restore
26
+ 2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2519] restore done
27
+ 2025-05-11 17:31:35,957 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
28
+ 2025-05-11 17:31:35,964 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
29
+ 2025-05-11 17:31:35,968 INFO MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files
wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75c95736964eb77dd5d326d209df011e412e887998fb330d51e0946a94f7cc4
3
+ size 14008017