iboero16 committed on May 11, 2025

Commit

0b57d90

verified ·

1 Parent(s): 59dedf9

Upload folder using huggingface_hub

Browse files

Files changed (23) hide show

.gitattributes +1 -0
adapter_config.json +6 -6
adapter_model.safetensors +2 -2
arguments.json +9 -9
arguments.pkl +2 -2
config.json +15 -0
environ.txt +18 -14
script.sh +20 -3
stderr.log +0 -0
stdout.log +28 -28
tokenizer.json +1 -1
tokenizer_config.json +1 -1
wandb/debug-internal.log +18 -18
wandb/debug.log +29 -29
wandb/run-20250511_143204-ws6emydu/files/config.yaml +134 -0
wandb/run-20250511_143204-ws6emydu/files/output.log +15 -0
wandb/run-20250511_143204-ws6emydu/files/requirements.txt +149 -0
wandb/run-20250511_143204-ws6emydu/files/wandb-metadata.json +107 -0
wandb/run-20250511_143204-ws6emydu/files/wandb-summary.json +1 -0
wandb/run-20250511_143204-ws6emydu/logs/debug-core.log +15 -0
wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log +18 -0
wandb/run-20250511_143204-ws6emydu/logs/debug.log +29 -0
wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb +3 -0

.gitattributes CHANGED Viewed

@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
 wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text

 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
 wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb filter=lfs diff=lfs merge=lfs -text

adapter_config.json CHANGED Viewed

@@ -23,18 +23,18 @@
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 16,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "down_proj",
     "lm_head",
-    "gate_proj",
-    "v_proj",
-    "k_proj",
-    "up_proj",
     "o_proj",
-    "q_proj"
   ],
   "task_type": null,
   "trainable_token_indices": null,

   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "r": 4,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
+    "gate_proj",
     "down_proj",
     "lm_head",
     "o_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj"
   ],
   "task_type": null,
   "trainable_token_indices": null,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8fbfeadb937f621c6832145b91c7d94404fe8ee099eb497211fb6951fdde1b03
-size 1138873304

 version https://git-lfs.github.com/spec/v1
+oid sha256:7ceb99c3a3c514adbaf1adaf7b3d1cfecc7c2f0b42aa5a3066b47d9b60c54536
+size 1072781800

arguments.json CHANGED Viewed

@@ -2,7 +2,7 @@
     "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
     "recompute_baseline": false,
     "cache_dir": "/home/panda/pda-llm/cache/sft-tools",
-    "max_length": 2048,
     "trust_remote_code": true,
     "train_datasets": [
         [
@@ -13,10 +13,10 @@
         ]
     ],
     "eval_datasets": null,
-    "safety_ratio_tol": 100.0,
     "important_sft": true,
     "resilient_coeff": 1.0,
-    "epochs": 4,
     "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 1,
     "gradient_accumulation_steps": 48,
@@ -29,18 +29,18 @@
     "fp16": false,
     "bf16": true,
     "tf32": false,
-    "lora_r": 16,
     "lora_alpha": 32,
     "lora_dropout": 0.05,
     "eval_strategy": "epoch",
     "eval_interval": 1000000,
     "need_eval": true,
     "eval_split_ratio": null,
-    "output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-100",
     "log_type": "wandb",
-    "log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-100",
     "log_project": "TOOLS-SFT",
-    "log_run_name": "tools-sft-2025-05-09-04-39-40",
     "save_16bit": false,
     "save_interval": 1000000,
     "local_rank": 0,
@@ -55,6 +55,6 @@
         "type": "torch.device",
         "repr": "device(type='cuda', index=0)"
     },
-    "num_update_steps_per_epoch": 118,
-    "total_training_steps": 472
 }

     "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
     "recompute_baseline": false,
     "cache_dir": "/home/panda/pda-llm/cache/sft-tools",
+    "max_length": 4096,
     "trust_remote_code": true,
     "train_datasets": [
         [
         ]
     ],
     "eval_datasets": null,
+    "safety_ratio_tol": 50.0,
     "important_sft": true,
     "resilient_coeff": 1.0,
+    "epochs": 3,
     "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 1,
     "gradient_accumulation_steps": 48,
     "fp16": false,
     "bf16": true,
     "tf32": false,
+    "lora_r": 4,
     "lora_alpha": 32,
     "lora_dropout": 0.05,
     "eval_strategy": "epoch",
     "eval_interval": 1000000,
     "need_eval": true,
     "eval_split_ratio": null,
+    "output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
     "log_type": "wandb",
+    "log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
     "log_project": "TOOLS-SFT",
+    "log_run_name": "tools-sft-2025-05-11-14-32-03",
     "save_16bit": false,
     "save_interval": 1000000,
     "local_rank": 0,
         "type": "torch.device",
         "repr": "device(type='cuda', index=0)"
     },
+    "num_update_steps_per_epoch": 112,
+    "total_training_steps": 336
 }

arguments.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d2d1b5c0aceee6d5d6b2fff2845cdd64294c50293b9d08c3ccd0ff23a70879ef
-size 1225

 version https://git-lfs.github.com/spec/v1
+oid sha256:d44fd7cf7fb68b61cf52e153fc0789f3d95aa166ebbb56b1832e0b98a1536149
+size 1231

config.json CHANGED Viewed

@@ -20,6 +20,21 @@
   "num_key_value_heads": 8,
   "pad_token_id": 128256,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,

   "num_key_value_heads": 8,
   "pad_token_id": 128256,
   "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,

environ.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-BROWSER=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/bin/helpers/browser.sh
 COLORTERM=truecolor
 CONDA_DEFAULT_ENV=pda
 CONDA_EXE=/home/panda/miniconda3/bin/conda
@@ -6,13 +7,14 @@ CONDA_PREFIX=/home/panda/miniconda3/envs/pda
 CONDA_PREFIX_1=/home/panda/miniconda3
 CONDA_PROMPT_MODIFIER=(pda)
 CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
 CONDA_SHLVL=2
 CROSS_RANK=0
 CROSS_SIZE=1
 CUDA_MODULE_LOADING=LAZY
 CUDA_VISIBLE_DEVICES=0,1
 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
-GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/extensions/git/dist/askpass.sh
 HOME=/home/panda
 LANG=en_US.UTF-8
 LOCAL_RANK=0
@@ -20,39 +22,41 @@ LOCAL_SIZE=2
 LOGLEVEL=WARNING
 LOGNAME=panda
 MASTER_ADDR=127.0.0.1
-MASTER_PORT=58384
 MOTD_SHOWN=pam
-OLDPWD=/home/panda
-PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/panda/.vscode-server/data/User/globalStorage/github.copilot-chat/debugCommand
 PWD=/home/panda/pda-llm/scripts
 PYTHONHASHSEED=42
 PYTHONPATH=/home/panda/pda-llm
 RANK=0
 SHELL=/bin/sh
 SHLVL=4
-SSH_CLIENT=10.100.129.163 58321 22
-SSH_CONNECTION=10.100.129.163 58321 158.130.110.127 22
 SSL_CERT_DIR=/usr/lib/ssl/certs
 SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
 TERM=screen
 TERM_PROGRAM=vscode
-TERM_PROGRAM_VERSION=1.99.2
-TMUX=/tmp//tmux-1008/default,750869,0
 TMUX_PANE=%0
 USER=panda
 VSCODE_GIT_ASKPASS_EXTRA_ARGS=
-VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/extensions/git/dist/askpass-main.js
-VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-4949701c880d4bdb949e3c0e6b400288da7f474b/server/node
 VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
-VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-6dd97596-0fde-4c4b-bbcb-61896b21f983.sock
 WANDB_API_KEY=6a71e7fad84fe1aa8f6ccaa01e4e02fcf4c7ffb4
 WANDB_ENTITY=alelab
 WANDB_MODE=online
-WANDB_SERVICE=2-1843342-tcp-localhost-45547
 WORLD_SIZE=2
 XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
 XDG_RUNTIME_DIR=/run/user/1008
 XDG_SESSION_CLASS=user
-XDG_SESSION_ID=1945
 XDG_SESSION_TYPE=tty
 _=/home/panda/miniconda3/envs/pda/bin/deepspeed

+BROWSER=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/helpers/browser.sh
+BUNDLED_DEBUGPY_PATH=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy
 COLORTERM=truecolor
 CONDA_DEFAULT_ENV=pda
 CONDA_EXE=/home/panda/miniconda3/bin/conda
 CONDA_PREFIX_1=/home/panda/miniconda3
 CONDA_PROMPT_MODIFIER=(pda)
 CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
+CONDA_ROOT=/home/panda/miniconda3
 CONDA_SHLVL=2
 CROSS_RANK=0
 CROSS_SIZE=1
 CUDA_MODULE_LOADING=LAZY
 CUDA_VISIBLE_DEVICES=0,1
 DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
+GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass.sh
 HOME=/home/panda
 LANG=en_US.UTF-8
 LOCAL_RANK=0
 LOGLEVEL=WARNING
 LOGNAME=panda
 MASTER_ADDR=127.0.0.1
+MASTER_PORT=33558
 MOTD_SHOWN=pam
+OLDPWD=/home/panda/pda-llm
+PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/scripts/noConfigScripts
 PWD=/home/panda/pda-llm/scripts
+PYDEVD_DISABLE_FILE_VALIDATION=1
 PYTHONHASHSEED=42
 PYTHONPATH=/home/panda/pda-llm
 RANK=0
 SHELL=/bin/sh
 SHLVL=4
+SSH_CLIENT=10.103.69.12 60984 22
+SSH_CONNECTION=10.103.69.12 60984 158.130.110.127 22
 SSL_CERT_DIR=/usr/lib/ssl/certs
 SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
 TERM=screen
 TERM_PROGRAM=vscode
+TERM_PROGRAM_VERSION=1.96.4
+TMUX=/tmp/tmux-1008/default,1150618,0
 TMUX_PANE=%0
 USER=panda
+VSCODE_DEBUGPY_ADAPTER_ENDPOINTS=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/.noConfigDebugAdapterEndpoints/endpoint-c660b802f00341f2.txt
 VSCODE_GIT_ASKPASS_EXTRA_ARGS=
+VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass-main.js
+VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/node
 VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
+VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-4fc226f5-2ffc-4d5b-b737-82848f81acd2.sock
 WANDB_API_KEY=6a71e7fad84fe1aa8f6ccaa01e4e02fcf4c7ffb4
 WANDB_ENTITY=alelab
 WANDB_MODE=online
+WANDB_SERVICE=2-1151596-tcp-localhost-35011
 WORLD_SIZE=2
 XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
 XDG_RUNTIME_DIR=/run/user/1008
 XDG_SESSION_CLASS=user
+XDG_SESSION_ID=3272
 XDG_SESSION_TYPE=tty
 _=/home/panda/miniconda3/envs/pda/bin/deepspeed

script.sh CHANGED Viewed

@@ -40,6 +40,8 @@ SAFETY_RATIO_TOL=10
 RESILIENT_COEFF=1
 LEARNING_RATE=1e-4
 EPOCHS=3
 while [[ "$#" -gt 0 ]]; do
 	arg="$1"
 	shift
@@ -107,6 +109,20 @@ while [[ "$#" -gt 0 ]]; do
 		--epochs=*)
 			EPOCHS="${arg#*=}"
 			;;
 		--important_sft)
 			IMPORTANT_SFT="$1"
 			shift
@@ -121,7 +137,7 @@ while [[ "$#" -gt 0 ]]; do
 	esac
 done
-OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}"
 mkdir -p "${OUTPUT_DIR}"
 OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
 if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
@@ -170,7 +186,7 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
 	--model_name_or_path "${MODEL_NAME_OR_PATH}" \
 	--cache_dir "${ROOT_DIR}/cache/sft-tools" \
 	--important_sft "${IMPORTANT_SFT}"	 \
-	--max_length 2048 \
 	--trust_remote_code True \
 	--epochs "${EPOCHS}"  \
 	--per_device_train_batch_size 1 \
@@ -189,9 +205,10 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
 	--offload "${OFFLOAD}" \
 	--safety_ratio_tol "${SAFETY_RATIO_TOL}" \
 	--resilient_coeff "${RESILIENT_COEFF}" \
-	--lora_r "16" \
 	--lora_alpha "32" \
 	--lora_dropout "0.05" \
 	--bf16 True \
 	--fp16 False \
 	--tf32 False

 RESILIENT_COEFF=1
 LEARNING_RATE=1e-4
 EPOCHS=3
+LORA_R=4
+MAX_LENGTH=4096
 while [[ "$#" -gt 0 ]]; do
 	arg="$1"
 	shift
 		--epochs=*)
 			EPOCHS="${arg#*=}"
 			;;
+		--lora_r)
+			LORA_R="$1"
+			shift
+			;;
+		--lora_r=*)
+			LORA_R="${arg#*=}"
+			;;
+		--max_length)
+			MAX_LENGTH="$1"
+			shift
+			;;
+		--max_length=*)
+			MAX_LENGTH="${arg#*=}"
+			;;
 		--important_sft)
 			IMPORTANT_SFT="$1"
 			shift
 	esac
 done
+OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}-${LORA_R}-${MAX_LENGTH}"
 mkdir -p "${OUTPUT_DIR}"
 OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
 if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
 	--model_name_or_path "${MODEL_NAME_OR_PATH}" \
 	--cache_dir "${ROOT_DIR}/cache/sft-tools" \
 	--important_sft "${IMPORTANT_SFT}"	 \
+	--max_length "${MAX_LENGTH}" \
 	--trust_remote_code True \
 	--epochs "${EPOCHS}"  \
 	--per_device_train_batch_size 1 \
 	--offload "${OFFLOAD}" \
 	--safety_ratio_tol "${SAFETY_RATIO_TOL}" \
 	--resilient_coeff "${RESILIENT_COEFF}" \
+	--lora_r "${LORA_R}" \
 	--lora_alpha "32" \
 	--lora_dropout "0.05" \
+	--gradient_checkpointing \
 	--bf16 True \
 	--fp16 False \
 	--tf32 False

stderr.log CHANGED Viewed

The diff for this file is too large to render. See raw diff

stdout.log CHANGED Viewed

@@ -7,44 +7,44 @@ python version : 3.11.11
 CONDA_PREFIX   : /home/panda/miniconda3/envs/pda
 deepspeed:    /home/panda/miniconda3/envs/pda/bin/deepspeed
 --------------------------------------------
-[2025-05-09 04:38:56,329] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-05-09 04:38:58,379] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
 Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
-[2025-05-09 04:38:58,379] [INFO] [runner.py:605:main] cmd = /home/panda/miniconda3/envs/pda/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=58384 --module --enable_each_rank_log=None safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft true --max_length 2048 --trust_remote_code True --epochs 4 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-true-1-100 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 100 --resilient_coeff 1 --lora_r 16 --lora_alpha 32 --lora_dropout 0.05 --bf16 True --fp16 False --tf32 False
-[2025-05-09 04:38:59,543] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-05-09 04:39:01,538] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
-[2025-05-09 04:39:01,538] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
-[2025-05-09 04:39:01,539] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
-[2025-05-09 04:39:01,539] [INFO] [launch.py:164:main] dist_world_size=2
-[2025-05-09 04:39:01,539] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
-[2025-05-09 04:39:01,539] [INFO] [launch.py:256:main] process 1843342 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=0', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '4', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-100', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '100', '--resilient_coeff', '1', '--lora_r', '16', '--lora_alpha', '32', '--lora_dropout', '0.05', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
-[2025-05-09 04:39:01,540] [INFO] [launch.py:256:main] process 1843343 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=1', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '2048', '--trust_remote_code', 'True', '--epochs', '4', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-100', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '100', '--resilient_coeff', '1', '--lora_r', '16', '--lora_alpha', '32', '--lora_dropout', '0.05', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
-[2025-05-09 04:39:02,715] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-05-09 04:39:02,731] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[2025-05-09 04:39:05,820] [INFO] [comm.py:669:init_distributed] cdb=None
-[2025-05-09 04:39:05,848] [INFO] [comm.py:669:init_distributed] cdb=None
-[2025-05-09 04:39:05,848] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
 Set logger level to WARNING.
 calculating baseline ...
 calculating baseline ...
-Loading cached baseline logprobs from /home/panda/pda-llm/cache/sft-tools/cached_baseline_logprobs.pt
-Loaded cached baseline logprobs successfully
 ninja: no work to do.
-Time to load fused_adam op: 0.03549528121948242 seconds
-Time to load fused_adam op: 0.10125446319580078 seconds
 ***** Running training *****
 ***** Evaluating at the beginning *****
-***** Evaluating at epoch 1/4 *****
-***** Evaluating at epoch 2/4 *****
-***** Evaluating at epoch 3/4 *****
-***** Evaluating at epoch 4/4 *****
-Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-100" ...
 Saving Hugging Face Checkpoints...
 Model saved!
-[2025-05-09 08:48:57,409] [INFO] [launch.py:351:main] Process 1843343 exits successfully.
-[2025-05-09 08:49:01,410] [INFO] [launch.py:351:main] Process 1843342 exits successfully.

 CONDA_PREFIX   : /home/panda/miniconda3/envs/pda
 deepspeed:    /home/panda/miniconda3/envs/pda/bin/deepspeed
 --------------------------------------------
+[2025-05-11 13:53:40,343] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-05-11 13:53:42,372] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
 Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
+[2025-05-11 13:53:42,373] [INFO] [runner.py:605:main] cmd = /home/panda/miniconda3/envs/pda/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=33558 --module --enable_each_rank_log=None safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft true --max_length 4096 --trust_remote_code True --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 50 --resilient_coeff 1 --lora_r 4 --lora_alpha 32 --lora_dropout 0.05 --gradient_checkpointing --bf16 True --fp16 False --tf32 False
+[2025-05-11 13:53:43,535] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-05-11 13:53:45,544] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
+[2025-05-11 13:53:45,544] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
+[2025-05-11 13:53:45,544] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
+[2025-05-11 13:53:45,544] [INFO] [launch.py:164:main] dist_world_size=2
+[2025-05-11 13:53:45,544] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
+[2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151596 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=0', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
+[2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151597 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=1', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
+[2025-05-11 13:53:46,697] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-05-11 13:53:46,717] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2025-05-11 13:53:49,799] [INFO] [comm.py:669:init_distributed] cdb=None
+[2025-05-11 13:53:49,817] [INFO] [comm.py:669:init_distributed] cdb=None
+[2025-05-11 13:53:49,817] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
 Set logger level to WARNING.
 calculating baseline ...
 calculating baseline ...
+Computing baseline logprobs...
 ninja: no work to do.
+Time to load fused_adam op: 0.034948110580444336 seconds
+Saving computed baseline logprobs to /home/panda/pda-llm/cache/sft-tools/cached_baseline_logprobs.pt
+Saved baseline logprobs successfully
+ninja: no work to do.
+Time to load fused_adam op: 0.0341794490814209 seconds
 ***** Running training *****
 ***** Evaluating at the beginning *****
+***** Evaluating at epoch 1/3 *****
+***** Evaluating at epoch 2/3 *****
+***** Evaluating at epoch 3/3 *****
+Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
 Saving Hugging Face Checkpoints...
+[2025-05-11 17:31:35,204] [INFO] [launch.py:351:main] Process 1151597 exits successfully.
 Model saved!
+[2025-05-11 17:31:39,205] [INFO] [launch.py:351:main] Process 1151596 exits successfully.

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65b66ccdde0ce45c83b06f31e9f11272cade6aff26ec3cc9d6c49ede82b3ee2d
 size 17210383

 version https://git-lfs.github.com/spec/v1
+oid sha256:089fcc22ecea628694d8dcd8b57815f68d90d070e4012c9964a622d4473b14db
 size 17210383

tokenizer_config.json CHANGED Viewed

@@ -2074,7 +2074,7 @@
     "input_ids",
     "attention_mask"
   ],
-  "model_max_length": 2048,
   "pad_token": "<pad>",
   "padding_side": "right",
   "tokenizer_class": "PreTrainedTokenizer",

     "input_ids",
     "attention_mask"
   ],
+  "model_max_length": 4096,
   "pad_token": "<pad>",
   "padding_side": "right",
   "tokenizer_class": "PreTrainedTokenizer",

wandb/debug-internal.log CHANGED Viewed

@@ -1,18 +1,18 @@
-{"time":"2025-05-09T04:39:40.712021704-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug-core.log"}
-{"time":"2025-05-09T04:39:40.918633322-04:00","level":"INFO","msg":"created new stream","id":"6qw6u685"}
-{"time":"2025-05-09T04:39:40.918678813-04:00","level":"INFO","msg":"stream: started","id":"6qw6u685"}
-{"time":"2025-05-09T04:39:40.918742684-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"6qw6u685"}
-{"time":"2025-05-09T04:39:40.918770585-04:00","level":"INFO","msg":"handler: started","stream_id":"6qw6u685"}
-{"time":"2025-05-09T04:39:40.918834586-04:00","level":"INFO","msg":"sender: started","stream_id":"6qw6u685"}
-{"time":"2025-05-09T04:39:41.131260096-04:00","level":"INFO","msg":"Starting system monitor"}
-{"time":"2025-05-09T04:39:41.131298987-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
-{"time":"2025-05-09T04:39:41.131358468-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
-{"time":"2025-05-09T08:48:57.268396663-04:00","level":"INFO","msg":"Stopping system monitor"}
-{"time":"2025-05-09T08:48:57.268495575-04:00","level":"INFO","msg":"Stopped system monitor"}
-{"time":"2025-05-09T08:48:57.766542844-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
-{"time":"2025-05-09T08:48:57.905789479-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
-{"time":"2025-05-09T08:48:57.924367836-04:00","level":"INFO","msg":"stream: closing","id":"6qw6u685"}
-{"time":"2025-05-09T08:48:57.924388046-04:00","level":"INFO","msg":"handler: closed","stream_id":"6qw6u685"}
-{"time":"2025-05-09T08:48:57.924400406-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"6qw6u685"}
-{"time":"2025-05-09T08:48:57.924434197-04:00","level":"INFO","msg":"sender: closed","stream_id":"6qw6u685"}
-{"time":"2025-05-09T08:48:57.924510528-04:00","level":"INFO","msg":"stream: closed","id":"6qw6u685"}

+{"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
+{"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
+{"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
+{"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}

wandb/debug.log CHANGED Viewed

@@ -1,29 +1,29 @@
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_setup.py:_flush():68] Configure stats pid to 1843342
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_setup.py:_flush():68] Loading settings from environment variables
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug.log
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-100/wandb/run-20250509_043940-6qw6u685/logs/debug-internal.log
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:init():852] calling init triggers
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
-config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 2048, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 100.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 4, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-100', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-100', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-09-04-39-40', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 118, 'total_training_steps': 472, '_wandb': {}}
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:init():893] starting backend
-2025-05-09 04:39:40,707 INFO    MainThread:1843342 [wandb_init.py:init():897] sending inform_init request
-2025-05-09 04:39:40,709 INFO    MainThread:1843342 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
-2025-05-09 04:39:40,709 INFO    MainThread:1843342 [wandb_init.py:init():907] backend started and connected
-2025-05-09 04:39:40,711 INFO    MainThread:1843342 [wandb_init.py:init():1002] updated telemetry
-2025-05-09 04:39:40,717 INFO    MainThread:1843342 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
-2025-05-09 04:39:41,128 INFO    MainThread:1843342 [wandb_init.py:init():1101] starting run threads in backend
-2025-05-09 04:39:41,193 INFO    MainThread:1843342 [wandb_run.py:_console_start():2566] atexit reg
-2025-05-09 04:39:41,193 INFO    MainThread:1843342 [wandb_run.py:_redirect():2414] redirect: wrap_raw
-2025-05-09 04:39:41,193 INFO    MainThread:1843342 [wandb_run.py:_redirect():2483] Wrapping output streams.
-2025-05-09 04:39:41,193 INFO    MainThread:1843342 [wandb_run.py:_redirect():2506] Redirects installed.
-2025-05-09 04:39:41,194 INFO    MainThread:1843342 [wandb_init.py:init():1147] run started, returning control to user process
-2025-05-09 08:48:57,266 INFO    MainThread:1843342 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/6qw6u685
-2025-05-09 08:48:57,267 INFO    MainThread:1843342 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
-2025-05-09 08:48:57,267 INFO    MainThread:1843342 [wandb_run.py:_restore():2513] restore
-2025-05-09 08:48:57,267 INFO    MainThread:1843342 [wandb_run.py:_restore():2519] restore done
-2025-05-09 08:48:57,911 INFO    MainThread:1843342 [wandb_run.py:_footer_history_summary_info():4160] rendering history
-2025-05-09 08:48:57,914 INFO    MainThread:1843342 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
-2025-05-09 08:48:57,923 INFO    MainThread:1843342 [wandb_run.py:_footer_sync_info():4121] logging synced files

+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():852] calling init triggers
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():893] starting backend
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
+2025-05-11 14:32:04,250 INFO    MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-11 14:32:04,250 INFO    MainThread:1151596 [wandb_init.py:init():907] backend started and connected
+2025-05-11 14:32:04,252 INFO    MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
+2025-05-11 14:32:04,257 INFO    MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
+2025-05-11 14:32:04,568 INFO    MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
+2025-05-11 14:32:04,636 INFO    MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
+2025-05-11 14:32:04,638 INFO    MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
+2025-05-11 17:31:35,325 INFO    MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
+2025-05-11 17:31:35,326 INFO    MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
+2025-05-11 17:31:35,327 INFO    MainThread:1151596 [wandb_run.py:_restore():2513] restore
+2025-05-11 17:31:35,327 INFO    MainThread:1151596 [wandb_run.py:_restore():2519] restore done
+2025-05-11 17:31:35,957 INFO    MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
+2025-05-11 17:31:35,964 INFO    MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
+2025-05-11 17:31:35,968 INFO    MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files

wandb/run-20250511_143204-ws6emydu/files/config.yaml ADDED Viewed

	@@ -0,0 +1,134 @@

+_wandb:
+    value:
+        cli_version: 0.19.10
+        m: []
+        python_version: 3.11.11
+        t:
+            "1":
+                - 1
+                - 11
+                - 49
+                - 51
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 11
+                - 49
+                - 51
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 13
+                - 16
+                - 23
+                - 55
+                - 61
+            "4": 3.11.11
+            "5": 0.19.10
+            "6": 4.51.3
+            "8":
+                - 5
+            "12": 0.19.10
+            "13": linux-x86_64
+bf16:
+    value: true
+cache_dir:
+    value: /home/panda/pda-llm/cache/sft-tools
+deepscale:
+    value: false
+deepscale_config:
+    value: null
+deepspeed:
+    value: false
+deepspeed_config:
+    value: null
+device:
+    value: cuda:0
+epochs:
+    value: 3
+eval_datasets:
+    value: null
+eval_interval:
+    value: 1000000
+eval_split_ratio:
+    value: null
+eval_strategy:
+    value: epoch
+fp16:
+    value: false
+global_rank:
+    value: 0
+gradient_accumulation_steps:
+    value: 48
+gradient_checkpointing:
+    value: true
+important_sft:
+    value: true
+local_rank:
+    value: 0
+log_dir:
+    value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
+log_project:
+    value: TOOLS-SFT
+log_run_name:
+    value: tools-sft-2025-05-11-14-32-03
+log_type:
+    value: wandb
+lora_alpha:
+    value: 32
+lora_dropout:
+    value: 0.05
+lora_r:
+    value: 4
+lr:
+    value: 0.0001
+lr_scheduler_type:
+    value: COSINE
+lr_warmup_ratio:
+    value: 0.1
+max_length:
+    value: 4096
+model_name_or_path:
+    value: meta-llama/Llama-3.1-8B-Instruct
+need_eval:
+    value: true
+num_update_steps_per_epoch:
+    value: 112
+offload:
+    value: none
+output_dir:
+    value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
+per_device_eval_batch_size:
+    value: 1
+per_device_train_batch_size:
+    value: 1
+recompute_baseline:
+    value: false
+resilient_coeff:
+    value: 1
+safety_ratio_tol:
+    value: 50
+save_16bit:
+    value: false
+save_interval:
+    value: 1000000
+seed:
+    value: 42
+tf32:
+    value: false
+total_training_steps:
+    value: 336
+train_datasets:
+    value:
+        - - tools
+          - proportion: 1
+trust_remote_code:
+    value: true
+weight_decay:
+    value: 0
+zero_stage:
+    value: 0

wandb/run-20250511_143204-ws6emydu/files/output.log ADDED Viewed

	@@ -0,0 +1,15 @@

+***** Running training *****
+Training 3/3 epoch (loss 0.4680): 100%|██████████| 16050/16050 [2:59:28<00:00,  1.49it/s]
+***** Evaluating at the beginning *****
+***** Evaluating at epoch 1/3 *****
+***** Evaluating at epoch 2/3 *****
+***** Evaluating at epoch 3/3 *****
+Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
+Saving Hugging Face Checkpoints...
+/home/panda/miniconda3/envs/pda/lib/python3.11/site-packages/peft/utils/save_and_load.py:220: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
+  warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.")
+Model saved!

wandb/run-20250511_143204-ws6emydu/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,149 @@

+PySocks==1.7.1
+pip==25.1.1
+certifi==2025.4.26
+parso==0.8.4
+wcwidth==0.2.13
+nvidia-ml-py==12.535.133
+mkl_fft==1.3.11
+urllib3==2.3.0
+charset-normalizer==3.3.2
+transformers==4.51.3
+smmap==4.0.0
+xxhash==3.5.0
+etils==1.12.2
+platformdirs==4.3.8
+tzdata==2025.2
+ipython==9.2.0
+deepspeed==0.16.7
+gmpy2==2.2.1
+importlib_resources==6.5.2
+pydantic==2.10.3
+tqdm==4.67.1
+pyparsing==3.2.0
+exceptiongroup==1.2.2
+tokenizers==0.21.1
+Markdown==3.8
+matplotlib-inline==0.1.7
+dm_control==1.0.30
+jupyter_core==5.7.2
+pydantic_core==2.27.1
+hjson==3.1.0
+mkl_random==1.2.8
+ipykernel==6.29.5
+PyOpenGL==3.1.9
+sentry-sdk==2.18.0
+wandb==0.19.10
+einops==0.8.1
+prompt_toolkit==3.0.51
+Bottleneck==1.4.2
+msgpack==1.1.0
+fsspec==2024.12.0
+labmaze==1.0.6
+mdurl==0.1.0
+executing==2.2.0
+wheel==0.45.1
+bitsandbytes==0.45.5
+networkx==3.4.2
+comm==0.2.2
+asttokens==3.0.0
+nvitop==1.5.0
+sympy==1.13.1
+MarkupSafe==3.0.2
+numexpr==2.10.1
+tensorboard_data_server==0.7.0
+markdown-it-py==2.2.0
+wrapt==1.17.2
+six==1.17.0
+idna==3.7
+docker-pycreds==0.4.0
+jedi==0.19.2
+GitPython==3.1.43
+dm-env==1.6
+pure_eval==0.2.3
+ninja==1.11.1.4
+setuptools==80.1.0
+fonttools==4.55.3
+annotated-types==0.6.0
+psutil==7.0.0
+cycler==0.11.0
+py-cpuinfo==9.0.0
+accelerate==1.6.0
+dm-tree==0.1.9
+filelock==3.17.0
+pytz==2024.1
+rich==13.9.4
+appdirs==1.4.4
+click==8.1.8
+Jinja2==3.1.6
+unicodedata2==15.1.0
+pillow==11.1.0
+glfw==2.9.0
+traitlets==5.14.3
+stack_data==0.6.3
+peft==0.15.2
+python-dateutil==2.9.0.post0
+requests==2.32.3
+attrs==25.3.0
+Werkzeug==3.1.3
+gitdb==4.0.7
+lxml==5.4.0
+torch==2.5.1
+scipy==1.15.2
+sentencepiece==0.2.0
+huggingface-hub==0.31.1
+dill==0.3.8
+pexpect==4.9.0
+pickleshare==0.7.5
+ptyprocess==0.7.0
+contourpy==1.3.1
+grpcio==1.71.0
+optree==0.14.1
+safetensors==0.5.3
+mpmath==1.3.0
+nest_asyncio==1.6.0
+pyarrow==19.0.0
+PyYAML==6.0.2
+decorator==5.2.1
+pandas==2.2.3
+tensorboard==2.19.0
+zipp==3.21.0
+mujoco==3.3.2
+regex==2024.11.6
+Brotli==1.0.9
+mkl-service==2.4.0
+matplotlib==3.10.0
+Pygments==2.19.1
+absl-py==2.1.0
+hf-xet==1.1.0
+packaging==25.0
+jupyter_client==8.6.3
+triton==3.1.0
+multiprocess==0.70.15
+debugpy==1.8.14
+numpy==2.0.1
+protobuf==5.29.3
+tornado==6.4.2
+datasets==3.6.0
+eval_type_backport==0.2.2
+typing_extensions==4.12.2
+kiwisolver==1.4.8
+pyzmq==26.4.0
+ipython_pygments_lexers==1.1.1
+setproctitle==1.2.2
+importlib_metadata==8.6.1
+jaraco.text==3.12.1
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+jaraco.context==5.3.0
+more-itertools==10.3.0
+wheel==0.45.1
+jaraco.collections==5.1.0
+platformdirs==4.2.2
+autocommand==2.2.2
+zipp==3.19.2
+tomli==2.0.1
+typeguard==4.3.0
+packaging==24.2
+inflect==7.3.1
+typing_extensions==4.12.2
+jaraco.functools==4.0.1

wandb/run-20250511_143204-ws6emydu/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "os":  "Linux-5.15.0-138-generic-x86_64-with-glibc2.31",
+  "python":  "CPython 3.11.11",
+  "startedAt":  "2025-05-11T18:32:04.251126Z",
+  "args":  [
+    "--local_rank=0",
+    "--train_datasets",
+    "tools",
+    "--model_name_or_path",
+    "meta-llama/Llama-3.1-8B-Instruct",
+    "--cache_dir",
+    "/home/panda/pda-llm/cache/sft-tools",
+    "--important_sft",
+    "true",
+    "--max_length",
+    "4096",
+    "--trust_remote_code",
+    "True",
+    "--epochs",
+    "3",
+    "--per_device_train_batch_size",
+    "1",
+    "--per_device_eval_batch_size",
+    "1",
+    "--gradient_accumulation_steps",
+    "48",
+    "--gradient_checkpointing",
+    "--learning_rate",
+    "1e-4",
+    "--lr_scheduler_type",
+    "cosine",
+    "--lr_warmup_ratio",
+    "0.1",
+    "--weight_decay",
+    "0.0",
+    "--seed",
+    "42",
+    "--output_dir",
+    "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
+    "--log_type",
+    "wandb",
+    "--log_project",
+    "TOOLS-SFT",
+    "--zero_stage",
+    "0",
+    "--offload",
+    "none",
+    "--safety_ratio_tol",
+    "50",
+    "--resilient_coeff",
+    "1",
+    "--lora_r",
+    "4",
+    "--lora_alpha",
+    "32",
+    "--lora_dropout",
+    "0.05",
+    "--gradient_checkpointing",
+    "--bf16",
+    "True",
+    "--fp16",
+    "False",
+    "--tf32",
+    "False"
+  ],
+  "program":  "-m safe_rlhf.algorithms.tools_ft.__main__",
+  "git":  {
+    "remote":  "git@github.com:IgnacioBoero/pda-llm.git",
+    "commit":  "f4c70e99280e869cd565464446c9942daaf22425"
+  },
+  "email":  "iboero@upenn.edu",
+  "root":  "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
+  "host":  "plaza",
+  "executable":  "/home/panda/miniconda3/envs/pda/bin/python",
+  "cpu_count":  24,
+  "cpu_count_logical":  48,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  2,
+  "disk":  {
+    "/":  {
+      "total":  "982820896768",
+      "used":  "915338317824"
+    }
+  },
+  "memory":  {
+    "total":  "270090010624"
+  },
+  "cpu":  {
+    "count":  24,
+    "countLogical":  48
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "12.4"
+}

wandb/run-20250511_143204-ws6emydu/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"_runtime":10771.076681318,"_step":16050,"train/step":16050,"eval/hist_log_ratio":{"values":[1,1,2,0,2,0,1,2,0,3,4,5,10,4,2,7,5,6,2,3,6,9,6,6,13,5,1,2,8,5,6,6,4,5,4,6,3,2,4,4,3,0,0,4,1,2,4,2,0,0,3,0,3,2,4,0,3,1,0,0,1,1,0,1],"bins":[77.5,81.5,85.5,89.5,93.5,97.5,101.5625,105.5625,109.5625,113.5625,117.5625,121.625,125.625,129.625,133.625,137.625,141.625,145.625,149.625,153.625,157.625,161.75,165.75,169.75,173.75,177.75,181.75,185.75,189.75,193.75,197.75,201.75,205.75,209.75,213.75,217.75,221.75,225.75,229.75,233.75,237.75,241.875,245.875,249.875,253.875,258,262,266,270,274,278,282,286,290,294,298,302,306,310,314,318,322,326,330,334],"_type":"histogram"},"train/loss":0.4680471122264862,"eval/step":16050,"eval/min_log_ratio":77.5,"eval/max_log_ratio":334,"_timestamp":1.7469990929702783e+09,"_wandb":{"runtime":10771},"train/lr":1.0749763901607423e-08,"eval/mean_important_log_ratio":188,"train/epoch":3}

wandb/run-20250511_143204-ws6emydu/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,15 @@

+{"time":"2025-05-11T14:32:03.899110462-04:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmplwnn6mri/port-1151596.txt","pid":1151596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
+{"time":"2025-05-11T14:32:03.905019894-04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1151596}
+{"time":"2025-05-11T14:32:03.904995063-04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35011,"Zone":""}}
+{"time":"2025-05-11T14:32:04.083172335-04:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T14:32:04.25319672-04:00","level":"INFO","msg":"handleInformInit: received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T14:32:04.439955459-04:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"ws6emydu","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:35.968564433-04:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:35.968735567-04:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"ws6emydu","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.95829108-04:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.958316571-04:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.958328081-04:00","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-05-11T17:31:36.958329541-04:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.958416902-04:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.958428293-04:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:37156"}
+{"time":"2025-05-11T17:31:36.958438183-04:00","level":"INFO","msg":"server is closed"}

wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,18 @@

+{"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
+{"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
+{"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
+{"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
+{"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
+{"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}

wandb/run-20250511_143204-ws6emydu/logs/debug.log ADDED Viewed

	@@ -0,0 +1,29 @@

+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():852] calling init triggers
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():893] starting backend
+2025-05-11 14:32:04,248 INFO    MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
+2025-05-11 14:32:04,250 INFO    MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-11 14:32:04,250 INFO    MainThread:1151596 [wandb_init.py:init():907] backend started and connected
+2025-05-11 14:32:04,252 INFO    MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
+2025-05-11 14:32:04,257 INFO    MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
+2025-05-11 14:32:04,568 INFO    MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
+2025-05-11 14:32:04,636 INFO    MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
+2025-05-11 14:32:04,637 INFO    MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
+2025-05-11 14:32:04,638 INFO    MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
+2025-05-11 17:31:35,325 INFO    MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
+2025-05-11 17:31:35,326 INFO    MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
+2025-05-11 17:31:35,327 INFO    MainThread:1151596 [wandb_run.py:_restore():2513] restore
+2025-05-11 17:31:35,327 INFO    MainThread:1151596 [wandb_run.py:_restore():2519] restore done
+2025-05-11 17:31:35,957 INFO    MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
+2025-05-11 17:31:35,964 INFO    MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
+2025-05-11 17:31:35,968 INFO    MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files

wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d75c95736964eb77dd5d326d209df011e412e887998fb330d51e0946a94f7cc4
+size 14008017