Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- adapter_config.json +6 -6
- adapter_model.safetensors +2 -2
- arguments.json +9 -9
- arguments.pkl +2 -2
- config.json +15 -0
- environ.txt +18 -14
- script.sh +20 -3
- stderr.log +0 -0
- stdout.log +28 -28
- tokenizer.json +1 -1
- tokenizer_config.json +1 -1
- wandb/debug-internal.log +18 -18
- wandb/debug.log +29 -29
- wandb/run-20250511_143204-ws6emydu/files/config.yaml +134 -0
- wandb/run-20250511_143204-ws6emydu/files/output.log +15 -0
- wandb/run-20250511_143204-ws6emydu/files/requirements.txt +149 -0
- wandb/run-20250511_143204-ws6emydu/files/wandb-metadata.json +107 -0
- wandb/run-20250511_143204-ws6emydu/files/wandb-summary.json +1 -0
- wandb/run-20250511_143204-ws6emydu/logs/debug-core.log +15 -0
- wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log +18 -0
- wandb/run-20250511_143204-ws6emydu/logs/debug.log +29 -0
- wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb +3 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
wandb/run-20250508_194038-uq5zwcwz/run-uq5zwcwz.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
wandb/run-20250509_043940-6qw6u685/run-6qw6u685.wandb filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb filter=lfs diff=lfs merge=lfs -text
|
adapter_config.json
CHANGED
|
@@ -23,18 +23,18 @@
|
|
| 23 |
"megatron_core": "megatron.core",
|
| 24 |
"modules_to_save": null,
|
| 25 |
"peft_type": "LORA",
|
| 26 |
-
"r":
|
| 27 |
"rank_pattern": {},
|
| 28 |
"revision": null,
|
| 29 |
"target_modules": [
|
|
|
|
|
|
|
| 30 |
"down_proj",
|
| 31 |
"lm_head",
|
| 32 |
-
"gate_proj",
|
| 33 |
-
"v_proj",
|
| 34 |
-
"k_proj",
|
| 35 |
-
"up_proj",
|
| 36 |
"o_proj",
|
| 37 |
-
"
|
|
|
|
|
|
|
| 38 |
],
|
| 39 |
"task_type": null,
|
| 40 |
"trainable_token_indices": null,
|
|
|
|
| 23 |
"megatron_core": "megatron.core",
|
| 24 |
"modules_to_save": null,
|
| 25 |
"peft_type": "LORA",
|
| 26 |
+
"r": 4,
|
| 27 |
"rank_pattern": {},
|
| 28 |
"revision": null,
|
| 29 |
"target_modules": [
|
| 30 |
+
"v_proj",
|
| 31 |
+
"gate_proj",
|
| 32 |
"down_proj",
|
| 33 |
"lm_head",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"o_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"q_proj",
|
| 37 |
+
"k_proj"
|
| 38 |
],
|
| 39 |
"task_type": null,
|
| 40 |
"trainable_token_indices": null,
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ceb99c3a3c514adbaf1adaf7b3d1cfecc7c2f0b42aa5a3066b47d9b60c54536
|
| 3 |
+
size 1072781800
|
arguments.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
|
| 3 |
"recompute_baseline": false,
|
| 4 |
"cache_dir": "/home/panda/pda-llm/cache/sft-tools",
|
| 5 |
-
"max_length":
|
| 6 |
"trust_remote_code": true,
|
| 7 |
"train_datasets": [
|
| 8 |
[
|
|
@@ -13,10 +13,10 @@
|
|
| 13 |
]
|
| 14 |
],
|
| 15 |
"eval_datasets": null,
|
| 16 |
-
"safety_ratio_tol":
|
| 17 |
"important_sft": true,
|
| 18 |
"resilient_coeff": 1.0,
|
| 19 |
-
"epochs":
|
| 20 |
"per_device_train_batch_size": 1,
|
| 21 |
"per_device_eval_batch_size": 1,
|
| 22 |
"gradient_accumulation_steps": 48,
|
|
@@ -29,18 +29,18 @@
|
|
| 29 |
"fp16": false,
|
| 30 |
"bf16": true,
|
| 31 |
"tf32": false,
|
| 32 |
-
"lora_r":
|
| 33 |
"lora_alpha": 32,
|
| 34 |
"lora_dropout": 0.05,
|
| 35 |
"eval_strategy": "epoch",
|
| 36 |
"eval_interval": 1000000,
|
| 37 |
"need_eval": true,
|
| 38 |
"eval_split_ratio": null,
|
| 39 |
-
"output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-
|
| 40 |
"log_type": "wandb",
|
| 41 |
-
"log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-
|
| 42 |
"log_project": "TOOLS-SFT",
|
| 43 |
-
"log_run_name": "tools-sft-2025-05-
|
| 44 |
"save_16bit": false,
|
| 45 |
"save_interval": 1000000,
|
| 46 |
"local_rank": 0,
|
|
@@ -55,6 +55,6 @@
|
|
| 55 |
"type": "torch.device",
|
| 56 |
"repr": "device(type='cuda', index=0)"
|
| 57 |
},
|
| 58 |
-
"num_update_steps_per_epoch":
|
| 59 |
-
"total_training_steps":
|
| 60 |
}
|
|
|
|
| 2 |
"model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
|
| 3 |
"recompute_baseline": false,
|
| 4 |
"cache_dir": "/home/panda/pda-llm/cache/sft-tools",
|
| 5 |
+
"max_length": 4096,
|
| 6 |
"trust_remote_code": true,
|
| 7 |
"train_datasets": [
|
| 8 |
[
|
|
|
|
| 13 |
]
|
| 14 |
],
|
| 15 |
"eval_datasets": null,
|
| 16 |
+
"safety_ratio_tol": 50.0,
|
| 17 |
"important_sft": true,
|
| 18 |
"resilient_coeff": 1.0,
|
| 19 |
+
"epochs": 3,
|
| 20 |
"per_device_train_batch_size": 1,
|
| 21 |
"per_device_eval_batch_size": 1,
|
| 22 |
"gradient_accumulation_steps": 48,
|
|
|
|
| 29 |
"fp16": false,
|
| 30 |
"bf16": true,
|
| 31 |
"tf32": false,
|
| 32 |
+
"lora_r": 4,
|
| 33 |
"lora_alpha": 32,
|
| 34 |
"lora_dropout": 0.05,
|
| 35 |
"eval_strategy": "epoch",
|
| 36 |
"eval_interval": 1000000,
|
| 37 |
"need_eval": true,
|
| 38 |
"eval_split_ratio": null,
|
| 39 |
+
"output_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
|
| 40 |
"log_type": "wandb",
|
| 41 |
+
"log_dir": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
|
| 42 |
"log_project": "TOOLS-SFT",
|
| 43 |
+
"log_run_name": "tools-sft-2025-05-11-14-32-03",
|
| 44 |
"save_16bit": false,
|
| 45 |
"save_interval": 1000000,
|
| 46 |
"local_rank": 0,
|
|
|
|
| 55 |
"type": "torch.device",
|
| 56 |
"repr": "device(type='cuda', index=0)"
|
| 57 |
},
|
| 58 |
+
"num_update_steps_per_epoch": 112,
|
| 59 |
+
"total_training_steps": 336
|
| 60 |
}
|
arguments.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d44fd7cf7fb68b61cf52e153fc0789f3d95aa166ebbb56b1832e0b98a1536149
|
| 3 |
+
size 1231
|
config.json
CHANGED
|
@@ -20,6 +20,21 @@
|
|
| 20 |
"num_key_value_heads": 8,
|
| 21 |
"pad_token_id": 128256,
|
| 22 |
"pretraining_tp": 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"rms_norm_eps": 1e-05,
|
| 24 |
"rope_scaling": {
|
| 25 |
"factor": 8.0,
|
|
|
|
| 20 |
"num_key_value_heads": 8,
|
| 21 |
"pad_token_id": 128256,
|
| 22 |
"pretraining_tp": 1,
|
| 23 |
+
"quantization_config": {
|
| 24 |
+
"_load_in_4bit": true,
|
| 25 |
+
"_load_in_8bit": false,
|
| 26 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 27 |
+
"bnb_4bit_quant_storage": "uint8",
|
| 28 |
+
"bnb_4bit_quant_type": "nf4",
|
| 29 |
+
"bnb_4bit_use_double_quant": true,
|
| 30 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
| 31 |
+
"llm_int8_has_fp16_weight": false,
|
| 32 |
+
"llm_int8_skip_modules": null,
|
| 33 |
+
"llm_int8_threshold": 6.0,
|
| 34 |
+
"load_in_4bit": true,
|
| 35 |
+
"load_in_8bit": false,
|
| 36 |
+
"quant_method": "bitsandbytes"
|
| 37 |
+
},
|
| 38 |
"rms_norm_eps": 1e-05,
|
| 39 |
"rope_scaling": {
|
| 40 |
"factor": 8.0,
|
environ.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
BROWSER=/home/panda/.vscode-server/cli/servers/Stable-
|
|
|
|
| 2 |
COLORTERM=truecolor
|
| 3 |
CONDA_DEFAULT_ENV=pda
|
| 4 |
CONDA_EXE=/home/panda/miniconda3/bin/conda
|
|
@@ -6,13 +7,14 @@ CONDA_PREFIX=/home/panda/miniconda3/envs/pda
|
|
| 6 |
CONDA_PREFIX_1=/home/panda/miniconda3
|
| 7 |
CONDA_PROMPT_MODIFIER=(pda)
|
| 8 |
CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
|
|
|
|
| 9 |
CONDA_SHLVL=2
|
| 10 |
CROSS_RANK=0
|
| 11 |
CROSS_SIZE=1
|
| 12 |
CUDA_MODULE_LOADING=LAZY
|
| 13 |
CUDA_VISIBLE_DEVICES=0,1
|
| 14 |
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
|
| 15 |
-
GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-
|
| 16 |
HOME=/home/panda
|
| 17 |
LANG=en_US.UTF-8
|
| 18 |
LOCAL_RANK=0
|
|
@@ -20,39 +22,41 @@ LOCAL_SIZE=2
|
|
| 20 |
LOGLEVEL=WARNING
|
| 21 |
LOGNAME=panda
|
| 22 |
MASTER_ADDR=127.0.0.1
|
| 23 |
-
MASTER_PORT=
|
| 24 |
MOTD_SHOWN=pam
|
| 25 |
-
OLDPWD=/home/panda
|
| 26 |
-
PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-
|
| 27 |
PWD=/home/panda/pda-llm/scripts
|
|
|
|
| 28 |
PYTHONHASHSEED=42
|
| 29 |
PYTHONPATH=/home/panda/pda-llm
|
| 30 |
RANK=0
|
| 31 |
SHELL=/bin/sh
|
| 32 |
SHLVL=4
|
| 33 |
-
SSH_CLIENT=10.
|
| 34 |
-
SSH_CONNECTION=10.
|
| 35 |
SSL_CERT_DIR=/usr/lib/ssl/certs
|
| 36 |
SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
|
| 37 |
TERM=screen
|
| 38 |
TERM_PROGRAM=vscode
|
| 39 |
-
TERM_PROGRAM_VERSION=1.
|
| 40 |
-
TMUX=/tmp/
|
| 41 |
TMUX_PANE=%0
|
| 42 |
USER=panda
|
|
|
|
| 43 |
VSCODE_GIT_ASKPASS_EXTRA_ARGS=
|
| 44 |
-
VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-
|
| 45 |
-
VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-
|
| 46 |
VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
|
| 47 |
-
VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-
|
| 48 |
WANDB_API_KEY=6a71e7fad84fe1aa8f6ccaa01e4e02fcf4c7ffb4
|
| 49 |
WANDB_ENTITY=alelab
|
| 50 |
WANDB_MODE=online
|
| 51 |
-
WANDB_SERVICE=2-
|
| 52 |
WORLD_SIZE=2
|
| 53 |
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
|
| 54 |
XDG_RUNTIME_DIR=/run/user/1008
|
| 55 |
XDG_SESSION_CLASS=user
|
| 56 |
-
XDG_SESSION_ID=
|
| 57 |
XDG_SESSION_TYPE=tty
|
| 58 |
_=/home/panda/miniconda3/envs/pda/bin/deepspeed
|
|
|
|
| 1 |
+
BROWSER=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/helpers/browser.sh
|
| 2 |
+
BUNDLED_DEBUGPY_PATH=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/libs/debugpy
|
| 3 |
COLORTERM=truecolor
|
| 4 |
CONDA_DEFAULT_ENV=pda
|
| 5 |
CONDA_EXE=/home/panda/miniconda3/bin/conda
|
|
|
|
| 7 |
CONDA_PREFIX_1=/home/panda/miniconda3
|
| 8 |
CONDA_PROMPT_MODIFIER=(pda)
|
| 9 |
CONDA_PYTHON_EXE=/home/panda/miniconda3/bin/python
|
| 10 |
+
CONDA_ROOT=/home/panda/miniconda3
|
| 11 |
CONDA_SHLVL=2
|
| 12 |
CROSS_RANK=0
|
| 13 |
CROSS_SIZE=1
|
| 14 |
CUDA_MODULE_LOADING=LAZY
|
| 15 |
CUDA_VISIBLE_DEVICES=0,1
|
| 16 |
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1008/bus
|
| 17 |
+
GIT_ASKPASS=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass.sh
|
| 18 |
HOME=/home/panda
|
| 19 |
LANG=en_US.UTF-8
|
| 20 |
LOCAL_RANK=0
|
|
|
|
| 22 |
LOGLEVEL=WARNING
|
| 23 |
LOGNAME=panda
|
| 24 |
MASTER_ADDR=127.0.0.1
|
| 25 |
+
MASTER_PORT=33558
|
| 26 |
MOTD_SHOWN=pam
|
| 27 |
+
OLDPWD=/home/panda/pda-llm
|
| 28 |
+
PATH=/home/panda/miniconda3/envs/pda/bin:/home/panda/miniconda3/condabin:/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/bin/remote-cli:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/bundled/scripts/noConfigScripts
|
| 29 |
PWD=/home/panda/pda-llm/scripts
|
| 30 |
+
PYDEVD_DISABLE_FILE_VALIDATION=1
|
| 31 |
PYTHONHASHSEED=42
|
| 32 |
PYTHONPATH=/home/panda/pda-llm
|
| 33 |
RANK=0
|
| 34 |
SHELL=/bin/sh
|
| 35 |
SHLVL=4
|
| 36 |
+
SSH_CLIENT=10.103.69.12 60984 22
|
| 37 |
+
SSH_CONNECTION=10.103.69.12 60984 158.130.110.127 22
|
| 38 |
SSL_CERT_DIR=/usr/lib/ssl/certs
|
| 39 |
SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
|
| 40 |
TERM=screen
|
| 41 |
TERM_PROGRAM=vscode
|
| 42 |
+
TERM_PROGRAM_VERSION=1.96.4
|
| 43 |
+
TMUX=/tmp/tmux-1008/default,1150618,0
|
| 44 |
TMUX_PANE=%0
|
| 45 |
USER=panda
|
| 46 |
+
VSCODE_DEBUGPY_ADAPTER_ENDPOINTS=/home/panda/.vscode-server/extensions/ms-python.debugpy-2025.8.0-linux-x64/.noConfigDebugAdapterEndpoints/endpoint-c660b802f00341f2.txt
|
| 47 |
VSCODE_GIT_ASKPASS_EXTRA_ARGS=
|
| 48 |
+
VSCODE_GIT_ASKPASS_MAIN=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/extensions/git/dist/askpass-main.js
|
| 49 |
+
VSCODE_GIT_ASKPASS_NODE=/home/panda/.vscode-server/cli/servers/Stable-cd4ee3b1c348a13bafd8f9ad8060705f6d4b9cba/server/node
|
| 50 |
VSCODE_GIT_IPC_HANDLE=/run/user/1008/vscode-git-3d8733097b.sock
|
| 51 |
+
VSCODE_IPC_HOOK_CLI=/run/user/1008/vscode-ipc-4fc226f5-2ffc-4d5b-b737-82848f81acd2.sock
|
| 52 |
WANDB_API_KEY=6a71e7fad84fe1aa8f6ccaa01e4e02fcf4c7ffb4
|
| 53 |
WANDB_ENTITY=alelab
|
| 54 |
WANDB_MODE=online
|
| 55 |
+
WANDB_SERVICE=2-1151596-tcp-localhost-35011
|
| 56 |
WORLD_SIZE=2
|
| 57 |
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
|
| 58 |
XDG_RUNTIME_DIR=/run/user/1008
|
| 59 |
XDG_SESSION_CLASS=user
|
| 60 |
+
XDG_SESSION_ID=3272
|
| 61 |
XDG_SESSION_TYPE=tty
|
| 62 |
_=/home/panda/miniconda3/envs/pda/bin/deepspeed
|
script.sh
CHANGED
|
@@ -40,6 +40,8 @@ SAFETY_RATIO_TOL=10
|
|
| 40 |
RESILIENT_COEFF=1
|
| 41 |
LEARNING_RATE=1e-4
|
| 42 |
EPOCHS=3
|
|
|
|
|
|
|
| 43 |
while [[ "$#" -gt 0 ]]; do
|
| 44 |
arg="$1"
|
| 45 |
shift
|
|
@@ -107,6 +109,20 @@ while [[ "$#" -gt 0 ]]; do
|
|
| 107 |
--epochs=*)
|
| 108 |
EPOCHS="${arg#*=}"
|
| 109 |
;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
--important_sft)
|
| 111 |
IMPORTANT_SFT="$1"
|
| 112 |
shift
|
|
@@ -121,7 +137,7 @@ while [[ "$#" -gt 0 ]]; do
|
|
| 121 |
esac
|
| 122 |
done
|
| 123 |
|
| 124 |
-
OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}"
|
| 125 |
mkdir -p "${OUTPUT_DIR}"
|
| 126 |
OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
|
| 127 |
if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
|
|
@@ -170,7 +186,7 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
|
|
| 170 |
--model_name_or_path "${MODEL_NAME_OR_PATH}" \
|
| 171 |
--cache_dir "${ROOT_DIR}/cache/sft-tools" \
|
| 172 |
--important_sft "${IMPORTANT_SFT}" \
|
| 173 |
-
--max_length
|
| 174 |
--trust_remote_code True \
|
| 175 |
--epochs "${EPOCHS}" \
|
| 176 |
--per_device_train_batch_size 1 \
|
|
@@ -189,9 +205,10 @@ CUDA_VISIBLE_DEVICES=0,1 deepspeed "${DEEPSPEED_ARGS[@]}" \
|
|
| 189 |
--offload "${OFFLOAD}" \
|
| 190 |
--safety_ratio_tol "${SAFETY_RATIO_TOL}" \
|
| 191 |
--resilient_coeff "${RESILIENT_COEFF}" \
|
| 192 |
-
--lora_r "
|
| 193 |
--lora_alpha "32" \
|
| 194 |
--lora_dropout "0.05" \
|
|
|
|
| 195 |
--bf16 True \
|
| 196 |
--fp16 False \
|
| 197 |
--tf32 False
|
|
|
|
| 40 |
RESILIENT_COEFF=1
|
| 41 |
LEARNING_RATE=1e-4
|
| 42 |
EPOCHS=3
|
| 43 |
+
LORA_R=4
|
| 44 |
+
MAX_LENGTH=4096
|
| 45 |
while [[ "$#" -gt 0 ]]; do
|
| 46 |
arg="$1"
|
| 47 |
shift
|
|
|
|
| 109 |
--epochs=*)
|
| 110 |
EPOCHS="${arg#*=}"
|
| 111 |
;;
|
| 112 |
+
--lora_r)
|
| 113 |
+
LORA_R="$1"
|
| 114 |
+
shift
|
| 115 |
+
;;
|
| 116 |
+
--lora_r=*)
|
| 117 |
+
LORA_R="${arg#*=}"
|
| 118 |
+
;;
|
| 119 |
+
--max_length)
|
| 120 |
+
MAX_LENGTH="$1"
|
| 121 |
+
shift
|
| 122 |
+
;;
|
| 123 |
+
--max_length=*)
|
| 124 |
+
MAX_LENGTH="${arg#*=}"
|
| 125 |
+
;;
|
| 126 |
--important_sft)
|
| 127 |
IMPORTANT_SFT="$1"
|
| 128 |
shift
|
|
|
|
| 137 |
esac
|
| 138 |
done
|
| 139 |
|
| 140 |
+
OUTPUT_DIR="${ROOT_DIR}/output/sft-tools/run-${IMPORTANT_SFT}-${RESILIENT_COEFF}-${SAFETY_RATIO_TOL}-${LORA_R}-${MAX_LENGTH}"
|
| 141 |
mkdir -p "${OUTPUT_DIR}"
|
| 142 |
OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
|
| 143 |
if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
|
|
|
|
| 186 |
--model_name_or_path "${MODEL_NAME_OR_PATH}" \
|
| 187 |
--cache_dir "${ROOT_DIR}/cache/sft-tools" \
|
| 188 |
--important_sft "${IMPORTANT_SFT}" \
|
| 189 |
+
--max_length "${MAX_LENGTH}" \
|
| 190 |
--trust_remote_code True \
|
| 191 |
--epochs "${EPOCHS}" \
|
| 192 |
--per_device_train_batch_size 1 \
|
|
|
|
| 205 |
--offload "${OFFLOAD}" \
|
| 206 |
--safety_ratio_tol "${SAFETY_RATIO_TOL}" \
|
| 207 |
--resilient_coeff "${RESILIENT_COEFF}" \
|
| 208 |
+
--lora_r "${LORA_R}" \
|
| 209 |
--lora_alpha "32" \
|
| 210 |
--lora_dropout "0.05" \
|
| 211 |
+
--gradient_checkpointing \
|
| 212 |
--bf16 True \
|
| 213 |
--fp16 False \
|
| 214 |
--tf32 False
|
stderr.log
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
stdout.log
CHANGED
|
@@ -7,44 +7,44 @@ python version : 3.11.11
|
|
| 7 |
CONDA_PREFIX : /home/panda/miniconda3/envs/pda
|
| 8 |
deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed
|
| 9 |
--------------------------------------------
|
| 10 |
-
[2025-05-
|
| 11 |
-
[2025-05-
|
| 12 |
Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
|
| 13 |
-
[2025-05-
|
| 14 |
-
[2025-05-
|
| 15 |
-
[2025-05-
|
| 16 |
-
[2025-05-
|
| 17 |
-
[2025-05-
|
| 18 |
-
[2025-05-
|
| 19 |
-
[2025-05-
|
| 20 |
-
[2025-05-
|
| 21 |
-
[2025-05-
|
| 22 |
-
[2025-05-
|
| 23 |
-
[2025-05-
|
| 24 |
-
[2025-05-
|
| 25 |
-
[2025-05-
|
| 26 |
-
[2025-05-
|
| 27 |
Set logger level to WARNING.
|
| 28 |
calculating baseline ...
|
| 29 |
calculating baseline ...
|
| 30 |
-
|
| 31 |
-
Loaded cached baseline logprobs successfully
|
| 32 |
ninja: no work to do.
|
| 33 |
-
Time to load fused_adam op: 0.
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
***** Running training *****
|
| 36 |
|
| 37 |
***** Evaluating at the beginning *****
|
| 38 |
|
| 39 |
-
***** Evaluating at epoch 1/
|
| 40 |
-
|
| 41 |
-
***** Evaluating at epoch 2/4 *****
|
| 42 |
|
| 43 |
-
***** Evaluating at epoch
|
| 44 |
|
| 45 |
-
***** Evaluating at epoch
|
| 46 |
-
Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-
|
| 47 |
Saving Hugging Face Checkpoints...
|
|
|
|
| 48 |
Model saved!
|
| 49 |
-
[2025-05-
|
| 50 |
-
[2025-05-09 08:49:01,410] [INFO] [launch.py:351:main] Process 1843342 exits successfully.
|
|
|
|
| 7 |
CONDA_PREFIX : /home/panda/miniconda3/envs/pda
|
| 8 |
deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed
|
| 9 |
--------------------------------------------
|
| 10 |
+
[2025-05-11 13:53:40,343] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 11 |
+
[2025-05-11 13:53:42,372] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
|
| 12 |
Detected VISIBLE_DEVICES=0,1: setting --include=localhost:0,1
|
| 13 |
+
[2025-05-11 13:53:42,373] [INFO] [runner.py:605:main] cmd = /home/panda/miniconda3/envs/pda/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=33558 --module --enable_each_rank_log=None safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft true --max_length 4096 --trust_remote_code True --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 50 --resilient_coeff 1 --lora_r 4 --lora_alpha 32 --lora_dropout 0.05 --gradient_checkpointing --bf16 True --fp16 False --tf32 False
|
| 14 |
+
[2025-05-11 13:53:43,535] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 15 |
+
[2025-05-11 13:53:45,544] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
|
| 16 |
+
[2025-05-11 13:53:45,544] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
|
| 17 |
+
[2025-05-11 13:53:45,544] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
|
| 18 |
+
[2025-05-11 13:53:45,544] [INFO] [launch.py:164:main] dist_world_size=2
|
| 19 |
+
[2025-05-11 13:53:45,544] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
|
| 20 |
+
[2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151596 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=0', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
|
| 21 |
+
[2025-05-11 13:53:45,545] [INFO] [launch.py:256:main] process 1151597 spawned with command: ['/home/panda/miniconda3/envs/pda/bin/python', '-u', '-m', 'safe_rlhf.algorithms.tools_ft', '--local_rank=1', '--train_datasets', 'tools', '--model_name_or_path', 'meta-llama/Llama-3.1-8B-Instruct', '--cache_dir', '/home/panda/pda-llm/cache/sft-tools', '--important_sft', 'true', '--max_length', '4096', '--trust_remote_code', 'True', '--epochs', '3', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '48', '--gradient_checkpointing', '--learning_rate', '1e-4', '--lr_scheduler_type', 'cosine', '--lr_warmup_ratio', '0.1', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', '--log_type', 'wandb', '--log_project', 'TOOLS-SFT', '--zero_stage', '0', '--offload', 'none', '--safety_ratio_tol', '50', '--resilient_coeff', '1', '--lora_r', '4', '--lora_alpha', '32', '--lora_dropout', '0.05', '--gradient_checkpointing', '--bf16', 'True', '--fp16', 'False', '--tf32', 'False']
|
| 22 |
+
[2025-05-11 13:53:46,697] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 23 |
+
[2025-05-11 13:53:46,717] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
| 24 |
+
[2025-05-11 13:53:49,799] [INFO] [comm.py:669:init_distributed] cdb=None
|
| 25 |
+
[2025-05-11 13:53:49,817] [INFO] [comm.py:669:init_distributed] cdb=None
|
| 26 |
+
[2025-05-11 13:53:49,817] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
| 27 |
Set logger level to WARNING.
|
| 28 |
calculating baseline ...
|
| 29 |
calculating baseline ...
|
| 30 |
+
Computing baseline logprobs...
|
|
|
|
| 31 |
ninja: no work to do.
|
| 32 |
+
Time to load fused_adam op: 0.034948110580444336 seconds
|
| 33 |
+
Saving computed baseline logprobs to /home/panda/pda-llm/cache/sft-tools/cached_baseline_logprobs.pt
|
| 34 |
+
Saved baseline logprobs successfully
|
| 35 |
+
ninja: no work to do.
|
| 36 |
+
Time to load fused_adam op: 0.0341794490814209 seconds
|
| 37 |
***** Running training *****
|
| 38 |
|
| 39 |
***** Evaluating at the beginning *****
|
| 40 |
|
| 41 |
+
***** Evaluating at epoch 1/3 *****
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
***** Evaluating at epoch 2/3 *****
|
| 44 |
|
| 45 |
+
***** Evaluating at epoch 3/3 *****
|
| 46 |
+
Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
|
| 47 |
Saving Hugging Face Checkpoints...
|
| 48 |
+
[2025-05-11 17:31:35,204] [INFO] [launch.py:351:main] Process 1151597 exits successfully.
|
| 49 |
Model saved!
|
| 50 |
+
[2025-05-11 17:31:39,205] [INFO] [launch.py:351:main] Process 1151596 exits successfully.
|
|
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 17210383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:089fcc22ecea628694d8dcd8b57815f68d90d070e4012c9964a622d4473b14db
|
| 3 |
size 17210383
|
tokenizer_config.json
CHANGED
|
@@ -2074,7 +2074,7 @@
|
|
| 2074 |
"input_ids",
|
| 2075 |
"attention_mask"
|
| 2076 |
],
|
| 2077 |
-
"model_max_length":
|
| 2078 |
"pad_token": "<pad>",
|
| 2079 |
"padding_side": "right",
|
| 2080 |
"tokenizer_class": "PreTrainedTokenizer",
|
|
|
|
| 2074 |
"input_ids",
|
| 2075 |
"attention_mask"
|
| 2076 |
],
|
| 2077 |
+
"model_max_length": 4096,
|
| 2078 |
"pad_token": "<pad>",
|
| 2079 |
"padding_side": "right",
|
| 2080 |
"tokenizer_class": "PreTrainedTokenizer",
|
wandb/debug-internal.log
CHANGED
|
@@ -1,18 +1,18 @@
|
|
| 1 |
-
{"time":"2025-05-
|
| 2 |
-
{"time":"2025-05-
|
| 3 |
-
{"time":"2025-05-
|
| 4 |
-
{"time":"2025-05-
|
| 5 |
-
{"time":"2025-05-
|
| 6 |
-
{"time":"2025-05-
|
| 7 |
-
{"time":"2025-05-
|
| 8 |
-
{"time":"2025-05-
|
| 9 |
-
{"time":"2025-05-
|
| 10 |
-
{"time":"2025-05-
|
| 11 |
-
{"time":"2025-05-
|
| 12 |
-
{"time":"2025-05-
|
| 13 |
-
{"time":"2025-05-
|
| 14 |
-
{"time":"2025-05-
|
| 15 |
-
{"time":"2025-05-
|
| 16 |
-
{"time":"2025-05-
|
| 17 |
-
{"time":"2025-05-
|
| 18 |
-
{"time":"2025-05-
|
|
|
|
| 1 |
+
{"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
|
| 3 |
+
{"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
|
| 4 |
+
{"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
|
| 5 |
+
{"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
|
| 6 |
+
{"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
|
| 7 |
+
{"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
|
| 9 |
+
{"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
|
| 10 |
+
{"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
|
| 11 |
+
{"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
|
| 12 |
+
{"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 13 |
+
{"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 14 |
+
{"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
|
| 15 |
+
{"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
|
| 16 |
+
{"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
|
| 17 |
+
{"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
|
| 18 |
+
{"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}
|
wandb/debug.log
CHANGED
|
@@ -1,29 +1,29 @@
|
|
| 1 |
-
2025-05-
|
| 2 |
-
2025-05-
|
| 3 |
-
2025-05-
|
| 4 |
-
2025-05-
|
| 5 |
-
2025-05-
|
| 6 |
-
2025-05-
|
| 7 |
-
2025-05-
|
| 8 |
-
2025-05-
|
| 9 |
-
2025-05-
|
| 10 |
-
config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length':
|
| 11 |
-
2025-05-
|
| 12 |
-
2025-05-
|
| 13 |
-
2025-05-
|
| 14 |
-
2025-05-
|
| 15 |
-
2025-05-
|
| 16 |
-
2025-05-
|
| 17 |
-
2025-05-
|
| 18 |
-
2025-05-
|
| 19 |
-
2025-05-
|
| 20 |
-
2025-05-
|
| 21 |
-
2025-05-
|
| 22 |
-
2025-05-
|
| 23 |
-
2025-05-
|
| 24 |
-
2025-05-
|
| 25 |
-
2025-05-
|
| 26 |
-
2025-05-
|
| 27 |
-
2025-05-
|
| 28 |
-
2025-05-
|
| 29 |
-
2025-05-
|
|
|
|
| 1 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
|
| 2 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
|
| 3 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
|
| 4 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
|
| 5 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
| 6 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
|
| 7 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
|
| 8 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
|
| 11 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-05-11 14:32:04,250 INFO MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-05-11 14:32:04,250 INFO MainThread:1151596 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-05-11 14:32:04,252 INFO MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
|
| 16 |
+
2025-05-11 14:32:04,257 INFO MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-05-11 14:32:04,568 INFO MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
|
| 18 |
+
2025-05-11 14:32:04,636 INFO MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
|
| 19 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
|
| 20 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
|
| 21 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
|
| 22 |
+
2025-05-11 14:32:04,638 INFO MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
|
| 23 |
+
2025-05-11 17:31:35,325 INFO MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
|
| 24 |
+
2025-05-11 17:31:35,326 INFO MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
|
| 25 |
+
2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2513] restore
|
| 26 |
+
2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2519] restore done
|
| 27 |
+
2025-05-11 17:31:35,957 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
|
| 28 |
+
2025-05-11 17:31:35,964 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
|
| 29 |
+
2025-05-11 17:31:35,968 INFO MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files
|
wandb/run-20250511_143204-ws6emydu/files/config.yaml
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.10
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.11.11
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 11
|
| 10 |
+
- 49
|
| 11 |
+
- 51
|
| 12 |
+
- 55
|
| 13 |
+
- 71
|
| 14 |
+
- 98
|
| 15 |
+
"2":
|
| 16 |
+
- 1
|
| 17 |
+
- 11
|
| 18 |
+
- 49
|
| 19 |
+
- 51
|
| 20 |
+
- 55
|
| 21 |
+
- 71
|
| 22 |
+
- 98
|
| 23 |
+
"3":
|
| 24 |
+
- 2
|
| 25 |
+
- 13
|
| 26 |
+
- 16
|
| 27 |
+
- 23
|
| 28 |
+
- 55
|
| 29 |
+
- 61
|
| 30 |
+
"4": 3.11.11
|
| 31 |
+
"5": 0.19.10
|
| 32 |
+
"6": 4.51.3
|
| 33 |
+
"8":
|
| 34 |
+
- 5
|
| 35 |
+
"12": 0.19.10
|
| 36 |
+
"13": linux-x86_64
|
| 37 |
+
bf16:
|
| 38 |
+
value: true
|
| 39 |
+
cache_dir:
|
| 40 |
+
value: /home/panda/pda-llm/cache/sft-tools
|
| 41 |
+
deepscale:
|
| 42 |
+
value: false
|
| 43 |
+
deepscale_config:
|
| 44 |
+
value: null
|
| 45 |
+
deepspeed:
|
| 46 |
+
value: false
|
| 47 |
+
deepspeed_config:
|
| 48 |
+
value: null
|
| 49 |
+
device:
|
| 50 |
+
value: cuda:0
|
| 51 |
+
epochs:
|
| 52 |
+
value: 3
|
| 53 |
+
eval_datasets:
|
| 54 |
+
value: null
|
| 55 |
+
eval_interval:
|
| 56 |
+
value: 1000000
|
| 57 |
+
eval_split_ratio:
|
| 58 |
+
value: null
|
| 59 |
+
eval_strategy:
|
| 60 |
+
value: epoch
|
| 61 |
+
fp16:
|
| 62 |
+
value: false
|
| 63 |
+
global_rank:
|
| 64 |
+
value: 0
|
| 65 |
+
gradient_accumulation_steps:
|
| 66 |
+
value: 48
|
| 67 |
+
gradient_checkpointing:
|
| 68 |
+
value: true
|
| 69 |
+
important_sft:
|
| 70 |
+
value: true
|
| 71 |
+
local_rank:
|
| 72 |
+
value: 0
|
| 73 |
+
log_dir:
|
| 74 |
+
value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
|
| 75 |
+
log_project:
|
| 76 |
+
value: TOOLS-SFT
|
| 77 |
+
log_run_name:
|
| 78 |
+
value: tools-sft-2025-05-11-14-32-03
|
| 79 |
+
log_type:
|
| 80 |
+
value: wandb
|
| 81 |
+
lora_alpha:
|
| 82 |
+
value: 32
|
| 83 |
+
lora_dropout:
|
| 84 |
+
value: 0.05
|
| 85 |
+
lora_r:
|
| 86 |
+
value: 4
|
| 87 |
+
lr:
|
| 88 |
+
value: 0.0001
|
| 89 |
+
lr_scheduler_type:
|
| 90 |
+
value: COSINE
|
| 91 |
+
lr_warmup_ratio:
|
| 92 |
+
value: 0.1
|
| 93 |
+
max_length:
|
| 94 |
+
value: 4096
|
| 95 |
+
model_name_or_path:
|
| 96 |
+
value: meta-llama/Llama-3.1-8B-Instruct
|
| 97 |
+
need_eval:
|
| 98 |
+
value: true
|
| 99 |
+
num_update_steps_per_epoch:
|
| 100 |
+
value: 112
|
| 101 |
+
offload:
|
| 102 |
+
value: none
|
| 103 |
+
output_dir:
|
| 104 |
+
value: /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096
|
| 105 |
+
per_device_eval_batch_size:
|
| 106 |
+
value: 1
|
| 107 |
+
per_device_train_batch_size:
|
| 108 |
+
value: 1
|
| 109 |
+
recompute_baseline:
|
| 110 |
+
value: false
|
| 111 |
+
resilient_coeff:
|
| 112 |
+
value: 1
|
| 113 |
+
safety_ratio_tol:
|
| 114 |
+
value: 50
|
| 115 |
+
save_16bit:
|
| 116 |
+
value: false
|
| 117 |
+
save_interval:
|
| 118 |
+
value: 1000000
|
| 119 |
+
seed:
|
| 120 |
+
value: 42
|
| 121 |
+
tf32:
|
| 122 |
+
value: false
|
| 123 |
+
total_training_steps:
|
| 124 |
+
value: 336
|
| 125 |
+
train_datasets:
|
| 126 |
+
value:
|
| 127 |
+
- - tools
|
| 128 |
+
- proportion: 1
|
| 129 |
+
trust_remote_code:
|
| 130 |
+
value: true
|
| 131 |
+
weight_decay:
|
| 132 |
+
value: 0
|
| 133 |
+
zero_stage:
|
| 134 |
+
value: 0
|
wandb/run-20250511_143204-ws6emydu/files/output.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Training 3/3 epoch (loss 0.4680): 100%|██████████| 16050/16050 [2:59:28<00:00, 1.49it/s]
|
| 3 |
+
|
| 4 |
+
***** Evaluating at the beginning *****
|
| 5 |
+
|
| 6 |
+
***** Evaluating at epoch 1/3 *****
|
| 7 |
+
|
| 8 |
+
***** Evaluating at epoch 2/3 *****
|
| 9 |
+
|
| 10 |
+
***** Evaluating at epoch 3/3 *****
|
| 11 |
+
Saving model to "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096" ...
|
| 12 |
+
Saving Hugging Face Checkpoints...
|
| 13 |
+
/home/panda/miniconda3/envs/pda/lib/python3.11/site-packages/peft/utils/save_and_load.py:220: UserWarning: Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.
|
| 14 |
+
warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.")
|
| 15 |
+
Model saved!
|
wandb/run-20250511_143204-ws6emydu/files/requirements.txt
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PySocks==1.7.1
|
| 2 |
+
pip==25.1.1
|
| 3 |
+
certifi==2025.4.26
|
| 4 |
+
parso==0.8.4
|
| 5 |
+
wcwidth==0.2.13
|
| 6 |
+
nvidia-ml-py==12.535.133
|
| 7 |
+
mkl_fft==1.3.11
|
| 8 |
+
urllib3==2.3.0
|
| 9 |
+
charset-normalizer==3.3.2
|
| 10 |
+
transformers==4.51.3
|
| 11 |
+
smmap==4.0.0
|
| 12 |
+
xxhash==3.5.0
|
| 13 |
+
etils==1.12.2
|
| 14 |
+
platformdirs==4.3.8
|
| 15 |
+
tzdata==2025.2
|
| 16 |
+
ipython==9.2.0
|
| 17 |
+
deepspeed==0.16.7
|
| 18 |
+
gmpy2==2.2.1
|
| 19 |
+
importlib_resources==6.5.2
|
| 20 |
+
pydantic==2.10.3
|
| 21 |
+
tqdm==4.67.1
|
| 22 |
+
pyparsing==3.2.0
|
| 23 |
+
exceptiongroup==1.2.2
|
| 24 |
+
tokenizers==0.21.1
|
| 25 |
+
Markdown==3.8
|
| 26 |
+
matplotlib-inline==0.1.7
|
| 27 |
+
dm_control==1.0.30
|
| 28 |
+
jupyter_core==5.7.2
|
| 29 |
+
pydantic_core==2.27.1
|
| 30 |
+
hjson==3.1.0
|
| 31 |
+
mkl_random==1.2.8
|
| 32 |
+
ipykernel==6.29.5
|
| 33 |
+
PyOpenGL==3.1.9
|
| 34 |
+
sentry-sdk==2.18.0
|
| 35 |
+
wandb==0.19.10
|
| 36 |
+
einops==0.8.1
|
| 37 |
+
prompt_toolkit==3.0.51
|
| 38 |
+
Bottleneck==1.4.2
|
| 39 |
+
msgpack==1.1.0
|
| 40 |
+
fsspec==2024.12.0
|
| 41 |
+
labmaze==1.0.6
|
| 42 |
+
mdurl==0.1.0
|
| 43 |
+
executing==2.2.0
|
| 44 |
+
wheel==0.45.1
|
| 45 |
+
bitsandbytes==0.45.5
|
| 46 |
+
networkx==3.4.2
|
| 47 |
+
comm==0.2.2
|
| 48 |
+
asttokens==3.0.0
|
| 49 |
+
nvitop==1.5.0
|
| 50 |
+
sympy==1.13.1
|
| 51 |
+
MarkupSafe==3.0.2
|
| 52 |
+
numexpr==2.10.1
|
| 53 |
+
tensorboard_data_server==0.7.0
|
| 54 |
+
markdown-it-py==2.2.0
|
| 55 |
+
wrapt==1.17.2
|
| 56 |
+
six==1.17.0
|
| 57 |
+
idna==3.7
|
| 58 |
+
docker-pycreds==0.4.0
|
| 59 |
+
jedi==0.19.2
|
| 60 |
+
GitPython==3.1.43
|
| 61 |
+
dm-env==1.6
|
| 62 |
+
pure_eval==0.2.3
|
| 63 |
+
ninja==1.11.1.4
|
| 64 |
+
setuptools==80.1.0
|
| 65 |
+
fonttools==4.55.3
|
| 66 |
+
annotated-types==0.6.0
|
| 67 |
+
psutil==7.0.0
|
| 68 |
+
cycler==0.11.0
|
| 69 |
+
py-cpuinfo==9.0.0
|
| 70 |
+
accelerate==1.6.0
|
| 71 |
+
dm-tree==0.1.9
|
| 72 |
+
filelock==3.17.0
|
| 73 |
+
pytz==2024.1
|
| 74 |
+
rich==13.9.4
|
| 75 |
+
appdirs==1.4.4
|
| 76 |
+
click==8.1.8
|
| 77 |
+
Jinja2==3.1.6
|
| 78 |
+
unicodedata2==15.1.0
|
| 79 |
+
pillow==11.1.0
|
| 80 |
+
glfw==2.9.0
|
| 81 |
+
traitlets==5.14.3
|
| 82 |
+
stack_data==0.6.3
|
| 83 |
+
peft==0.15.2
|
| 84 |
+
python-dateutil==2.9.0.post0
|
| 85 |
+
requests==2.32.3
|
| 86 |
+
attrs==25.3.0
|
| 87 |
+
Werkzeug==3.1.3
|
| 88 |
+
gitdb==4.0.7
|
| 89 |
+
lxml==5.4.0
|
| 90 |
+
torch==2.5.1
|
| 91 |
+
scipy==1.15.2
|
| 92 |
+
sentencepiece==0.2.0
|
| 93 |
+
huggingface-hub==0.31.1
|
| 94 |
+
dill==0.3.8
|
| 95 |
+
pexpect==4.9.0
|
| 96 |
+
pickleshare==0.7.5
|
| 97 |
+
ptyprocess==0.7.0
|
| 98 |
+
contourpy==1.3.1
|
| 99 |
+
grpcio==1.71.0
|
| 100 |
+
optree==0.14.1
|
| 101 |
+
safetensors==0.5.3
|
| 102 |
+
mpmath==1.3.0
|
| 103 |
+
nest_asyncio==1.6.0
|
| 104 |
+
pyarrow==19.0.0
|
| 105 |
+
PyYAML==6.0.2
|
| 106 |
+
decorator==5.2.1
|
| 107 |
+
pandas==2.2.3
|
| 108 |
+
tensorboard==2.19.0
|
| 109 |
+
zipp==3.21.0
|
| 110 |
+
mujoco==3.3.2
|
| 111 |
+
regex==2024.11.6
|
| 112 |
+
Brotli==1.0.9
|
| 113 |
+
mkl-service==2.4.0
|
| 114 |
+
matplotlib==3.10.0
|
| 115 |
+
Pygments==2.19.1
|
| 116 |
+
absl-py==2.1.0
|
| 117 |
+
hf-xet==1.1.0
|
| 118 |
+
packaging==25.0
|
| 119 |
+
jupyter_client==8.6.3
|
| 120 |
+
triton==3.1.0
|
| 121 |
+
multiprocess==0.70.15
|
| 122 |
+
debugpy==1.8.14
|
| 123 |
+
numpy==2.0.1
|
| 124 |
+
protobuf==5.29.3
|
| 125 |
+
tornado==6.4.2
|
| 126 |
+
datasets==3.6.0
|
| 127 |
+
eval_type_backport==0.2.2
|
| 128 |
+
typing_extensions==4.12.2
|
| 129 |
+
kiwisolver==1.4.8
|
| 130 |
+
pyzmq==26.4.0
|
| 131 |
+
ipython_pygments_lexers==1.1.1
|
| 132 |
+
setproctitle==1.2.2
|
| 133 |
+
importlib_metadata==8.6.1
|
| 134 |
+
jaraco.text==3.12.1
|
| 135 |
+
backports.tarfile==1.2.0
|
| 136 |
+
importlib_metadata==8.0.0
|
| 137 |
+
jaraco.context==5.3.0
|
| 138 |
+
more-itertools==10.3.0
|
| 139 |
+
wheel==0.45.1
|
| 140 |
+
jaraco.collections==5.1.0
|
| 141 |
+
platformdirs==4.2.2
|
| 142 |
+
autocommand==2.2.2
|
| 143 |
+
zipp==3.19.2
|
| 144 |
+
tomli==2.0.1
|
| 145 |
+
typeguard==4.3.0
|
| 146 |
+
packaging==24.2
|
| 147 |
+
inflect==7.3.1
|
| 148 |
+
typing_extensions==4.12.2
|
| 149 |
+
jaraco.functools==4.0.1
|
wandb/run-20250511_143204-ws6emydu/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-138-generic-x86_64-with-glibc2.31",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-05-11T18:32:04.251126Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--train_datasets",
|
| 8 |
+
"tools",
|
| 9 |
+
"--model_name_or_path",
|
| 10 |
+
"meta-llama/Llama-3.1-8B-Instruct",
|
| 11 |
+
"--cache_dir",
|
| 12 |
+
"/home/panda/pda-llm/cache/sft-tools",
|
| 13 |
+
"--important_sft",
|
| 14 |
+
"true",
|
| 15 |
+
"--max_length",
|
| 16 |
+
"4096",
|
| 17 |
+
"--trust_remote_code",
|
| 18 |
+
"True",
|
| 19 |
+
"--epochs",
|
| 20 |
+
"3",
|
| 21 |
+
"--per_device_train_batch_size",
|
| 22 |
+
"1",
|
| 23 |
+
"--per_device_eval_batch_size",
|
| 24 |
+
"1",
|
| 25 |
+
"--gradient_accumulation_steps",
|
| 26 |
+
"48",
|
| 27 |
+
"--gradient_checkpointing",
|
| 28 |
+
"--learning_rate",
|
| 29 |
+
"1e-4",
|
| 30 |
+
"--lr_scheduler_type",
|
| 31 |
+
"cosine",
|
| 32 |
+
"--lr_warmup_ratio",
|
| 33 |
+
"0.1",
|
| 34 |
+
"--weight_decay",
|
| 35 |
+
"0.0",
|
| 36 |
+
"--seed",
|
| 37 |
+
"42",
|
| 38 |
+
"--output_dir",
|
| 39 |
+
"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
|
| 40 |
+
"--log_type",
|
| 41 |
+
"wandb",
|
| 42 |
+
"--log_project",
|
| 43 |
+
"TOOLS-SFT",
|
| 44 |
+
"--zero_stage",
|
| 45 |
+
"0",
|
| 46 |
+
"--offload",
|
| 47 |
+
"none",
|
| 48 |
+
"--safety_ratio_tol",
|
| 49 |
+
"50",
|
| 50 |
+
"--resilient_coeff",
|
| 51 |
+
"1",
|
| 52 |
+
"--lora_r",
|
| 53 |
+
"4",
|
| 54 |
+
"--lora_alpha",
|
| 55 |
+
"32",
|
| 56 |
+
"--lora_dropout",
|
| 57 |
+
"0.05",
|
| 58 |
+
"--gradient_checkpointing",
|
| 59 |
+
"--bf16",
|
| 60 |
+
"True",
|
| 61 |
+
"--fp16",
|
| 62 |
+
"False",
|
| 63 |
+
"--tf32",
|
| 64 |
+
"False"
|
| 65 |
+
],
|
| 66 |
+
"program": "-m safe_rlhf.algorithms.tools_ft.__main__",
|
| 67 |
+
"git": {
|
| 68 |
+
"remote": "git@github.com:IgnacioBoero/pda-llm.git",
|
| 69 |
+
"commit": "f4c70e99280e869cd565464446c9942daaf22425"
|
| 70 |
+
},
|
| 71 |
+
"email": "iboero@upenn.edu",
|
| 72 |
+
"root": "/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096",
|
| 73 |
+
"host": "plaza",
|
| 74 |
+
"executable": "/home/panda/miniconda3/envs/pda/bin/python",
|
| 75 |
+
"cpu_count": 24,
|
| 76 |
+
"cpu_count_logical": 48,
|
| 77 |
+
"gpu": "NVIDIA RTX A6000",
|
| 78 |
+
"gpu_count": 2,
|
| 79 |
+
"disk": {
|
| 80 |
+
"/": {
|
| 81 |
+
"total": "982820896768",
|
| 82 |
+
"used": "915338317824"
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"memory": {
|
| 86 |
+
"total": "270090010624"
|
| 87 |
+
},
|
| 88 |
+
"cpu": {
|
| 89 |
+
"count": 24,
|
| 90 |
+
"countLogical": 48
|
| 91 |
+
},
|
| 92 |
+
"gpu_nvidia": [
|
| 93 |
+
{
|
| 94 |
+
"name": "NVIDIA RTX A6000",
|
| 95 |
+
"memoryTotal": "51527024640",
|
| 96 |
+
"cudaCores": 10752,
|
| 97 |
+
"architecture": "Ampere"
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"name": "NVIDIA RTX A6000",
|
| 101 |
+
"memoryTotal": "51527024640",
|
| 102 |
+
"cudaCores": 10752,
|
| 103 |
+
"architecture": "Ampere"
|
| 104 |
+
}
|
| 105 |
+
],
|
| 106 |
+
"cudaVersion": "12.4"
|
| 107 |
+
}
|
wandb/run-20250511_143204-ws6emydu/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_runtime":10771.076681318,"_step":16050,"train/step":16050,"eval/hist_log_ratio":{"values":[1,1,2,0,2,0,1,2,0,3,4,5,10,4,2,7,5,6,2,3,6,9,6,6,13,5,1,2,8,5,6,6,4,5,4,6,3,2,4,4,3,0,0,4,1,2,4,2,0,0,3,0,3,2,4,0,3,1,0,0,1,1,0,1],"bins":[77.5,81.5,85.5,89.5,93.5,97.5,101.5625,105.5625,109.5625,113.5625,117.5625,121.625,125.625,129.625,133.625,137.625,141.625,145.625,149.625,153.625,157.625,161.75,165.75,169.75,173.75,177.75,181.75,185.75,189.75,193.75,197.75,201.75,205.75,209.75,213.75,217.75,221.75,225.75,229.75,233.75,237.75,241.875,245.875,249.875,253.875,258,262,266,270,274,278,282,286,290,294,298,302,306,310,314,318,322,326,330,334],"_type":"histogram"},"train/loss":0.4680471122264862,"eval/step":16050,"eval/min_log_ratio":77.5,"eval/max_log_ratio":334,"_timestamp":1.7469990929702783e+09,"_wandb":{"runtime":10771},"train/lr":1.0749763901607423e-08,"eval/mean_important_log_ratio":188,"train/epoch":3}
|
wandb/run-20250511_143204-ws6emydu/logs/debug-core.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-05-11T14:32:03.899110462-04:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmplwnn6mri/port-1151596.txt","pid":1151596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-05-11T14:32:03.905019894-04:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":1151596}
|
| 3 |
+
{"time":"2025-05-11T14:32:03.904995063-04:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35011,"Zone":""}}
|
| 4 |
+
{"time":"2025-05-11T14:32:04.083172335-04:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37156"}
|
| 5 |
+
{"time":"2025-05-11T14:32:04.25319672-04:00","level":"INFO","msg":"handleInformInit: received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
|
| 6 |
+
{"time":"2025-05-11T14:32:04.439955459-04:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"ws6emydu","id":"127.0.0.1:37156"}
|
| 7 |
+
{"time":"2025-05-11T17:31:35.968564433-04:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"ws6emydu","id":"127.0.0.1:37156"}
|
| 8 |
+
{"time":"2025-05-11T17:31:35.968735567-04:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"ws6emydu","id":"127.0.0.1:37156"}
|
| 9 |
+
{"time":"2025-05-11T17:31:36.95829108-04:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:37156"}
|
| 10 |
+
{"time":"2025-05-11T17:31:36.958316571-04:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:37156"}
|
| 11 |
+
{"time":"2025-05-11T17:31:36.958328081-04:00","level":"INFO","msg":"server is shutting down"}
|
| 12 |
+
{"time":"2025-05-11T17:31:36.958329541-04:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:37156"}
|
| 13 |
+
{"time":"2025-05-11T17:31:36.958416902-04:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:37156"}
|
| 14 |
+
{"time":"2025-05-11T17:31:36.958428293-04:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:37156"}
|
| 15 |
+
{"time":"2025-05-11T17:31:36.958438183-04:00","level":"INFO","msg":"server is closed"}
|
wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-05-11T14:32:04.253445085-04:00","level":"INFO","msg":"stream: starting","core version":"0.19.10","symlink path":"/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-05-11T14:32:04.439907308-04:00","level":"INFO","msg":"created new stream","id":"ws6emydu"}
|
| 3 |
+
{"time":"2025-05-11T14:32:04.439946668-04:00","level":"INFO","msg":"stream: started","id":"ws6emydu"}
|
| 4 |
+
{"time":"2025-05-11T14:32:04.439967179-04:00","level":"INFO","msg":"writer: Do: started","stream_id":"ws6emydu"}
|
| 5 |
+
{"time":"2025-05-11T14:32:04.44000952-04:00","level":"INFO","msg":"sender: started","stream_id":"ws6emydu"}
|
| 6 |
+
{"time":"2025-05-11T14:32:04.44003911-04:00","level":"INFO","msg":"handler: started","stream_id":"ws6emydu"}
|
| 7 |
+
{"time":"2025-05-11T14:32:04.571220665-04:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-05-11T14:32:04.571254576-04:00","level":"WARN","msg":"handleCodeSave: program relative path is empty"}
|
| 9 |
+
{"time":"2025-05-11T14:32:04.571325347-04:00","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
|
| 10 |
+
{"time":"2025-05-11T17:31:35.327822679-04:00","level":"INFO","msg":"Stopping system monitor"}
|
| 11 |
+
{"time":"2025-05-11T17:31:35.32789325-04:00","level":"INFO","msg":"Stopped system monitor"}
|
| 12 |
+
{"time":"2025-05-11T17:31:35.835375381-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 13 |
+
{"time":"2025-05-11T17:31:35.955410223-04:00","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 14 |
+
{"time":"2025-05-11T17:31:35.968603034-04:00","level":"INFO","msg":"stream: closing","id":"ws6emydu"}
|
| 15 |
+
{"time":"2025-05-11T17:31:35.968621514-04:00","level":"INFO","msg":"handler: closed","stream_id":"ws6emydu"}
|
| 16 |
+
{"time":"2025-05-11T17:31:35.968633064-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":"ws6emydu"}
|
| 17 |
+
{"time":"2025-05-11T17:31:35.968716616-04:00","level":"INFO","msg":"sender: closed","stream_id":"ws6emydu"}
|
| 18 |
+
{"time":"2025-05-11T17:31:35.968727736-04:00","level":"INFO","msg":"stream: closed","id":"ws6emydu"}
|
wandb/run-20250511_143204-ws6emydu/logs/debug.log
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Current SDK version is 0.19.10
|
| 2 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Configure stats pid to 1151596
|
| 3 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/.config/wandb/settings
|
| 4 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from /home/panda/pda-llm/scripts/wandb/settings
|
| 5 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_setup.py:_flush():68] Loading settings from environment variables
|
| 6 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug.log
|
| 7 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096/wandb/run-20250511_143204-ws6emydu/logs/debug-internal.log
|
| 8 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():852] calling init triggers
|
| 9 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'model_name_or_path': 'meta-llama/Llama-3.1-8B-Instruct', 'recompute_baseline': False, 'cache_dir': '/home/panda/pda-llm/cache/sft-tools', 'max_length': 4096, 'trust_remote_code': True, 'train_datasets': [('tools', {'proportion': 1.0})], 'eval_datasets': None, 'safety_ratio_tol': 50.0, 'important_sft': True, 'resilient_coeff': 1.0, 'epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 48, 'gradient_checkpointing': True, 'lr': 0.0001, 'lr_scheduler_type': <SchedulerType.COSINE: 'cosine'>, 'lr_warmup_ratio': 0.1, 'weight_decay': 0.0, 'seed': 42, 'fp16': False, 'bf16': True, 'tf32': False, 'lora_r': 4, 'lora_alpha': 32, 'lora_dropout': 0.05, 'eval_strategy': 'epoch', 'eval_interval': 1000000, 'need_eval': True, 'eval_split_ratio': None, 'output_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_type': 'wandb', 'log_dir': '/home/panda/pda-llm/output/sft-tools/run-true-1-50-4-4096', 'log_project': 'TOOLS-SFT', 'log_run_name': 'tools-sft-2025-05-11-14-32-03', 'save_16bit': False, 'save_interval': 1000000, 'local_rank': 0, 'zero_stage': 0, 'offload': 'none', 'deepspeed': False, 'deepspeed_config': None, 'deepscale': False, 'deepscale_config': None, 'global_rank': 0, 'device': device(type='cuda', index=0), 'num_update_steps_per_epoch': 112, 'total_training_steps': 336, '_wandb': {}}
|
| 11 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():893] starting backend
|
| 12 |
+
2025-05-11 14:32:04,248 INFO MainThread:1151596 [wandb_init.py:init():897] sending inform_init request
|
| 13 |
+
2025-05-11 14:32:04,250 INFO MainThread:1151596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-05-11 14:32:04,250 INFO MainThread:1151596 [wandb_init.py:init():907] backend started and connected
|
| 15 |
+
2025-05-11 14:32:04,252 INFO MainThread:1151596 [wandb_init.py:init():1002] updated telemetry
|
| 16 |
+
2025-05-11 14:32:04,257 INFO MainThread:1151596 [wandb_init.py:init():1026] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-05-11 14:32:04,568 INFO MainThread:1151596 [wandb_init.py:init():1101] starting run threads in backend
|
| 18 |
+
2025-05-11 14:32:04,636 INFO MainThread:1151596 [wandb_run.py:_console_start():2566] atexit reg
|
| 19 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2414] redirect: wrap_raw
|
| 20 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2483] Wrapping output streams.
|
| 21 |
+
2025-05-11 14:32:04,637 INFO MainThread:1151596 [wandb_run.py:_redirect():2506] Redirects installed.
|
| 22 |
+
2025-05-11 14:32:04,638 INFO MainThread:1151596 [wandb_init.py:init():1147] run started, returning control to user process
|
| 23 |
+
2025-05-11 17:31:35,325 INFO MainThread:1151596 [wandb_run.py:_finish():2314] finishing run alelab/TOOLS-SFT/ws6emydu
|
| 24 |
+
2025-05-11 17:31:35,326 INFO MainThread:1151596 [wandb_run.py:_atexit_cleanup():2531] got exitcode: 0
|
| 25 |
+
2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2513] restore
|
| 26 |
+
2025-05-11 17:31:35,327 INFO MainThread:1151596 [wandb_run.py:_restore():2519] restore done
|
| 27 |
+
2025-05-11 17:31:35,957 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4160] rendering history
|
| 28 |
+
2025-05-11 17:31:35,964 INFO MainThread:1151596 [wandb_run.py:_footer_history_summary_info():4192] rendering summary
|
| 29 |
+
2025-05-11 17:31:35,968 INFO MainThread:1151596 [wandb_run.py:_footer_sync_info():4121] logging synced files
|
wandb/run-20250511_143204-ws6emydu/run-ws6emydu.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d75c95736964eb77dd5d326d209df011e412e887998fb330d51e0946a94f7cc4
|
| 3 |
+
size 14008017
|