Yaning1001 committed (verified)
Commit ffe32b5 · 1 Parent(s): 5b0bb18

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json +97 -0
  2. wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json +1 -0
  3. wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log +16 -0
  4. wandb/run-20241030_010759-dim9v1es/logs/debug.log +27 -0
  5. wandb/run-20241030_012617-yt7vh1dq/files/output.log +2 -0
  6. wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt +147 -0
  7. wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json +97 -0
  8. wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log +8 -0
  9. wandb/run-20241030_012617-yt7vh1dq/logs/debug.log +26 -0
  10. wandb/run-20241030_013141-bkcoggdw/files/config.yaml +47 -0
  11. wandb/run-20241030_013141-bkcoggdw/files/output.log +62 -0
  12. wandb/run-20241030_013141-bkcoggdw/files/requirements.txt +147 -0
  13. wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json +97 -0
  14. wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json +1 -0
  15. wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log +12 -0
  16. wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json +97 -0
  17. wandb/run-20241030_225833-giupspdj/logs/debug-internal.log +8 -0
  18. wandb/run-20241031_001055-32u9qnul/files/output.log +13 -0
  19. wandb/run-20241031_001055-32u9qnul/files/requirements.txt +147 -0
  20. wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json +97 -0
  21. wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json +97 -0
  22. wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log +8 -0
  23. wandb/run-20241031_114700-3cqkhntc/files/requirements.txt +147 -0
  24. wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json +97 -0
  25. wandb/run-20241031_114700-q0d78n2b/files/requirements.txt +147 -0
  26. wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json +97 -0
  27. wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log +8 -0
  28. wandb/run-20241031_114700-q0d78n2b/logs/debug.log +26 -0
  29. wandb/run-20241031_122005-nip14lm6/files/config.yaml +49 -0
  30. wandb/run-20241031_122005-nip14lm6/files/output.log +35 -0
  31. wandb/run-20241031_122005-nip14lm6/files/requirements.txt +147 -0
  32. wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json +97 -0
  33. wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json +1 -0
  34. wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb +0 -0
  35. wandb/run-20241101_012613-k6o0lha8/files/output.log +12 -0
  36. wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json +97 -0
  37. wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb +0 -0
  38. wandb/run-20241101_012733-9v55tr72/files/output.log +196 -0
  39. wandb/run-20241101_012733-9v55tr72/files/requirements.txt +147 -0
  40. wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json +97 -0
  41. wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log +8 -0
  42. wandb/run-20241101_012733-9v55tr72/logs/debug.log +29 -0
  43. wandb/run-20241101_094656-ae4hctp0/files/output.log +13 -0
  44. wandb/run-20241101_094656-ae4hctp0/files/requirements.txt +147 -0
  45. wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json +97 -0
  46. wandb/run-20241101_094656-ae4hctp0/logs/debug.log +26 -0
  47. wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb +0 -0
  48. wandb/run-20241101_200517-77b12390/files/output.log +57 -0
  49. wandb/run-20241101_200517-77b12390/files/wandb-metadata.json +97 -0
  50. wandb/run-20241101_200517-77b12390/files/wandb-summary.json +1 -0
wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T05:07:59.121382Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_control",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "7",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1719200272384"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":0}}
wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
+ {"time":"2024-10-30T01:07:59.123018178-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-30T01:07:59.123029468-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-core.log"}
+ {"time":"2024-10-30T01:07:59.228528967-04:00","level":"INFO","msg":"created new stream","id":"dim9v1es"}
+ {"time":"2024-10-30T01:07:59.228567837-04:00","level":"INFO","msg":"stream: started","id":"dim9v1es"}
+ {"time":"2024-10-30T01:07:59.228581067-04:00","level":"INFO","msg":"sender: started","stream_id":"dim9v1es"}
+ {"time":"2024-10-30T01:07:59.228568237-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"dim9v1es"}}
+ {"time":"2024-10-30T01:07:59.228568187-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"dim9v1es"}}
+ {"time":"2024-10-30T01:07:59.441316995-04:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2024-10-30T01:07:59.536719185-04:00","level":"INFO","msg":"stream: closing","id":"dim9v1es"}
+ {"time":"2024-10-30T01:07:59.536770865-04:00","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2024-10-30T01:07:59.53739974-04:00","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2024-10-30T01:08:00.081295733-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2024-10-30T01:08:00.206167083-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"dim9v1es"}}
+ {"time":"2024-10-30T01:08:00.206226274-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"dim9v1es"}}
+ {"time":"2024-10-30T01:08:00.206291324-04:00","level":"INFO","msg":"sender: closed","stream_id":"dim9v1es"}
+ {"time":"2024-10-30T01:08:00.206325864-04:00","level":"INFO","msg":"stream: closed","id":"dim9v1es"}
wandb/run-20241030_010759-dim9v1es/logs/debug.log ADDED
@@ -0,0 +1,27 @@
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Configure stats pid to 322462
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug.log
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():621] calling init triggers
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():671] starting backend
+ 2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():675] sending inform_init request
+ 2024-10-30 01:07:59,120 INFO MainThread:322462 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-10-30 01:07:59,121 INFO MainThread:322462 [wandb_init.py:init():688] backend started and connected
+ 2024-10-30 01:07:59,124 INFO MainThread:322462 [wandb_init.py:init():783] updated telemetry
+ 2024-10-30 01:07:59,156 INFO MainThread:322462 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+ 2024-10-30 01:07:59,438 INFO MainThread:322462 [wandb_init.py:init():867] starting run threads in backend
+ 2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+ 2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2376] Wrapping output streams.
+ 2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2401] Redirects installed.
+ 2024-10-30 01:07:59,535 INFO MainThread:322462 [wandb_init.py:init():911] run started, returning control to user process
+ 2024-10-30 01:07:59,535 INFO MainThread:322462 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
+ 2024-10-30 01:07:59,536 WARNING MsgRouterThr:322462 [router.py:message_loop():77] message_loop has been closed
wandb/run-20241030_012617-yt7vh1dq/files/output.log ADDED
@@ -0,0 +1,2 @@
+ Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.36s/it]
+ Map: 11%|██████████████ | 2000/18140 [00:06<00:52, 308.60 examples/s]
wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T05:26:17.324794Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_control",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "7",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1709772775424"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2024-10-30T01:26:17.327161166-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-30T01:26:17.327175976-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-core.log"}
+ {"time":"2024-10-30T01:26:17.435118413-04:00","level":"INFO","msg":"created new stream","id":"yt7vh1dq"}
+ {"time":"2024-10-30T01:26:17.435165823-04:00","level":"INFO","msg":"stream: started","id":"yt7vh1dq"}
+ {"time":"2024-10-30T01:26:17.435237323-04:00","level":"INFO","msg":"sender: started","stream_id":"yt7vh1dq"}
+ {"time":"2024-10-30T01:26:17.435183773-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"yt7vh1dq"}}
+ {"time":"2024-10-30T01:26:17.435252003-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"yt7vh1dq"}}
+ {"time":"2024-10-30T01:26:17.695977809-04:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241030_012617-yt7vh1dq/logs/debug.log ADDED
@@ -0,0 +1,26 @@
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Configure stats pid to 332624
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():621] calling init triggers
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():671] starting backend
+ 2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():675] sending inform_init request
+ 2024-10-30 01:26:17,324 INFO MainThread:332624 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-10-30 01:26:17,324 INFO MainThread:332624 [wandb_init.py:init():688] backend started and connected
+ 2024-10-30 01:26:17,328 INFO MainThread:332624 [wandb_init.py:init():783] updated telemetry
+ 2024-10-30 01:26:17,385 INFO MainThread:332624 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+ 2024-10-30 01:26:17,692 INFO MainThread:332624 [wandb_init.py:init():867] starting run threads in backend
+ 2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+ 2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2376] Wrapping output streams.
+ 2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2401] Redirects installed.
+ 2024-10-30 01:26:17,849 INFO MainThread:332624 [wandb_init.py:init():911] run started, returning control to user process
+ 2024-10-30 01:26:17,850 INFO MainThread:332624 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
wandb/run-20241030_013141-bkcoggdw/files/config.yaml ADDED
@@ -0,0 +1,47 @@
+ _wandb:
+ value:
+ cli_version: 0.18.5
+ m: []
+ python_version: 3.9.19
+ t:
+ "1":
+ - 1
+ - 5
+ - 11
+ - 49
+ - 51
+ - 53
+ - 55
+ - 71
+ - 98
+ "2":
+ - 1
+ - 5
+ - 11
+ - 49
+ - 51
+ - 53
+ - 55
+ - 71
+ - 98
+ "3":
+ - 13
+ - 23
+ - 55
+ "4": 3.9.19
+ "5": 0.18.5
+ "6": 4.45.1
+ "8":
+ - 5
+ "12": 0.18.5
+ "13": linux-x86_64
+ batch_size:
+ value: 3
+ epoch:
+ value: 7
+ perturbation:
+ value: reverse_full
+ seed:
+ value: 0
+ train_set:
+ value: 10M
wandb/run-20241030_013141-bkcoggdw/files/output.log ADDED
@@ -0,0 +1,62 @@
+ wandb: 500 encountered ({"errors":[{"message":"An internal error occurred. Please contact support.","path":["upsertBucket"]}],"data":{"upsertBucket":null}}), retrying request
+ model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 11.2MB/s]
+ Downloading shards: 0%| | 0/2 [01:04<?, ?it/s]
+ Traceback (most recent call last): 54%|██████████████████████████████████████████████████████████▎ | 2.71G/4.97G [01:04<00:53, 42.2MB/s]
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 541, in http_get
+ for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 820, in generate
+ yield from self.raw.stream(chunk_size, decode_content=True)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 934, in stream
+ data = self.read(amt=amt, decode_content=decode_content)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 877, in read
+ data = self._raw_read(amt)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 812, in _raw_read
+ data = self._fp_read(amt) if not fp_closed else b""
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 789, in _fp_read
+ data = self._fp.read(chunk_amt)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/http/client.py", line 463, in read
+ n = self.readinto(b)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/http/client.py", line 507, in readinto
+ n = self.fp.readinto(b)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/socket.py", line 704, in readinto
+ return self._sock.recv_into(b)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/ssl.py", line 1275, in recv_into
+ return self.read(nbytes, buffer)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/ssl.py", line 1133, in read
+ return self._sslobj.read(len, buffer)
+ KeyboardInterrupt
+
+ During handling of the above exception, another exception occurred:
+
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 172, in <module>
+ model = AutoModelForCausalLM.from_pretrained(model_name,
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+ return model_class.from_pretrained(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+ cached_filename = cached_file(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+ resolved_file = hf_hub_download(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+ return f(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+ return fn(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+ return _hf_hub_download_to_cache_dir(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1381, in _hf_hub_download_to_cache_dir
+ _download_to_tmp_and_move(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1915, in _download_to_tmp_and_move
+ http_get(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 558, in http_get
+ return http_get(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1140, in __exit__
+ self.close()
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1275, in close
+ self._decr_instances(self)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 696, in _decr_instances
+ with cls._lock:
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 110, in __enter__
+ def __enter__(self):
+ KeyboardInterrupt
wandb/run-20241030_013141-bkcoggdw/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T05:31:41.693480Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_full",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "7",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1709824425984"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":94}}
wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
+ {"time":"2024-10-30T01:31:41.69578299-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-30T01:31:41.6958064-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-bkcoggdw/logs/debug-core.log"}
+ {"time":"2024-10-30T01:31:41.804007808-04:00","level":"INFO","msg":"created new stream","id":"bkcoggdw"}
+ {"time":"2024-10-30T01:31:41.804049448-04:00","level":"INFO","msg":"stream: started","id":"bkcoggdw"}
+ {"time":"2024-10-30T01:31:41.804096128-04:00","level":"INFO","msg":"sender: started","stream_id":"bkcoggdw"}
+ {"time":"2024-10-30T01:31:41.804097678-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"bkcoggdw"}}
+ {"time":"2024-10-30T01:31:41.804074928-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"bkcoggdw"}}
+ {"time":"2024-10-30T01:31:41.90989171-04:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql"}
+ {"time":"2024-10-30T01:31:44.314369215-04:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2024-10-30T01:33:16.64084496-04:00","level":"INFO","msg":"stream: closing","id":"bkcoggdw"}
+ {"time":"2024-10-30T01:33:16.64087647-04:00","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2024-10-30T01:33:16.641390294-04:00","level":"INFO","msg":"Stopped system monitor"}
wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-30T15:28:52.925806Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_control",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "3",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1710831611904"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241030_225833-giupspdj/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2024-10-30T22:58:33.52447176-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-30T22:58:33.52448387-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug-core.log"}
+ {"time":"2024-10-30T22:58:33.631043427-04:00","level":"INFO","msg":"created new stream","id":"giupspdj"}
+ {"time":"2024-10-30T22:58:33.631075407-04:00","level":"INFO","msg":"stream: started","id":"giupspdj"}
+ {"time":"2024-10-30T22:58:33.631121257-04:00","level":"INFO","msg":"sender: started","stream_id":"giupspdj"}
+ {"time":"2024-10-30T22:58:33.631092947-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"giupspdj"}}
+ {"time":"2024-10-30T22:58:33.631105957-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"giupspdj"}}
+ {"time":"2024-10-30T22:58:33.831702761-04:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241031_001055-32u9qnul/files/output.log ADDED
@@ -0,0 +1,13 @@
+ Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.84s/it]
+ tokenized_valid: Dataset({
+ features: ['input_ids', 'attention_mask'],
+ num_rows: 600
+ })
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+ warnings.warn(
+ [2024-10-31 00:11:03,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2024-10-31 00:11:12,645] [INFO] [comm.py:652:init_distributed] cdb=None
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+ Loading extension module cpu_adam...
+ Time to load cpu_adam op: 5.372655630111694 seconds
wandb/run-20241031_001055-32u9qnul/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-31T04:10:55.973455Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_full",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "6",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1728856920064"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
1
+ {
2
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
3
+ "python": "3.9.19",
4
+ "startedAt": "2024-10-31T04:10:55.613835Z",
5
+ "args": [
6
+ "--perturbation",
7
+ "reverse_full",
8
+ "--train_set",
9
+ "10M",
10
+ "--batch_size",
11
+ "3",
12
+ "--epoch",
13
+ "6",
14
+ "--seed",
15
+ "0"
16
+ ],
17
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
18
+ "codePath": "train/train_deep_wandb.py",
19
+ "git": {
20
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
21
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
22
+ },
23
+ "email": "yaning1001@gmail.com",
24
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
25
+ "host": "mms-large-2",
26
+ "username": "chunhui",
27
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
28
+ "codePathLocal": "train_deep_wandb.py",
29
+ "cpu_count": 32,
30
+ "cpu_count_logical": 64,
31
+ "gpu": "NVIDIA RTX A6000",
32
+ "gpu_count": 8,
33
+ "disk": {
34
+ "/": {
35
+ "total": "1888559353856",
36
+ "used": "1728850759680"
37
+ }
38
+ },
39
+ "memory": {
40
+ "total": "202617098240"
41
+ },
42
+ "cpu": {
43
+ "count": 32,
44
+ "countLogical": 64
45
+ },
46
+ "gpu_nvidia": [
47
+ {
48
+ "name": "NVIDIA RTX A6000",
49
+ "memoryTotal": "51527024640",
50
+ "cudaCores": 10752,
51
+ "architecture": "Ampere"
52
+ },
53
+ {
54
+ "name": "NVIDIA RTX A6000",
55
+ "memoryTotal": "51527024640",
56
+ "cudaCores": 10752,
57
+ "architecture": "Ampere"
58
+ },
59
+ {
60
+ "name": "NVIDIA RTX A6000",
61
+ "memoryTotal": "51527024640",
62
+ "cudaCores": 10752,
63
+ "architecture": "Ampere"
64
+ },
65
+ {
66
+ "name": "NVIDIA RTX A6000",
67
+ "memoryTotal": "51527024640",
68
+ "cudaCores": 10752,
69
+ "architecture": "Ampere"
70
+ },
71
+ {
72
+ "name": "NVIDIA RTX A6000",
73
+ "memoryTotal": "51527024640",
74
+ "cudaCores": 10752,
75
+ "architecture": "Ampere"
76
+ },
77
+ {
78
+ "name": "NVIDIA RTX A6000",
79
+ "memoryTotal": "51527024640",
80
+ "cudaCores": 10752,
81
+ "architecture": "Ampere"
82
+ },
83
+ {
84
+ "name": "NVIDIA RTX A6000",
85
+ "memoryTotal": "51527024640",
86
+ "cudaCores": 10752,
87
+ "architecture": "Ampere"
88
+ },
89
+ {
90
+ "name": "NVIDIA RTX A6000",
91
+ "memoryTotal": "51527024640",
92
+ "cudaCores": 10752,
93
+ "architecture": "Ampere"
94
+ }
95
+ ],
96
+ "cudaVersion": "11.8"
97
+ }
wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2024-10-31T00:10:55.615453654-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-31T00:10:55.615464774-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_001055-sr4xke8e/logs/debug-core.log"}
+ {"time":"2024-10-31T00:10:55.72181439-04:00","level":"INFO","msg":"created new stream","id":"sr4xke8e"}
+ {"time":"2024-10-31T00:10:55.7218437-04:00","level":"INFO","msg":"stream: started","id":"sr4xke8e"}
+ {"time":"2024-10-31T00:10:55.721915701-04:00","level":"INFO","msg":"sender: started","stream_id":"sr4xke8e"}
+ {"time":"2024-10-31T00:10:55.721914011-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sr4xke8e"}}
+ {"time":"2024-10-31T00:10:55.721899881-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sr4xke8e"}}
+ {"time":"2024-10-31T00:10:55.919527304-04:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241031_114700-3cqkhntc/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
1
+ funcsigs==1.0.2
2
+ sentry-sdk==2.17.0
3
+ multiprocess==0.70.16
4
+ numpy==1.26.2
5
+ pluralizer==1.2.0
6
+ debugpy==1.6.7
7
+ nvidia-cudnn-cu11==8.5.0.96
8
+ deepspeed==0.15.2
9
+ data==0.4
10
+ pandas==2.1.3
11
+ tomli==2.0.1
12
+ charset-normalizer==3.3.2
13
+ attrs==24.2.0
14
+ aiosignal==1.3.1
15
+ fsspec==2023.10.0
16
+ nvidia-cusparse-cu11==11.7.4.91
17
+ zipp==3.12.0
18
+ mypy-extensions==1.0.0
19
+ datasets==3.0.1
20
+ joblib==1.3.2
21
+ hjson==3.1.0
22
+ traitlets==5.7.1
23
+ stack-data==0.6.0
24
+ transformers==4.45.1
25
+ sympy==1.11.1
26
+ Pygments==2.15.0
27
+ docker-pycreds==0.4.0
28
+ dill==0.3.8
29
+ wheel==0.44.0
30
+ prompt-toolkit==3.0.30
31
+ parso==0.8.3
32
+ ipykernel==6.23.1
33
+ pyarrow==17.0.0
34
+ certifi==2023.11.17
35
+ nvidia-cufft-cu11==10.9.0.58
36
+ six==1.16.0
37
+ pydantic==2.9.2
38
+ click==8.1.7
39
+ nest-asyncio==1.5.6
40
+ gmpy2==2.1.0
41
+ matplotlib==3.8.2
42
+ scipy==1.11.4
43
+ typing_extensions==4.12.2
44
+ statsmodels==0.14.0
45
+ huggingface-hub==0.25.0
46
+ frozenlist==1.4.1
47
+ gpustat==1.1.1
48
+ nvidia-nvtx-cu11==11.7.91
49
+ safetensors==0.4.5
50
+ stanza==1.9.2
51
+ decorator==5.1.1
52
+ seaborn==0.13.0
53
+ sentencepiece==0.2.0
54
+ PyYAML==6.0.1
55
+ black==24.8.0
56
+ protobuf==4.25.1
57
+ pickleshare==0.7.5
58
+ peft==0.13.0
59
+ triton==2.0.0
60
+ nvidia-cuda-runtime-cu11==11.7.99
61
+ Jinja2==3.1.2
62
+ nvidia-cusolver-cu11==11.4.0.1
63
+ executing==1.2.0
64
+ jupyter_client==8.1.0
65
+ pluggy==1.3.0
66
+ cmake==3.30.3
67
+ pytz==2023.3.post1
68
+ aiohappyeyeballs==2.4.2
69
+ kiwisolver==1.4.5
70
+ py-cpuinfo==9.0.0
71
+ Pillow==10.1.0
72
+ ptyprocess==0.7.0
73
+ importlib_resources==6.4.5
74
+ GitPython==3.1.43
75
+ importlib-metadata==6.0.0
76
+ iniconfig==2.0.0
77
+ scikit-learn==1.3.2
78
+ exceptiongroup==1.1.0
79
+ networkx==2.8.6
80
+ accelerate==1.0.0
81
+ nltk==3.8.1
82
+ shutilwhich==1.1.0
83
+ fonttools==4.45.1
84
+ future==0.18.3
85
+ aiohttp==3.10.6
86
+ wcwidth==0.2.5
87
+ idna==3.6
88
+ filelock==3.12.2
89
+ pathspec==0.12.1
90
+ jupyter_core==5.1.0
91
+ lit==18.1.8
92
+ nvidia-curand-cu11==10.2.10.91
93
+ nvidia-cublas-cu11==11.10.3.66
94
+ nvidia-ml-py==12.560.30
95
+ msgpack==1.1.0
96
+ python-dateutil==2.8.2
97
+ blessed==1.20.0
98
+ packaging==23.0
99
+ gitdb==4.0.11
100
+ yarl==1.13.0
101
+ emoji==2.8.0
102
+ tzdata==2023.3
103
+ cycler==0.12.1
104
+ tornado==6.2
105
+ backcall==0.2.0
106
+ plotnine==0.12.4
107
+ ninja==1.11.1.1
108
+ latex==0.7.0
109
+ wandb==0.18.5
110
+ setproctitle==1.3.3
111
+ threadpoolctl==3.2.0
112
+ requests==2.32.3
113
+ pyparsing==3.1.1
114
+ smmap==5.0.1
115
+ pyzmq==23.0.0
116
+ async-timeout==4.0.3
117
+ annotated-types==0.7.0
118
+ matplotlib-inline==0.1.6
119
+ latexcodec==1.0.0
120
+ ipython==8.0.0
121
+ patsy==0.5.3
122
+ contourpy==1.2.0
123
+ multidict==6.1.0
124
+ mizani==0.9.3
125
+ urllib3==2.1.0
126
+ tokenizers==0.20.0
127
+ MarkupSafe==2.1.2
128
+ pip==24.2
129
+ pexpect==4.8.0
130
+ tqdm==4.66.5
131
+ jedi==0.18.2
132
+ pydantic_core==2.23.4
133
+ tempdir==0.7.1
134
+ mpmath==1.2.1
135
+ setuptools==72.1.0
136
+ pytest==7.4.3
137
+ pure-eval==0.2.2
138
+ psutil==5.9.1
139
+ comm==0.1.2
140
+ nvidia-cuda-cupti-cu11==11.7.101
141
+ nvidia-cuda-nvrtc-cu11==11.7.99
142
+ regex==2023.10.3
143
+ platformdirs==2.5.2
144
+ asttokens==2.2.1
145
+ torch==2.0.0
146
+ nvidia-nccl-cu11==2.14.3
147
+ xxhash==3.5.0
wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-10-31T15:47:00.289124Z",
+ "args": [
+ "--perturbation",
+ "reverse_full",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "6",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1753158594560"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241031_114700-q0d78n2b/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
+ funcsigs==1.0.2
+ sentry-sdk==2.17.0
+ multiprocess==0.70.16
+ numpy==1.26.2
+ pluralizer==1.2.0
+ debugpy==1.6.7
+ nvidia-cudnn-cu11==8.5.0.96
+ deepspeed==0.15.2
+ data==0.4
+ pandas==2.1.3
+ tomli==2.0.1
+ charset-normalizer==3.3.2
+ attrs==24.2.0
+ aiosignal==1.3.1
+ fsspec==2023.10.0
+ nvidia-cusparse-cu11==11.7.4.91
+ zipp==3.12.0
+ mypy-extensions==1.0.0
+ datasets==3.0.1
+ joblib==1.3.2
+ hjson==3.1.0
+ traitlets==5.7.1
+ stack-data==0.6.0
+ transformers==4.45.1
+ sympy==1.11.1
+ Pygments==2.15.0
+ docker-pycreds==0.4.0
+ dill==0.3.8
+ wheel==0.44.0
+ prompt-toolkit==3.0.30
+ parso==0.8.3
+ ipykernel==6.23.1
+ pyarrow==17.0.0
+ certifi==2023.11.17
+ nvidia-cufft-cu11==10.9.0.58
+ six==1.16.0
+ pydantic==2.9.2
+ click==8.1.7
+ nest-asyncio==1.5.6
+ gmpy2==2.1.0
+ matplotlib==3.8.2
+ scipy==1.11.4
+ typing_extensions==4.12.2
+ statsmodels==0.14.0
+ huggingface-hub==0.25.0
+ frozenlist==1.4.1
+ gpustat==1.1.1
+ nvidia-nvtx-cu11==11.7.91
+ safetensors==0.4.5
+ stanza==1.9.2
+ decorator==5.1.1
+ seaborn==0.13.0
+ sentencepiece==0.2.0
+ PyYAML==6.0.1
+ black==24.8.0
+ protobuf==4.25.1
+ pickleshare==0.7.5
+ peft==0.13.0
+ triton==2.0.0
+ nvidia-cuda-runtime-cu11==11.7.99
+ Jinja2==3.1.2
+ nvidia-cusolver-cu11==11.4.0.1
+ executing==1.2.0
+ jupyter_client==8.1.0
+ pluggy==1.3.0
+ cmake==3.30.3
+ pytz==2023.3.post1
+ aiohappyeyeballs==2.4.2
+ kiwisolver==1.4.5
+ py-cpuinfo==9.0.0
+ Pillow==10.1.0
+ ptyprocess==0.7.0
+ importlib_resources==6.4.5
+ GitPython==3.1.43
+ importlib-metadata==6.0.0
+ iniconfig==2.0.0
+ scikit-learn==1.3.2
+ exceptiongroup==1.1.0
+ networkx==2.8.6
+ accelerate==1.0.0
+ nltk==3.8.1
+ shutilwhich==1.1.0
+ fonttools==4.45.1
+ future==0.18.3
+ aiohttp==3.10.6
+ wcwidth==0.2.5
+ idna==3.6
+ filelock==3.12.2
+ pathspec==0.12.1
+ jupyter_core==5.1.0
+ lit==18.1.8
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-ml-py==12.560.30
+ msgpack==1.1.0
+ python-dateutil==2.8.2
+ blessed==1.20.0
+ packaging==23.0
+ gitdb==4.0.11
+ yarl==1.13.0
+ emoji==2.8.0
+ tzdata==2023.3
+ cycler==0.12.1
+ tornado==6.2
+ backcall==0.2.0
+ plotnine==0.12.4
+ ninja==1.11.1.1
+ latex==0.7.0
+ wandb==0.18.5
+ setproctitle==1.3.3
+ threadpoolctl==3.2.0
+ requests==2.32.3
+ pyparsing==3.1.1
+ smmap==5.0.1
+ pyzmq==23.0.0
+ async-timeout==4.0.3
+ annotated-types==0.7.0
+ matplotlib-inline==0.1.6
+ latexcodec==1.0.0
+ ipython==8.0.0
+ patsy==0.5.3
+ contourpy==1.2.0
+ multidict==6.1.0
+ mizani==0.9.3
+ urllib3==2.1.0
+ tokenizers==0.20.0
+ MarkupSafe==2.1.2
+ pip==24.2
+ pexpect==4.8.0
+ tqdm==4.66.5
+ jedi==0.18.2
+ pydantic_core==2.23.4
+ tempdir==0.7.1
+ mpmath==1.2.1
+ setuptools==72.1.0
+ pytest==7.4.3
+ pure-eval==0.2.2
+ psutil==5.9.1
+ comm==0.1.2
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ regex==2023.10.3
+ platformdirs==2.5.2
+ asttokens==2.2.1
+ torch==2.0.0
+ nvidia-nccl-cu11==2.14.3
+ xxhash==3.5.0
wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-10-31T15:47:00.243502Z",
+ "args": [
+ "--perturbation",
+ "reverse_full",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "6",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1753158594560"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2024-10-31T11:47:00.246260836-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-10-31T11:47:00.246281016-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-core.log"}
+ {"time":"2024-10-31T11:47:00.352833535-04:00","level":"INFO","msg":"created new stream","id":"q0d78n2b"}
+ {"time":"2024-10-31T11:47:00.352859865-04:00","level":"INFO","msg":"stream: started","id":"q0d78n2b"}
+ {"time":"2024-10-31T11:47:00.352931156-04:00","level":"INFO","msg":"sender: started","stream_id":"q0d78n2b"}
+ {"time":"2024-10-31T11:47:00.352897086-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q0d78n2b"}}
+ {"time":"2024-10-31T11:47:00.352894256-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q0d78n2b"}}
+ {"time":"2024-10-31T11:47:00.611011859-04:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241031_114700-q0d78n2b/logs/debug.log ADDED
@@ -0,0 +1,26 @@
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Configure stats pid to 554145
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug.log
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():621] calling init triggers
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():671] starting backend
+ 2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():675] sending inform_init request
+ 2024-10-31 11:47:00,243 INFO MainThread:554145 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-10-31 11:47:00,243 INFO MainThread:554145 [wandb_init.py:init():688] backend started and connected
+ 2024-10-31 11:47:00,247 INFO MainThread:554145 [wandb_init.py:init():783] updated telemetry
+ 2024-10-31 11:47:00,278 INFO MainThread:554145 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+ 2024-10-31 11:47:00,608 INFO MainThread:554145 [wandb_init.py:init():867] starting run threads in backend
+ 2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+ 2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2376] Wrapping output streams.
+ 2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2401] Redirects installed.
+ 2024-10-31 11:47:00,696 INFO MainThread:554145 [wandb_init.py:init():911] run started, returning control to user process
+ 2024-10-31 11:47:00,697 INFO MainThread:554145 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 0.0001}
wandb/run-20241031_122005-nip14lm6/files/config.yaml ADDED
@@ -0,0 +1,49 @@
+ _wandb:
+ value:
+ cli_version: 0.18.5
+ m: []
+ python_version: 3.9.19
+ t:
+ "1":
+ - 1
+ - 5
+ - 11
+ - 49
+ - 51
+ - 53
+ - 55
+ - 71
+ - 98
+ "2":
+ - 1
+ - 5
+ - 11
+ - 49
+ - 51
+ - 53
+ - 55
+ - 71
+ - 98
+ "3":
+ - 13
+ - 23
+ - 55
+ "4": 3.9.19
+ "5": 0.18.5
+ "6": 4.45.1
+ "8":
+ - 5
+ "12": 0.18.5
+ "13": linux-x86_64
+ batch_size:
+ value: 3
+ epoch:
+ value: 6
+ lr:
+ value: 5e-06
+ perturbation:
+ value: reverse_full
+ seed:
+ value: 0
+ train_set:
+ value: 10M
wandb/run-20241031_122005-nip14lm6/files/output.log ADDED
@@ -0,0 +1,35 @@
+ model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 8.51MB/s]
+ Downloading shards: 0%| | 0/2 [00:22<?, ?it/s]
+ Error in sys.excepthook:
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/exit_hooks.py", line 41, in exc_handler
+ def exc_handler(
+ KeyboardInterrupt
+
+ Original exception was:
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 173, in <module>
+ model = AutoModelForCausalLM.from_pretrained(model_name,
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+ return model_class.from_pretrained(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+ cached_filename = cached_file(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+ resolved_file = hf_hub_download(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+ return f(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+ return fn(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+ return _hf_hub_download_to_cache_dir(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+ with WeakFileLock(lock_path):
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+ return next(self.gen)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+ lock.acquire()
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+ time.sleep(poll_interval)
+ KeyboardInterrupt
wandb/run-20241031_122005-nip14lm6/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
+ funcsigs==1.0.2
+ sentry-sdk==2.17.0
+ multiprocess==0.70.16
+ numpy==1.26.2
+ pluralizer==1.2.0
+ debugpy==1.6.7
+ nvidia-cudnn-cu11==8.5.0.96
+ deepspeed==0.15.2
+ data==0.4
+ pandas==2.1.3
+ tomli==2.0.1
+ charset-normalizer==3.3.2
+ attrs==24.2.0
+ aiosignal==1.3.1
+ fsspec==2023.10.0
+ nvidia-cusparse-cu11==11.7.4.91
+ zipp==3.12.0
+ mypy-extensions==1.0.0
+ datasets==3.0.1
+ joblib==1.3.2
+ hjson==3.1.0
+ traitlets==5.7.1
+ stack-data==0.6.0
+ transformers==4.45.1
+ sympy==1.11.1
+ Pygments==2.15.0
+ docker-pycreds==0.4.0
+ dill==0.3.8
+ wheel==0.44.0
+ prompt-toolkit==3.0.30
+ parso==0.8.3
+ ipykernel==6.23.1
+ pyarrow==17.0.0
+ certifi==2023.11.17
+ nvidia-cufft-cu11==10.9.0.58
+ six==1.16.0
+ pydantic==2.9.2
+ click==8.1.7
+ nest-asyncio==1.5.6
+ gmpy2==2.1.0
+ matplotlib==3.8.2
+ scipy==1.11.4
+ typing_extensions==4.12.2
+ statsmodels==0.14.0
+ huggingface-hub==0.25.0
+ frozenlist==1.4.1
+ gpustat==1.1.1
+ nvidia-nvtx-cu11==11.7.91
+ safetensors==0.4.5
+ stanza==1.9.2
+ decorator==5.1.1
+ seaborn==0.13.0
+ sentencepiece==0.2.0
+ PyYAML==6.0.1
+ black==24.8.0
+ protobuf==4.25.1
+ pickleshare==0.7.5
+ peft==0.13.0
+ triton==2.0.0
+ nvidia-cuda-runtime-cu11==11.7.99
+ Jinja2==3.1.2
+ nvidia-cusolver-cu11==11.4.0.1
+ executing==1.2.0
+ jupyter_client==8.1.0
+ pluggy==1.3.0
+ cmake==3.30.3
+ pytz==2023.3.post1
+ aiohappyeyeballs==2.4.2
+ kiwisolver==1.4.5
+ py-cpuinfo==9.0.0
+ Pillow==10.1.0
+ ptyprocess==0.7.0
+ importlib_resources==6.4.5
+ GitPython==3.1.43
+ importlib-metadata==6.0.0
+ iniconfig==2.0.0
+ scikit-learn==1.3.2
+ exceptiongroup==1.1.0
+ networkx==2.8.6
+ accelerate==1.0.0
+ nltk==3.8.1
+ shutilwhich==1.1.0
+ fonttools==4.45.1
+ future==0.18.3
+ aiohttp==3.10.6
+ wcwidth==0.2.5
+ idna==3.6
+ filelock==3.12.2
+ pathspec==0.12.1
+ jupyter_core==5.1.0
+ lit==18.1.8
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-ml-py==12.560.30
+ msgpack==1.1.0
+ python-dateutil==2.8.2
+ blessed==1.20.0
+ packaging==23.0
+ gitdb==4.0.11
+ yarl==1.13.0
+ emoji==2.8.0
+ tzdata==2023.3
+ cycler==0.12.1
+ tornado==6.2
+ backcall==0.2.0
+ plotnine==0.12.4
+ ninja==1.11.1.1
+ latex==0.7.0
+ wandb==0.18.5
+ setproctitle==1.3.3
+ threadpoolctl==3.2.0
+ requests==2.32.3
+ pyparsing==3.1.1
+ smmap==5.0.1
+ pyzmq==23.0.0
+ async-timeout==4.0.3
+ annotated-types==0.7.0
+ matplotlib-inline==0.1.6
+ latexcodec==1.0.0
+ ipython==8.0.0
+ patsy==0.5.3
+ contourpy==1.2.0
+ multidict==6.1.0
+ mizani==0.9.3
+ urllib3==2.1.0
+ tokenizers==0.20.0
+ MarkupSafe==2.1.2
+ pip==24.2
+ pexpect==4.8.0
+ tqdm==4.66.5
+ jedi==0.18.2
+ pydantic_core==2.23.4
+ tempdir==0.7.1
+ mpmath==1.2.1
+ setuptools==72.1.0
+ pytest==7.4.3
+ pure-eval==0.2.2
+ psutil==5.9.1
+ comm==0.1.2
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ regex==2023.10.3
+ platformdirs==2.5.2
+ asttokens==2.2.1
+ torch==2.0.0
+ nvidia-nccl-cu11==2.14.3
+ xxhash==3.5.0
wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-10-31T16:20:05.846194Z",
+ "args": [
+ "--perturbation",
+ "reverse_full",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "6",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1753159847936"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":23}}
wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb ADDED
File without changes
wandb/run-20241101_012613-k6o0lha8/files/output.log ADDED
@@ -0,0 +1,12 @@
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in <module>
+ dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset
+ builder_instance = load_dataset_builder(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder
+ builder_instance: DatasetBuilder = builder_cls(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__
+ self.config, self.config_id = self._create_builder_config(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config
+ raise ValueError(
+ ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0']
wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-11-01T05:26:13.051361Z",
+ "args": [
+ "--perturbation",
+ "shuffle_nodeterministic",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "6",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1753992237056"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb ADDED
Binary file (3.43 kB).
wandb/run-20241101_012733-9v55tr72/files/output.log ADDED
@@ -0,0 +1,196 @@
1
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:04<00:00, 225385.84it/s]
2
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:00<00:00, 2536102.07it/s]
3
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16426/16426 [00:00<00:00, 27033.16it/s]
4
+ Generating train split: 16425 examples [00:08, 1830.77 examples/s]█████████████████████████████████████████████████████████ | 14150/16426 [00:00<00:00, 29025.68it/s]
5
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:05<00:00, 206269.15it/s]
6
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:00<00:00, 2626307.34it/s]
7
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17014/17014 [00:00<00:00, 24718.45it/s]
8
+ Generating validation split: 17013 examples [00:10, 1685.09 examples/s]█████████████████████████████████▏ | 12320/17014 [00:00<00:00, 26810.27it/s]
9
+ 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:04<00:00, 250719.52it/s]
10
+ 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:00<00:00, 3139247.02it/s]
11
+ 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15439/15439 [00:00<00:00, 27030.75it/s]
12
+ Generating test split: 15438 examples [00:08, 1840.00 examples/s]█████████████████████████████████████████████████████████████▉ | 13736/15439 [00:00<00:00, 34826.46it/s]
13
+ Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.36s/it]
14
+ Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.45s/it]
15
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:54<00:00, 303.11 examples/s]
16
+ Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:57<00:00, 297.31 examples/s]
17
+ tokenized_valid: Dataset({
18
+ features: ['input_ids', 'attention_mask'],
19
+ num_rows: 600
20
+ })
21
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
22
+ warnings.warn(
23
+ [2024-11-01 01:32:35,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ [2024-11-01 01:32:46,120] [INFO] [comm.py:652:init_distributed] cdb=None
25
+ [2024-11-01 01:32:46,120] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
26
+ Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
27
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
28
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
29
+ Loading extension module cpu_adam...
30
+ Time to load cpu_adam op: 5.5455732345581055 seconds
31
+ wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
32
+
33
+ {'loss': 3.0928, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0}
34
+ {'loss': 3.0737, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0}
35
+ {'loss': 3.1154, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
36
+ {'loss': 3.1109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
37
+ {'loss': 3.1179, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
38
+ {'loss': 3.089, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
39
+ {'loss': 3.1042, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
40
+ {'loss': 3.109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
41
+ {'loss': 3.097, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
42
+ {'loss': 3.1119, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
43
+
44
+ {'eval_loss': 3.1238040924072266, 'eval_runtime': 12.4211, 'eval_samples_per_second': 48.305, 'eval_steps_per_second': 1.047, 'epoch': 0.02}
45
+ {'loss': 3.0899, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
46
+ {'loss': 3.1001, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
47
+ {'loss': 3.118, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
48
+ {'loss': 3.1069, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
49
+ {'loss': 3.0758, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.03}
50
+ {'loss': 3.0993, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.04}
51
+ {'loss': 3.0696, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04}
52
+ {'loss': 3.1161, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04}
53
+ {'loss': 3.0935, 'grad_norm': 8.783886909484863, 'learning_rate': 4.9945175438596495e-06, 'epoch': 0.04}
54
+ {'loss': 3.0074, 'grad_norm': 5.36458158493042, 'learning_rate': 4.992690058479532e-06, 'epoch': 0.04}
55
+ {'eval_loss': 2.966599225997925, 'eval_runtime': 11.5387, 'eval_samples_per_second': 51.999, 'eval_steps_per_second': 1.127, 'epoch': 0.04}
56
+ {'loss': 2.9491, 'grad_norm': 4.384786605834961, 'learning_rate': 4.990862573099415e-06, 'epoch': 0.05}
57
+ {'loss': 2.919, 'grad_norm': 5.37711238861084, 'learning_rate': 4.989035087719299e-06, 'epoch': 0.05}
58
+ {'loss': 2.8708, 'grad_norm': 4.1505208015441895, 'learning_rate': 4.987207602339182e-06, 'epoch': 0.05}
59
+ {'loss': 2.8378, 'grad_norm': 2.6863813400268555, 'learning_rate': 4.985380116959065e-06, 'epoch': 0.05}
60
+ {'loss': 2.8356, 'grad_norm': 2.7242753505706787, 'learning_rate': 4.983552631578948e-06, 'epoch': 0.05}
61
+ {'loss': 2.7966, 'grad_norm': 2.6609349250793457, 'learning_rate': 4.9817251461988304e-06, 'epoch': 0.06}
62
+ {'loss': 2.7441, 'grad_norm': 2.2204103469848633, 'learning_rate': 4.979897660818714e-06, 'epoch': 0.06}
63
+ {'loss': 2.7614, 'grad_norm': 2.374406099319458, 'learning_rate': 4.978070175438597e-06, 'epoch': 0.06}
64
+ {'loss': 2.7582, 'grad_norm': 2.696918249130249, 'learning_rate': 4.9762426900584795e-06, 'epoch': 0.06}
65
+ {'loss': 2.6845, 'grad_norm': 1.8222397565841675, 'learning_rate': 4.974415204678363e-06, 'epoch': 0.07}
66
+ {'eval_loss': 2.758892297744751, 'eval_runtime': 11.5398, 'eval_samples_per_second': 51.994, 'eval_steps_per_second': 1.127, 'epoch': 0.07}
67
+ {'loss': 2.6874, 'grad_norm': 2.076284408569336, 'learning_rate': 4.972587719298246e-06, 'epoch': 0.07}
68
+ {'loss': 2.693, 'grad_norm': 2.0412065982818604, 'learning_rate': 4.970760233918129e-06, 'epoch': 0.07}
69
+ {'loss': 2.6601, 'grad_norm': 1.8842229843139648, 'learning_rate': 4.968932748538012e-06, 'epoch': 0.07}
70
+ {'loss': 2.6749, 'grad_norm': 1.756975531578064, 'learning_rate': 4.967105263157895e-06, 'epoch': 0.07}
71
+ {'loss': 2.6141, 'grad_norm': 2.0640175342559814, 'learning_rate': 4.9652777777777786e-06, 'epoch': 0.08}
72
+ {'loss': 2.608, 'grad_norm': 1.5173723697662354, 'learning_rate': 4.963450292397661e-06, 'epoch': 0.08}
73
+ {'loss': 2.5623, 'grad_norm': 1.8280211687088013, 'learning_rate': 4.961622807017544e-06, 'epoch': 0.08}
74
+ {'loss': 2.6046, 'grad_norm': 1.990080714225769, 'learning_rate': 4.959795321637428e-06, 'epoch': 0.08}
75
+ {'loss': 2.5329, 'grad_norm': 1.4346381425857544, 'learning_rate': 4.95796783625731e-06, 'epoch': 0.09}
76
+ {'loss': 2.5307, 'grad_norm': 1.45533287525177, 'learning_rate': 4.956140350877193e-06, 'epoch': 0.09}
77
+ {'eval_loss': 2.629573345184326, 'eval_runtime': 11.5669, 'eval_samples_per_second': 51.872, 'eval_steps_per_second': 1.124, 'epoch': 0.09}
78
+ {'loss': 2.5591, 'grad_norm': 1.5484964847564697, 'learning_rate': 4.954312865497076e-06, 'epoch': 0.09}
79
+ {'loss': 2.5403, 'grad_norm': 1.3261419534683228, 'learning_rate': 4.9524853801169595e-06, 'epoch': 0.09}
80
+ {'loss': 2.5176, 'grad_norm': 1.4234470129013062, 'learning_rate': 4.950657894736843e-06, 'epoch': 0.09}
81
+ {'loss': 2.4846, 'grad_norm': 1.5438008308410645, 'learning_rate': 4.948830409356726e-06, 'epoch': 0.1}
82
+ {'loss': 2.4678, 'grad_norm': 1.4391041994094849, 'learning_rate': 4.947002923976609e-06, 'epoch': 0.1}
83
+ {'loss': 2.5105, 'grad_norm': 1.7514405250549316, 'learning_rate': 4.945175438596491e-06, 'epoch': 0.1}
84
+ {'loss': 2.5033, 'grad_norm': 1.2241393327713013, 'learning_rate': 4.943347953216375e-06, 'epoch': 0.1}
85
+ {'loss': 2.4888, 'grad_norm': 1.4796929359436035, 'learning_rate': 4.941520467836258e-06, 'epoch': 0.11}
86
+ {'loss': 2.497, 'grad_norm': 1.3036240339279175, 'learning_rate': 4.9396929824561404e-06, 'epoch': 0.11}
87
+ {'loss': 2.4566, 'grad_norm': 1.309809923171997, 'learning_rate': 4.937865497076024e-06, 'epoch': 0.11}
88
+ {'eval_loss': 2.558666944503784, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.11}
89
+ {'loss': 2.4567, 'grad_norm': 1.414117455482483, 'learning_rate': 4.936038011695907e-06, 'epoch': 0.11}
90
+ {'loss': 2.49, 'grad_norm': 1.4788432121276855, 'learning_rate': 4.9342105263157895e-06, 'epoch': 0.11}
91
+ {'loss': 2.4243, 'grad_norm': 1.4120174646377563, 'learning_rate': 4.932383040935672e-06, 'epoch': 0.12}
92
+ {'loss': 2.4309, 'grad_norm': 1.6317367553710938, 'learning_rate': 4.930555555555556e-06, 'epoch': 0.12}
93
+ {'loss': 2.4456, 'grad_norm': 1.1397351026535034, 'learning_rate': 4.9287280701754395e-06, 'epoch': 0.12}
94
+ {'loss': 2.4707, 'grad_norm': 1.6220897436141968, 'learning_rate': 4.926900584795322e-06, 'epoch': 0.12}
95
+ {'loss': 2.4705, 'grad_norm': 1.2757837772369385, 'learning_rate': 4.925073099415205e-06, 'epoch': 0.12}
96
+ {'loss': 2.4143, 'grad_norm': 1.3141602277755737, 'learning_rate': 4.9232456140350886e-06, 'epoch': 0.13}
97
+ {'loss': 2.4199, 'grad_norm': 1.5668749809265137, 'learning_rate': 4.921418128654971e-06, 'epoch': 0.13}
98
+ {'loss': 2.4307, 'grad_norm': 1.4094359874725342, 'learning_rate': 4.919590643274854e-06, 'epoch': 0.13}
99
+ {'eval_loss': 2.517282247543335, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.13}
100
+ {'loss': 2.3747, 'grad_norm': 1.752899169921875, 'learning_rate': 4.917763157894737e-06, 'epoch': 0.13}
101
+ {'loss': 2.4056, 'grad_norm': 1.739943027496338, 'learning_rate': 4.91593567251462e-06, 'epoch': 0.14}
102
+ {'loss': 2.4286, 'grad_norm': 1.6286025047302246, 'learning_rate': 4.914108187134503e-06, 'epoch': 0.14}
103
+ {'loss': 2.4063, 'grad_norm': 1.542277455329895, 'learning_rate': 4.912280701754386e-06, 'epoch': 0.14}
104
+ {'loss': 2.412, 'grad_norm': 1.8128482103347778, 'learning_rate': 4.9104532163742695e-06, 'epoch': 0.14}
105
+ {'loss': 2.4342, 'grad_norm': 1.3743454217910767, 'learning_rate': 4.908625730994152e-06, 'epoch': 0.14}
106
+ {'loss': 2.3785, 'grad_norm': 2.225510835647583, 'learning_rate': 4.906798245614036e-06, 'epoch': 0.15}
107
+ {'loss': 2.4023, 'grad_norm': 1.531154990196228, 'learning_rate': 4.904970760233919e-06, 'epoch': 0.15}
108
+ {'loss': 2.4038, 'grad_norm': 1.983007788658142, 'learning_rate': 4.903143274853801e-06, 'epoch': 0.15}
109
+ {'loss': 2.3977, 'grad_norm': 1.4333405494689941, 'learning_rate': 4.901315789473685e-06, 'epoch': 0.15}
110
+ {'eval_loss': 2.488751173019409, 'eval_runtime': 11.6055, 'eval_samples_per_second': 51.7, 'eval_steps_per_second': 1.12, 'epoch': 0.15}
111
+ {'loss': 2.381, 'grad_norm': 1.7076454162597656, 'learning_rate': 4.899488304093568e-06, 'epoch': 0.16}
112
+ {'loss': 2.3719, 'grad_norm': 1.627768874168396, 'learning_rate': 4.8976608187134504e-06, 'epoch': 0.16}
113
+ {'loss': 2.3685, 'grad_norm': 1.3088836669921875, 'learning_rate': 4.895833333333333e-06, 'epoch': 0.16}
114
+ {'loss': 2.3684, 'grad_norm': 1.7792292833328247, 'learning_rate': 4.894005847953217e-06, 'epoch': 0.16}
115
+ {'loss': 2.377, 'grad_norm': 1.4323128461837769, 'learning_rate': 4.8921783625731e-06, 'epoch': 0.16}
116
+ {'loss': 2.366, 'grad_norm': 1.5406019687652588, 'learning_rate': 4.890350877192983e-06, 'epoch': 0.17}
117
+ {'loss': 2.3304, 'grad_norm': 1.864188313484192, 'learning_rate': 4.888523391812866e-06, 'epoch': 0.17}
118
+ {'loss': 2.3666, 'grad_norm': 1.6635836362838745, 'learning_rate': 4.886695906432749e-06, 'epoch': 0.17}
119
+ {'loss': 2.3656, 'grad_norm': 1.360572099685669, 'learning_rate': 4.884868421052632e-06, 'epoch': 0.17}
120
+ {'loss': 2.3807, 'grad_norm': 1.5489475727081299, 'learning_rate': 4.883040935672515e-06, 'epoch': 0.18}
121
+ {'eval_loss': 2.464202880859375, 'eval_runtime': 11.5982, 'eval_samples_per_second': 51.732, 'eval_steps_per_second': 1.121, 'epoch': 0.18}
122
+ {'loss': 2.351, 'grad_norm': 1.4594776630401611, 'learning_rate': 4.881213450292398e-06, 'epoch': 0.18}
123
+ {'loss': 2.3653, 'grad_norm': 1.4087573289871216, 'learning_rate': 4.879385964912281e-06, 'epoch': 0.18}
124
+ {'loss': 2.3573, 'grad_norm': 2.222598075866699, 'learning_rate': 4.877558479532164e-06, 'epoch': 0.18}
125
+ {'loss': 2.4051, 'grad_norm': 1.8786218166351318, 'learning_rate': 4.875730994152047e-06, 'epoch': 0.18}
126
+ {'loss': 2.3461, 'grad_norm': 1.4465943574905396, 'learning_rate': 4.8739035087719296e-06, 'epoch': 0.19}
127
+ {'loss': 2.3144, 'grad_norm': 1.9490894079208374, 'learning_rate': 4.872076023391813e-06, 'epoch': 0.19}
128
+ {'loss': 2.3444, 'grad_norm': 1.7288326025009155, 'learning_rate': 4.870248538011697e-06, 'epoch': 0.19}
129
+ {'loss': 2.3569, 'grad_norm': 1.7530410289764404, 'learning_rate': 4.8684210526315795e-06, 'epoch': 0.19}
130
+ {'loss': 2.3743, 'grad_norm': 1.4135267734527588, 'learning_rate': 4.866593567251462e-06, 'epoch': 0.19}
131
+ {'loss': 2.3217, 'grad_norm': 1.8368803262710571, 'learning_rate': 4.864766081871346e-06, 'epoch': 0.2}
132
+ {'eval_loss': 2.450228452682495, 'eval_runtime': 11.5967, 'eval_samples_per_second': 51.739, 'eval_steps_per_second': 1.121, 'epoch': 0.2}
133
+ {'loss': 2.3678, 'grad_norm': 1.3603743314743042, 'learning_rate': 4.862938596491229e-06, 'epoch': 0.2}
134
+ {'loss': 2.3663, 'grad_norm': 1.9931479692459106, 'learning_rate': 4.861111111111111e-06, 'epoch': 0.2}
135
+ {'loss': 2.3366, 'grad_norm': 1.4983241558074951, 'learning_rate': 4.859283625730994e-06, 'epoch': 0.2}
136
+ {'loss': 2.3388, 'grad_norm': 1.9140528440475464, 'learning_rate': 4.857456140350878e-06, 'epoch': 0.21}
137
+ {'loss': 2.3373, 'grad_norm': 1.4306626319885254, 'learning_rate': 4.8556286549707604e-06, 'epoch': 0.21}
138
+ {'loss': 2.3393, 'grad_norm': 1.8524028062820435, 'learning_rate': 4.853801169590643e-06, 'epoch': 0.21}
139
+ {'loss': 2.3076, 'grad_norm': 1.4418741464614868, 'learning_rate': 4.851973684210527e-06, 'epoch': 0.21}
140
+ {'loss': 2.3428, 'grad_norm': 1.6648645401000977, 'learning_rate': 4.8501461988304095e-06, 'epoch': 0.21}
141
+ {'loss': 2.3321, 'grad_norm': 1.887403130531311, 'learning_rate': 4.848318713450293e-06, 'epoch': 0.22}
142
+ {'loss': 2.3366, 'grad_norm': 1.9936954975128174, 'learning_rate': 4.846491228070176e-06, 'epoch': 0.22}
143
+ {'eval_loss': 2.4382259845733643, 'eval_runtime': 11.6173, 'eval_samples_per_second': 51.647, 'eval_steps_per_second': 1.119, 'epoch': 0.22}
144
+ {'loss': 2.3472, 'grad_norm': 1.8773130178451538, 'learning_rate': 4.844663742690059e-06, 'epoch': 0.22}
145
+ {'loss': 2.3142, 'grad_norm': 1.8776212930679321, 'learning_rate': 4.842836257309942e-06, 'epoch': 0.22}
146
+ {'loss': 2.3106, 'grad_norm': 2.305266857147217, 'learning_rate': 4.841008771929825e-06, 'epoch': 0.23}
147
+ {'loss': 2.3276, 'grad_norm': 2.13682222366333, 'learning_rate': 4.839181286549708e-06, 'epoch': 0.23}
148
+ {'loss': 2.3762, 'grad_norm': 1.4358876943588257, 'learning_rate': 4.8373538011695905e-06, 'epoch': 0.23}
149
+ {'loss': 2.3149, 'grad_norm': 1.7932581901550293, 'learning_rate': 4.835526315789474e-06, 'epoch': 0.23}
150
+ {'loss': 2.275, 'grad_norm': 1.6192528009414673, 'learning_rate': 4.833698830409358e-06, 'epoch': 0.23}
151
+ {'loss': 2.2949, 'grad_norm': 2.0717737674713135, 'learning_rate': 4.83187134502924e-06, 'epoch': 0.24}
152
+ {'loss': 2.3171, 'grad_norm': 1.6378692388534546, 'learning_rate': 4.830043859649123e-06, 'epoch': 0.24}
153
+ {'loss': 2.3051, 'grad_norm': 1.669114112854004, 'learning_rate': 4.828216374269007e-06, 'epoch': 0.24}
154
+ {'eval_loss': 2.4258029460906982, 'eval_runtime': 11.6018, 'eval_samples_per_second': 51.716, 'eval_steps_per_second': 1.121, 'epoch': 0.24}
155
+ {'loss': 2.3045, 'grad_norm': 1.3886950016021729, 'learning_rate': 4.8263888888888895e-06, 'epoch': 0.24}
156
+ {'loss': 2.3076, 'grad_norm': 1.7219699621200562, 'learning_rate': 4.824561403508772e-06, 'epoch': 0.25}
157
+ {'loss': 2.2857, 'grad_norm': 1.4992568492889404, 'learning_rate': 4.822733918128655e-06, 'epoch': 0.25}
158
+ {'loss': 2.3096, 'grad_norm': 1.7140436172485352, 'learning_rate': 4.820906432748539e-06, 'epoch': 0.25}
159
+ {'loss': 2.3194, 'grad_norm': 1.6086301803588867, 'learning_rate': 4.819078947368421e-06, 'epoch': 0.25}
160
+ {'loss': 2.3498, 'grad_norm': 1.5028151273727417, 'learning_rate': 4.817251461988304e-06, 'epoch': 0.25}
161
+ {'loss': 2.2867, 'grad_norm': 1.7474697828292847, 'learning_rate': 4.815423976608188e-06, 'epoch': 0.26}
162
+ {'loss': 2.2662, 'grad_norm': 1.6730782985687256, 'learning_rate': 4.8135964912280704e-06, 'epoch': 0.26}
163
+ {'loss': 2.2926, 'grad_norm': 2.4538962841033936, 'learning_rate': 4.811769005847954e-06, 'epoch': 0.26}
164
+ {'loss': 2.3217, 'grad_norm': 1.6457512378692627, 'learning_rate': 4.809941520467837e-06, 'epoch': 0.26}
165
+ {'eval_loss': 2.408306121826172, 'eval_runtime': 11.6092, 'eval_samples_per_second': 51.683, 'eval_steps_per_second': 1.12, 'epoch': 0.26}
166
+ {'loss': 2.2943, 'grad_norm': 1.8019167184829712, 'learning_rate': 4.8081140350877195e-06, 'epoch': 0.27}
167
+ {'loss': 2.2945, 'grad_norm': 1.3667467832565308, 'learning_rate': 4.806286549707603e-06, 'epoch': 0.27}
168
+ {'loss': 2.2925, 'grad_norm': 1.6296675205230713, 'learning_rate': 4.804459064327486e-06, 'epoch': 0.27}
169
+ {'loss': 2.2902, 'grad_norm': 1.9956955909729004, 'learning_rate': 4.802631578947369e-06, 'epoch': 0.27}
170
+ {'loss': 2.2738, 'grad_norm': 1.850484848022461, 'learning_rate': 4.800804093567251e-06, 'epoch': 0.27}
171
+ {'loss': 2.2809, 'grad_norm': 1.682741403579712, 'learning_rate': 4.798976608187135e-06, 'epoch': 0.28}
172
+ {'loss': 2.2944, 'grad_norm': 1.5462265014648438, 'learning_rate': 4.797149122807018e-06, 'epoch': 0.28}
173
+ {'loss': 2.3055, 'grad_norm': 1.6992024183273315, 'learning_rate': 4.7953216374269005e-06, 'epoch': 0.28}
174
+ {'loss': 2.2811, 'grad_norm': 2.0903217792510986, 'learning_rate': 4.793494152046784e-06, 'epoch': 0.28}
175
+ {'loss': 2.3058, 'grad_norm': 1.9676622152328491, 'learning_rate': 4.791666666666668e-06, 'epoch': 0.28}
176
+ {'eval_loss': 2.40899658203125, 'eval_runtime': 11.5988, 'eval_samples_per_second': 51.729, 'eval_steps_per_second': 1.121, 'epoch': 0.28}
177
+ {'loss': 2.3126, 'grad_norm': 1.6649582386016846, 'learning_rate': 4.78983918128655e-06, 'epoch': 0.29}
178
+ {'loss': 2.2789, 'grad_norm': 2.4453353881835938, 'learning_rate': 4.788011695906433e-06, 'epoch': 0.29}
179
+ {'loss': 2.3292, 'grad_norm': 2.011908769607544, 'learning_rate': 4.786184210526316e-06, 'epoch': 0.29}
180
+ {'loss': 2.2913, 'grad_norm': 1.4906234741210938, 'learning_rate': 4.7843567251461995e-06, 'epoch': 0.29}
181
+ {'loss': 2.2931, 'grad_norm': 2.1027095317840576, 'learning_rate': 4.782529239766082e-06, 'epoch': 0.3}
182
+ {'loss': 2.3031, 'grad_norm': 1.4204366207122803, 'learning_rate': 4.780701754385965e-06, 'epoch': 0.3}
183
+ {'loss': 2.3196, 'grad_norm': 2.1822638511657715, 'learning_rate': 4.778874269005848e-06, 'epoch': 0.3}
184
+ {'loss': 2.2743, 'grad_norm': 1.7422493696212769, 'learning_rate': 4.777046783625731e-06, 'epoch': 0.3}
185
+ {'loss': 2.285, 'grad_norm': 1.6661350727081299, 'learning_rate': 4.775219298245615e-06, 'epoch': 0.3}
186
+ {'loss': 2.295, 'grad_norm': 1.7462584972381592, 'learning_rate': 4.773391812865498e-06, 'epoch': 0.31}
187
+ {'eval_loss': 2.4030065536499023, 'eval_runtime': 11.608, 'eval_samples_per_second': 51.688, 'eval_steps_per_second': 1.12, 'epoch': 0.31}
188
+ {'loss': 2.2568, 'grad_norm': 1.5412518978118896, 'learning_rate': 4.7715643274853804e-06, 'epoch': 0.31}
189
+ {'loss': 2.2638, 'grad_norm': 1.5836228132247925, 'learning_rate': 4.769736842105264e-06, 'epoch': 0.31}
190
+ {'loss': 2.2783, 'grad_norm': 1.7100133895874023, 'learning_rate': 4.767909356725147e-06, 'epoch': 0.31}
191
+ {'loss': 2.3331, 'grad_norm': 1.7988075017929077, 'learning_rate': 4.7660818713450295e-06, 'epoch': 0.32}
192
+ {'loss': 2.297, 'grad_norm': 1.66475510597229, 'learning_rate': 4.764254385964912e-06, 'epoch': 0.32}
193
+ {'loss': 2.2671, 'grad_norm': 1.811797857284546, 'learning_rate': 4.762426900584796e-06, 'epoch': 0.32}
194
+ {'loss': 2.2555, 'grad_norm': 1.4660041332244873, 'learning_rate': 4.760599415204679e-06, 'epoch': 0.32}
195
+ {'loss': 2.2699, 'grad_norm': 2.041257381439209, 'learning_rate': 4.758771929824561e-06, 'epoch': 0.32}
196
+ {'loss': 2.2772, 'grad_norm': 1.6798973083496094, 'learning_rate': 4.756944444444445e-06, 'epoch': 0.33}
wandb/run-20241101_012733-9v55tr72/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
+ funcsigs==1.0.2
+ sentry-sdk==2.17.0
+ multiprocess==0.70.16
+ numpy==1.26.2
+ pluralizer==1.2.0
+ debugpy==1.6.7
+ nvidia-cudnn-cu11==8.5.0.96
+ deepspeed==0.15.2
+ data==0.4
+ pandas==2.1.3
+ tomli==2.0.1
+ charset-normalizer==3.3.2
+ attrs==24.2.0
+ aiosignal==1.3.1
+ fsspec==2023.10.0
+ nvidia-cusparse-cu11==11.7.4.91
+ zipp==3.12.0
+ mypy-extensions==1.0.0
+ datasets==3.0.1
+ joblib==1.3.2
+ hjson==3.1.0
+ traitlets==5.7.1
+ stack-data==0.6.0
+ transformers==4.45.1
+ sympy==1.11.1
+ Pygments==2.15.0
+ docker-pycreds==0.4.0
+ dill==0.3.8
+ wheel==0.44.0
+ prompt-toolkit==3.0.30
+ parso==0.8.3
+ ipykernel==6.23.1
+ pyarrow==17.0.0
+ certifi==2023.11.17
+ nvidia-cufft-cu11==10.9.0.58
+ six==1.16.0
+ pydantic==2.9.2
+ click==8.1.7
+ nest-asyncio==1.5.6
+ gmpy2==2.1.0
+ matplotlib==3.8.2
+ scipy==1.11.4
+ typing_extensions==4.12.2
+ statsmodels==0.14.0
+ huggingface-hub==0.25.0
+ frozenlist==1.4.1
+ gpustat==1.1.1
+ nvidia-nvtx-cu11==11.7.91
+ safetensors==0.4.5
+ stanza==1.9.2
+ decorator==5.1.1
+ seaborn==0.13.0
+ sentencepiece==0.2.0
+ PyYAML==6.0.1
+ black==24.8.0
+ protobuf==4.25.1
+ pickleshare==0.7.5
+ peft==0.13.0
+ triton==2.0.0
+ nvidia-cuda-runtime-cu11==11.7.99
+ Jinja2==3.1.2
+ nvidia-cusolver-cu11==11.4.0.1
+ executing==1.2.0
+ jupyter_client==8.1.0
+ pluggy==1.3.0
+ cmake==3.30.3
+ pytz==2023.3.post1
+ aiohappyeyeballs==2.4.2
+ kiwisolver==1.4.5
+ py-cpuinfo==9.0.0
+ Pillow==10.1.0
+ ptyprocess==0.7.0
+ importlib_resources==6.4.5
+ GitPython==3.1.43
+ importlib-metadata==6.0.0
+ iniconfig==2.0.0
+ scikit-learn==1.3.2
+ exceptiongroup==1.1.0
+ networkx==2.8.6
+ accelerate==1.0.0
+ nltk==3.8.1
+ shutilwhich==1.1.0
+ fonttools==4.45.1
+ future==0.18.3
+ aiohttp==3.10.6
+ wcwidth==0.2.5
+ idna==3.6
+ filelock==3.12.2
+ pathspec==0.12.1
+ jupyter_core==5.1.0
+ lit==18.1.8
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-ml-py==12.560.30
+ msgpack==1.1.0
+ python-dateutil==2.8.2
+ blessed==1.20.0
+ packaging==23.0
+ gitdb==4.0.11
+ yarl==1.13.0
+ emoji==2.8.0
+ tzdata==2023.3
+ cycler==0.12.1
+ tornado==6.2
+ backcall==0.2.0
+ plotnine==0.12.4
+ ninja==1.11.1.1
+ latex==0.7.0
+ wandb==0.18.5
+ setproctitle==1.3.3
+ threadpoolctl==3.2.0
+ requests==2.32.3
+ pyparsing==3.1.1
+ smmap==5.0.1
+ pyzmq==23.0.0
+ async-timeout==4.0.3
+ annotated-types==0.7.0
+ matplotlib-inline==0.1.6
+ latexcodec==1.0.0
+ ipython==8.0.0
+ patsy==0.5.3
+ contourpy==1.2.0
+ multidict==6.1.0
+ mizani==0.9.3
+ urllib3==2.1.0
+ tokenizers==0.20.0
+ MarkupSafe==2.1.2
+ pip==24.2
+ pexpect==4.8.0
+ tqdm==4.66.5
+ jedi==0.18.2
+ pydantic_core==2.23.4
+ tempdir==0.7.1
+ mpmath==1.2.1
+ setuptools==72.1.0
+ pytest==7.4.3
+ pure-eval==0.2.2
+ psutil==5.9.1
+ comm==0.1.2
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ regex==2023.10.3
+ platformdirs==2.5.2
+ asttokens==2.2.1
+ torch==2.0.0
+ nvidia-nccl-cu11==2.14.3
+ xxhash==3.5.0
wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-11-01T05:27:33.891704Z",
+ "args": [
+ "--perturbation",
+ "shuffle_nondeterministic",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "6",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1753992261632"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2024-11-01T01:27:33.89478546-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+ {"time":"2024-11-01T01:27:33.89479698-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-core.log"}
+ {"time":"2024-11-01T01:27:34.000399053-04:00","level":"INFO","msg":"created new stream","id":"9v55tr72"}
+ {"time":"2024-11-01T01:27:34.000437843-04:00","level":"INFO","msg":"stream: started","id":"9v55tr72"}
+ {"time":"2024-11-01T01:27:34.000488243-04:00","level":"INFO","msg":"sender: started","stream_id":"9v55tr72"}
+ {"time":"2024-11-01T01:27:34.000483083-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"9v55tr72"}}
+ {"time":"2024-11-01T01:27:34.000469223-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"9v55tr72"}}
+ {"time":"2024-11-01T01:27:34.182959789-04:00","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241101_012733-9v55tr72/logs/debug.log ADDED
@@ -0,0 +1,29 @@
+ 2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+ 2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Configure stats pid to 678552
+ 2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+ 2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+ 2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug.log
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():621] calling init triggers
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():671] starting backend
+ 2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():675] sending inform_init request
+ 2024-11-01 01:27:33,891 INFO MainThread:678552 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-11-01 01:27:33,891 INFO MainThread:678552 [wandb_init.py:init():688] backend started and connected
+ 2024-11-01 01:27:33,894 INFO MainThread:678552 [wandb_init.py:init():783] updated telemetry
+ 2024-11-01 01:27:33,912 INFO MainThread:678552 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+ 2024-11-01 01:27:34,178 INFO MainThread:678552 [wandb_init.py:init():867] starting run threads in backend
+ 2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+ 2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2376] Wrapping output streams.
+ 2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2401] Redirects installed.
+ 2024-11-01 01:27:34,266 INFO MainThread:678552 [wandb_init.py:init():911] run started, returning control to user process
+ 2024-11-01 01:27:34,266 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
+ 2024-11-01 01:33:19,616 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 150, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 
'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False}
+ 2024-11-01 01:33:19,623 INFO MainThread:678552 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f574a0c3fa0>>
+ 2024-11-01 01:33:19,623 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None
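
The debug.log above records the run hyperparameters (perturbation, train_set, batch_size, epoch, seed, lr) being pushed into the W&B config via config_cb. As a rough, hedged illustration only — the project name and the argument parsing below are assumptions, not code taken from train_deep_wandb.py — a config like this is typically registered as follows:

```python
# Hypothetical sketch: register the hyperparameters seen in debug.log with wandb.
# The project name and argparse setup are assumptions, not the repository's code.
import argparse
import wandb

parser = argparse.ArgumentParser()
parser.add_argument("--perturbation", type=str)
parser.add_argument("--train_set", type=str)
parser.add_argument("--batch_size", type=int)
parser.add_argument("--epoch", type=int)
parser.add_argument("--seed", type=int)
parser.add_argument("--lr", type=float, default=5e-6)
args = parser.parse_args()

run = wandb.init(
    project="impossible_llm",  # assumed project name
    config={
        "perturbation": args.perturbation,
        "train_set": args.train_set,
        "batch_size": args.batch_size,
        "epoch": args.epoch,
        "seed": args.seed,
        "lr": args.lr,
    },
)
```
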
wandb/run-20241101_094656-ae4hctp0/files/output.log ADDED
@@ -0,0 +1,13 @@
+ Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.71s/it]
+ tokenized_valid: Dataset({
+ features: ['input_ids', 'attention_mask'],
+ num_rows: 600
+ })
+ /mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+ warnings.warn(
+ [2024-11-01 09:47:04,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [2024-11-01 09:47:14,377] [INFO] [comm.py:652:init_distributed] cdb=None
+ Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+ Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+ Loading extension module cpu_adam...
+ Time to load cpu_adam op: 4.517135381698608 seconds
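
The output.log above shows a tokenized validation split with input_ids and attention_mask columns (600 rows) before DeepSpeed builds its cpu_adam extension. A minimal sketch, under assumed file paths and sequence length (the data file and max_length below are placeholders, not taken from the repository), of how such a split might be produced with the datasets library:

```python
# Illustrative only: build a tokenized validation Dataset with
# input_ids/attention_mask columns, as reported in the log above.
# The data path and max_length are assumptions; the model repo is gated
# and requires Hugging Face authentication.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
raw_valid = load_dataset("text", data_files={"valid": "data/valid.txt"})["valid"]

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=1024)

tokenized_valid = raw_valid.map(tokenize, batched=True, remove_columns=["text"])
print(tokenized_valid)  # Dataset({features: ['input_ids', 'attention_mask'], ...})
```
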
wandb/run-20241101_094656-ae4hctp0/files/requirements.txt ADDED
@@ -0,0 +1,147 @@
+ funcsigs==1.0.2
+ sentry-sdk==2.17.0
+ multiprocess==0.70.16
+ numpy==1.26.2
+ pluralizer==1.2.0
+ debugpy==1.6.7
+ nvidia-cudnn-cu11==8.5.0.96
+ deepspeed==0.15.2
+ data==0.4
+ pandas==2.1.3
+ tomli==2.0.1
+ charset-normalizer==3.3.2
+ attrs==24.2.0
+ aiosignal==1.3.1
+ fsspec==2023.10.0
+ nvidia-cusparse-cu11==11.7.4.91
+ zipp==3.12.0
+ mypy-extensions==1.0.0
+ datasets==3.0.1
+ joblib==1.3.2
+ hjson==3.1.0
+ traitlets==5.7.1
+ stack-data==0.6.0
+ transformers==4.45.1
+ sympy==1.11.1
+ Pygments==2.15.0
+ docker-pycreds==0.4.0
+ dill==0.3.8
+ wheel==0.44.0
+ prompt-toolkit==3.0.30
+ parso==0.8.3
+ ipykernel==6.23.1
+ pyarrow==17.0.0
+ certifi==2023.11.17
+ nvidia-cufft-cu11==10.9.0.58
+ six==1.16.0
+ pydantic==2.9.2
+ click==8.1.7
+ nest-asyncio==1.5.6
+ gmpy2==2.1.0
+ matplotlib==3.8.2
+ scipy==1.11.4
+ typing_extensions==4.12.2
+ statsmodels==0.14.0
+ huggingface-hub==0.25.0
+ frozenlist==1.4.1
+ gpustat==1.1.1
+ nvidia-nvtx-cu11==11.7.91
+ safetensors==0.4.5
+ stanza==1.9.2
+ decorator==5.1.1
+ seaborn==0.13.0
+ sentencepiece==0.2.0
+ PyYAML==6.0.1
+ black==24.8.0
+ protobuf==4.25.1
+ pickleshare==0.7.5
+ peft==0.13.0
+ triton==2.0.0
+ nvidia-cuda-runtime-cu11==11.7.99
+ Jinja2==3.1.2
+ nvidia-cusolver-cu11==11.4.0.1
+ executing==1.2.0
+ jupyter_client==8.1.0
+ pluggy==1.3.0
+ cmake==3.30.3
+ pytz==2023.3.post1
+ aiohappyeyeballs==2.4.2
+ kiwisolver==1.4.5
+ py-cpuinfo==9.0.0
+ Pillow==10.1.0
+ ptyprocess==0.7.0
+ importlib_resources==6.4.5
+ GitPython==3.1.43
+ importlib-metadata==6.0.0
+ iniconfig==2.0.0
+ scikit-learn==1.3.2
+ exceptiongroup==1.1.0
+ networkx==2.8.6
+ accelerate==1.0.0
+ nltk==3.8.1
+ shutilwhich==1.1.0
+ fonttools==4.45.1
+ future==0.18.3
+ aiohttp==3.10.6
+ wcwidth==0.2.5
+ idna==3.6
+ filelock==3.12.2
+ pathspec==0.12.1
+ jupyter_core==5.1.0
+ lit==18.1.8
+ nvidia-curand-cu11==10.2.10.91
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-ml-py==12.560.30
+ msgpack==1.1.0
+ python-dateutil==2.8.2
+ blessed==1.20.0
+ packaging==23.0
+ gitdb==4.0.11
+ yarl==1.13.0
+ emoji==2.8.0
+ tzdata==2023.3
+ cycler==0.12.1
+ tornado==6.2
+ backcall==0.2.0
+ plotnine==0.12.4
+ ninja==1.11.1.1
+ latex==0.7.0
+ wandb==0.18.5
+ setproctitle==1.3.3
+ threadpoolctl==3.2.0
+ requests==2.32.3
+ pyparsing==3.1.1
+ smmap==5.0.1
+ pyzmq==23.0.0
+ async-timeout==4.0.3
+ annotated-types==0.7.0
+ matplotlib-inline==0.1.6
+ latexcodec==1.0.0
+ ipython==8.0.0
+ patsy==0.5.3
+ contourpy==1.2.0
+ multidict==6.1.0
+ mizani==0.9.3
+ urllib3==2.1.0
+ tokenizers==0.20.0
+ MarkupSafe==2.1.2
+ pip==24.2
+ pexpect==4.8.0
+ tqdm==4.66.5
+ jedi==0.18.2
+ pydantic_core==2.23.4
+ tempdir==0.7.1
+ mpmath==1.2.1
+ setuptools==72.1.0
+ pytest==7.4.3
+ pure-eval==0.2.2
+ psutil==5.9.1
+ comm==0.1.2
+ nvidia-cuda-cupti-cu11==11.7.101
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ regex==2023.10.3
+ platformdirs==2.5.2
+ asttokens==2.2.1
+ torch==2.0.0
+ nvidia-nccl-cu11==2.14.3
+ xxhash==3.5.0
wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-11-01T13:46:56.380225Z",
+ "args": [
+ "--perturbation",
+ "reverse_control",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "7",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1754695659520"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241101_094656-ae4hctp0/logs/debug.log ADDED
@@ -0,0 +1,26 @@
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Configure stats pid to 786690
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Applying login settings: {}
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug.log
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():621] calling init triggers
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():671] starting backend
+ 2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():675] sending inform_init request
+ 2024-11-01 09:46:56,379 INFO MainThread:786690 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-11-01 09:46:56,380 INFO MainThread:786690 [wandb_init.py:init():688] backend started and connected
+ 2024-11-01 09:46:56,383 INFO MainThread:786690 [wandb_init.py:init():783] updated telemetry
+ 2024-11-01 09:46:56,411 INFO MainThread:786690 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+ 2024-11-01 09:46:56,711 INFO MainThread:786690 [wandb_init.py:init():867] starting run threads in backend
+ 2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_console_start():2463] atexit reg
+ 2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+ 2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_redirect():2376] Wrapping output streams.
+ 2024-11-01 09:46:56,847 INFO MainThread:786690 [wandb_run.py:_redirect():2401] Redirects installed.
+ 2024-11-01 09:46:56,849 INFO MainThread:786690 [wandb_init.py:init():911] run started, returning control to user process
+ 2024-11-01 09:46:56,849 INFO MainThread:786690 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06}
wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb ADDED
File without changes
wandb/run-20241101_200517-77b12390/files/output.log ADDED
@@ -0,0 +1,57 @@
+ Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7f1a1c684d60>
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__
+ self.close()
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close
+ self.display(pos=0)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display
+ self.sp(self.__str__() if msg is None else msg)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 459, in print_status
+ fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 452, in fp_write
+ fp.write(str(s))
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner
+ return func(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write
+ cb(data)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in <lambda>
+ lambda data: self._console_raw_callback("stderr", data),
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 403, in wrapper_fn
+ return func(self, *args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1547, in _console_raw_callback
+ self._backend.interface.publish_output_raw(name, data)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 721, in publish_output_raw
+ self._publish_output_raw(o)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 79, in _publish_output_raw
+ self._publish(rec)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 50, in _publish
+ self._assign(record)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 45, in _assign
+ def _assign(self, record: Any) -> None:
+ KeyboardInterrupt:
+ Traceback (most recent call last):
+ File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 173, in <module>
+ model = AutoModelForCausalLM.from_pretrained(model_name,
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+ return model_class.from_pretrained(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+ cached_filename = cached_file(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+ resolved_file = hf_hub_download(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+ return f(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+ return fn(*args, **kwargs)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+ return _hf_hub_download_to_cache_dir(
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+ with WeakFileLock(lock_path):
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+ return next(self.gen)
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+ lock.acquire()
+ File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+ time.sleep(poll_interval)
+ KeyboardInterrupt
wandb/run-20241101_200517-77b12390/files/wandb-metadata.json ADDED
@@ -0,0 +1,97 @@
+ {
+ "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+ "python": "3.9.19",
+ "startedAt": "2024-11-02T00:05:17.462510Z",
+ "args": [
+ "--perturbation",
+ "shuffle_nondeterministic",
+ "--train_set",
+ "10M",
+ "--batch_size",
+ "3",
+ "--epoch",
+ "3",
+ "--seed",
+ "0"
+ ],
+ "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+ "codePath": "train/train_deep_wandb.py",
+ "git": {
+ "remote": "git@hf.co:Yaning1001/Impossible_llm.git",
+ "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+ },
+ "email": "yaning1001@gmail.com",
+ "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+ "host": "mms-large-2",
+ "username": "chunhui",
+ "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+ "codePathLocal": "train_deep_wandb.py",
+ "cpu_count": 32,
+ "cpu_count_logical": 64,
+ "gpu": "NVIDIA RTX A6000",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1888559353856",
+ "used": "1754801557504"
+ }
+ },
+ "memory": {
+ "total": "202617098240"
+ },
+ "cpu": {
+ "count": 32,
+ "countLogical": 64
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ },
+ {
+ "name": "NVIDIA RTX A6000",
+ "memoryTotal": "51527024640",
+ "cudaCores": 10752,
+ "architecture": "Ampere"
+ }
+ ],
+ "cudaVersion": "11.8"
+ }
wandb/run-20241101_200517-77b12390/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb":{"runtime":7}}