Yuanhan Mo committed on
Commit
53198fa
·
1 Parent(s): be5d479

Sync latest local changes before HF migration

Browse files
Config/config_om_contrastive.yaml CHANGED
@@ -17,7 +17,7 @@ timesteps: 80
17
  v_scale: 5.0e-05
18
  # =========================
19
  # TRAINING SETTING
20
- epoch: 10000
21
  epoch_per_save: 1
22
  lr: 0.00001
23
  noise_scale: 0.1
 
17
  v_scale: 5.0e-05
18
  # =========================
19
  # TRAINING SETTING
20
+ epoch: 100
21
  epoch_per_save: 1
22
  lr: 0.00001
23
  noise_scale: 0.1
check_xpu.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Print which PyTorch accelerator backends are visible in this environment.

Reports, in order: the PyTorch version, the Intel Extension for PyTorch
(IPEX) version if it can be imported, the XPU devices PyTorch can see, and
the number of CUDA devices.
"""
import torch

print(f"PyTorch version: {torch.__version__}")

# IPEX is optional: report its absence instead of aborting the check.
try:
    import intel_extension_for_pytorch as ipex
except ImportError:
    print("IPEX not installed")
else:
    print(f"IPEX version: {ipex.__version__}")

# `hasattr` guards against torch builds that do not expose `torch.xpu`.
xpu_ready = hasattr(torch, 'xpu') and torch.xpu.is_available()
if not xpu_ready:
    print("XPU not available")
else:
    n_xpu = torch.xpu.device_count()
    print(f"XPU available: {n_xpu} device(s)")
    for idx in range(n_xpu):
        print(f" XPU {idx}: {torch.xpu.get_device_name(idx)}")

if not torch.cuda.is_available():
    print("CUDA not available")
else:
    print(f"CUDA available: {torch.cuda.device_count()} device(s)")
run_xpu_test.slurm ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash -l
# Slurm batch job that runs OM_contrastive_xpu.py.
#
# Requests 1 node, 1 task and 1 GPU on the pvc9 partition with a 10-hour
# wall-clock limit. stdout and stderr are written to test_xpu_<jobid>.out
# and test_xpu_<jobid>.err respectively (%j expands to the Slurm job ID).
#SBATCH --job-name=test-xpu
#SBATCH --account=AIRR-P51-DAWN-GPU
#SBATCH --partition=pvc9
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:1
#SBATCH --time=10:0:00
#SBATCH --output=test_xpu_%j.out
#SBATCH --error=test_xpu_%j.err

# Load the environment-modules setup, then start from a clean module set
# before loading the cluster's default environment.
source /etc/profile.d/modules.sh
module purge
module load rhel9/default-dawn

# Conda environment used for the run (presumably holds the XPU-enabled
# PyTorch build -- confirm against the environment definition).
conda activate pytorch-xpu

python OM_contrastive_xpu.py
test_xpu_21970354.err ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flatpak: /home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/libcrypto.so.3: version `OPENSSL_3.4.0' not found (required by /lib64/libostree-1.so.1)
2
+ flatpak: /home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/libcrypto.so.3: version `OPENSSL_3.4.0' not found (required by /lib64/librpmio.so.9)
3
+ Loading rhel9/default-dawn
4
+ Loading requirement: rhel9/global rhel9/slurm dawn-env-rhel9/2025-03-23
5
+ /home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/python3.11/site-packages/torch/functional.py:539: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:3637.)
6
+ return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
7
+ Traceback (most recent call last):
8
+ File "/home/dn-mo1/projects/OmniMorph/OM_contrastive_xpu.py", line 64, in <module>
9
+ loss.backward()
10
+ File "/home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/python3.11/site-packages/torch/_tensor.py", line 626, in backward
11
+ torch.autograd.backward(
12
+ File "/home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
13
+ _engine_run_backward(
14
+ File "/home/dn-mo1/miniconda3/envs/pytorch-xpu/lib/python3.11/site-packages/torch/autograd/graph.py", line 823, in _engine_run_backward
15
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
16
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
17
+ RuntimeError: UR backend failed. UR backend returns:40 (UR_RESULT_ERROR_OUT_OF_RESOURCES)
test_xpu_21970354.out ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Using XPU device: Intel(R) Data Center GPU Max 1550
2
+ Start training on xpu with 100 dummy samples...
3
+ Batch 0000 | Loss: 1.000213 | Time: 9.54s
4
+ Batch 0001 | Loss: 0.962420 | Time: 0.88s
5
+ Batch 0002 | Loss: 1.005829 | Time: 0.88s
6
+ Batch 0003 | Loss: 0.993539 | Time: 0.88s
7
+ Batch 0004 | Loss: 1.005484 | Time: 0.88s
8
+ Batch 0005 | Loss: 1.005423 | Time: 0.88s
9
+ Batch 0006 | Loss: 0.958194 | Time: 0.88s
10
+ Batch 0007 | Loss: 1.024623 | Time: 0.88s
11
+ Batch 0008 | Loss: 1.006071 | Time: 0.88s
12
+ Batch 0009 | Loss: 0.989744 | Time: 0.88s
13
+ Batch 0010 | Loss: 1.035000 | Time: 0.88s
14
+ Batch 0011 | Loss: 1.003231 | Time: 0.88s
15
+ Batch 0012 | Loss: 0.968743 | Time: 0.88s
16
+ Batch 0013 | Loss: 1.038968 | Time: 0.88s
17
+ Batch 0014 | Loss: 0.980730 | Time: 0.88s
18
+ Batch 0015 | Loss: 0.982589 | Time: 0.88s
19
+ Batch 0016 | Loss: 1.065865 | Time: 0.88s
20
+ Batch 0017 | Loss: 0.948056 | Time: 0.88s
21
+ Batch 0018 | Loss: 0.958939 | Time: 0.88s
22
+ Batch 0019 | Loss: 1.023215 | Time: 0.88s
23
+ Batch 0020 | Loss: 1.012936 | Time: 0.88s
24
+ Batch 0021 | Loss: 0.992295 | Time: 0.88s
25
+ Batch 0022 | Loss: 1.012918 | Time: 0.88s
26
+ Batch 0023 | Loss: 0.979076 | Time: 0.88s
27
+ Batch 0024 | Loss: 0.958474 | Time: 0.88s
28
+ Batch 0025 | Loss: 1.016140 | Time: 0.88s
29
+ Batch 0026 | Loss: 1.040582 | Time: 0.88s
30
+ Batch 0027 | Loss: 1.020266 | Time: 0.88s
31
+ Batch 0028 | Loss: 0.972614 | Time: 0.88s
32
+ Batch 0029 | Loss: 1.027431 | Time: 0.88s
33
+ Batch 0030 | Loss: 0.978822 | Time: 0.88s
34
+ Batch 0031 | Loss: 1.026631 | Time: 0.88s
35
+ Batch 0032 | Loss: 1.005886 | Time: 0.88s
36
+ Batch 0033 | Loss: 1.035356 | Time: 0.88s
37
+ Batch 0034 | Loss: 1.023209 | Time: 0.88s
38
+ Batch 0035 | Loss: 1.000738 | Time: 0.88s
39
+ Batch 0036 | Loss: 1.015465 | Time: 0.88s
40
+ Batch 0037 | Loss: 0.967925 | Time: 0.88s
41
+ Batch 0038 | Loss: 0.958589 | Time: 0.88s
42
+ Batch 0039 | Loss: 0.977607 | Time: 0.88s
43
+ Batch 0040 | Loss: 1.003756 | Time: 0.88s
44
+ Batch 0041 | Loss: 0.975394 | Time: 0.88s
45
+ Batch 0042 | Loss: 0.987985 | Time: 0.88s
46
+ Batch 0043 | Loss: 0.969551 | Time: 0.88s
47
+ Batch 0044 | Loss: 0.961935 | Time: 0.88s
48
+ Batch 0045 | Loss: 0.995578 | Time: 0.88s
49
+ Batch 0046 | Loss: 0.949855 | Time: 0.88s
50
+ Batch 0047 | Loss: 0.928528 | Time: 0.88s
51
+ Batch 0048 | Loss: 0.965792 | Time: 0.88s
52
+ Batch 0049 | Loss: 0.971804 | Time: 0.88s
53
+ Batch 0050 | Loss: 0.997860 | Time: 0.88s
54
+ Batch 0051 | Loss: 1.005639 | Time: 0.88s
55
+ Batch 0052 | Loss: 0.970109 | Time: 0.88s
56
+ Batch 0053 | Loss: 0.977073 | Time: 0.88s
57
+ Batch 0054 | Loss: 1.027979 | Time: 0.88s
58
+ Batch 0055 | Loss: 1.021092 | Time: 0.88s
59
+ Batch 0056 | Loss: 0.969419 | Time: 0.88s
60
+ Batch 0057 | Loss: 0.989386 | Time: 0.88s
61
+ Batch 0058 | Loss: 0.966944 | Time: 0.88s
62
+ Batch 0059 | Loss: 1.010630 | Time: 0.88s
63
+ Batch 0060 | Loss: 1.001417 | Time: 0.88s
64
+ Batch 0061 | Loss: 1.022217 | Time: 0.88s
65
+ Batch 0062 | Loss: 0.998043 | Time: 0.88s
66
+ Batch 0063 | Loss: 1.035445 | Time: 0.88s
67
+ Batch 0064 | Loss: 1.004846 | Time: 0.88s
68
+ Batch 0065 | Loss: 1.030756 | Time: 0.88s
69
+ Batch 0066 | Loss: 1.041049 | Time: 0.88s
70
+ Batch 0067 | Loss: 0.995926 | Time: 0.88s
71
+ Batch 0068 | Loss: 1.027191 | Time: 0.88s
72
+ Batch 0069 | Loss: 0.984270 | Time: 0.88s
73
+ Batch 0070 | Loss: 1.040981 | Time: 0.88s
74
+ Batch 0071 | Loss: 1.065749 | Time: 0.88s
75
+ Batch 0072 | Loss: 1.014435 | Time: 0.88s
76
+ Batch 0073 | Loss: 0.960895 | Time: 0.89s
77
+ Batch 0074 | Loss: 1.025054 | Time: 0.88s
78
+ Batch 0075 | Loss: 0.944527 | Time: 0.88s
79
+ Batch 0076 | Loss: 0.977657 | Time: 0.88s
80
+ Batch 0077 | Loss: 1.032249 | Time: 0.88s
81
+ Batch 0078 | Loss: 0.955370 | Time: 0.88s
82
+ Batch 0079 | Loss: 0.973704 | Time: 0.88s
83
+ Batch 0080 | Loss: 1.081240 | Time: 0.88s
84
+ Batch 0081 | Loss: 1.004182 | Time: 0.88s
85
+ Batch 0082 | Loss: 1.034096 | Time: 0.88s
86
+ Batch 0083 | Loss: 0.975057 | Time: 0.88s
87
+ Batch 0084 | Loss: 1.023466 | Time: 0.88s
88
+ Batch 0085 | Loss: 0.970958 | Time: 0.88s
89
+ Batch 0086 | Loss: 1.032462 | Time: 0.88s
90
+ Batch 0087 | Loss: 1.038806 | Time: 0.88s
91
+ Batch 0088 | Loss: 1.023562 | Time: 0.88s
92
+ Batch 0089 | Loss: 1.005098 | Time: 0.88s
93
+ Batch 0090 | Loss: 0.958804 | Time: 0.88s
94
+ Batch 0091 | Loss: 1.017045 | Time: 0.88s
95
+ Batch 0092 | Loss: 0.950136 | Time: 0.88s
96
+ Batch 0093 | Loss: 0.978997 | Time: 0.88s
97
+ Batch 0094 | Loss: 0.982369 | Time: 0.88s
98
+ Batch 0095 | Loss: 1.020129 | Time: 0.88s
99
+ Batch 0096 | Loss: 1.011997 | Time: 0.88s
100
+ Batch 0097 | Loss: 1.006916 | Time: 0.88s
101
+ Batch 0098 | Loss: 0.929710 | Time: 0.88s
102
+ Batch 0099 | Loss: 0.980592 | Time: 0.88s
103
+ Epoch 0000 | Avg Loss: 0.997835
104
+ Batch 0000 | Loss: 1.012112 | Time: 0.88s
105
+ Batch 0001 | Loss: 1.033262 | Time: 0.88s
106
+ Batch 0002 | Loss: 1.027011 | Time: 0.88s
107
+ Batch 0003 | Loss: 0.992161 | Time: 0.88s
108
+ Batch 0004 | Loss: 1.036357 | Time: 0.88s
109
+ Batch 0005 | Loss: 0.970923 | Time: 0.88s
110
+ Batch 0006 | Loss: 0.953751 | Time: 0.88s
111
+ Batch 0007 | Loss: 0.975788 | Time: 0.88s
112
+ Batch 0008 | Loss: 0.971225 | Time: 0.88s
113
+ Batch 0009 | Loss: 1.020594 | Time: 0.88s
114
+ Batch 0010 | Loss: 1.046801 | Time: 0.88s
115
+ Batch 0011 | Loss: 0.983430 | Time: 0.88s
116
+ Batch 0012 | Loss: 1.025952 | Time: 0.88s
117
+ Batch 0013 | Loss: 0.997054 | Time: 0.88s
118
+ Batch 0014 | Loss: 0.937820 | Time: 0.88s
119
+ Batch 0015 | Loss: 0.967805 | Time: 0.88s
120
+ Batch 0016 | Loss: 0.999889 | Time: 0.88s
121
+ Batch 0017 | Loss: 0.934704 | Time: 0.88s
122
+ Batch 0018 | Loss: 1.016255 | Time: 0.88s
123
+ Batch 0019 | Loss: 0.979338 | Time: 0.88s
124
+ Batch 0020 | Loss: 0.956785 | Time: 0.88s
125
+ Batch 0021 | Loss: 1.062752 | Time: 0.88s
126
+ Batch 0022 | Loss: 0.985927 | Time: 0.88s
127
+ Batch 0023 | Loss: 0.979186 | Time: 0.88s
128
+ Batch 0024 | Loss: 1.024652 | Time: 0.88s
129
+ Batch 0025 | Loss: 1.010498 | Time: 0.88s
130
+ Batch 0026 | Loss: 0.976131 | Time: 0.88s
131
+ Batch 0027 | Loss: 0.969373 | Time: 0.88s
132
+ Batch 0028 | Loss: 0.996122 | Time: 0.88s
133
+ Batch 0029 | Loss: 1.000917 | Time: 0.88s
134
+ Batch 0030 | Loss: 1.013310 | Time: 0.88s
135
+ Batch 0031 | Loss: 0.948228 | Time: 0.88s
136
+ Batch 0032 | Loss: 0.960574 | Time: 0.88s
137
+ Batch 0033 | Loss: 1.034608 | Time: 0.88s
138
+ Batch 0034 | Loss: 1.036931 | Time: 0.88s
139
+ Batch 0035 | Loss: 1.064918 | Time: 0.88s
140
+ Batch 0036 | Loss: 0.958324 | Time: 0.88s
141
+ Batch 0037 | Loss: 1.014204 | Time: 0.88s
142
+ Batch 0038 | Loss: 1.016461 | Time: 0.88s
143
+ Batch 0039 | Loss: 1.015172 | Time: 0.88s
144
+ Batch 0040 | Loss: 0.955644 | Time: 0.88s
145
+ Batch 0041 | Loss: 0.997143 | Time: 0.88s
146
+ Batch 0042 | Loss: 0.991827 | Time: 0.88s
147
+ Batch 0043 | Loss: 0.997436 | Time: 0.88s
148
+ Batch 0044 | Loss: 0.984573 | Time: 0.88s
149
+ Batch 0045 | Loss: 1.017988 | Time: 0.88s
150
+ Batch 0046 | Loss: 0.978219 | Time: 0.88s
151
+ Batch 0047 | Loss: 1.010932 | Time: 0.88s
152
+ Batch 0048 | Loss: 1.031496 | Time: 0.89s
153
+ Batch 0049 | Loss: 1.004478 | Time: 0.88s
154
+ Batch 0050 | Loss: 0.994104 | Time: 0.88s
155
+ Batch 0051 | Loss: 0.967657 | Time: 0.88s
156
+ Batch 0052 | Loss: 0.988423 | Time: 0.88s
157
+ Batch 0053 | Loss: 0.989502 | Time: 0.88s
158
+ Batch 0054 | Loss: 1.017574 | Time: 0.88s
159
+ Batch 0055 | Loss: 0.999798 | Time: 0.88s
160
+ Batch 0056 | Loss: 0.968825 | Time: 0.88s
161
+ Batch 0057 | Loss: 1.001009 | Time: 0.88s
162
+ Batch 0058 | Loss: 1.019638 | Time: 0.88s
163
+ Batch 0059 | Loss: 1.001356 | Time: 0.88s
164
+ Batch 0060 | Loss: 1.048746 | Time: 0.88s
165
+ Batch 0061 | Loss: 1.029195 | Time: 0.88s
166
+ Batch 0062 | Loss: 1.007311 | Time: 0.88s
167
+ Batch 0063 | Loss: 0.987915 | Time: 0.88s
168
+ Batch 0064 | Loss: 1.040163 | Time: 0.88s
169
+ Batch 0065 | Loss: 0.970650 | Time: 0.88s
170
+ Batch 0066 | Loss: 1.023372 | Time: 0.88s
171
+ Batch 0067 | Loss: 0.967705 | Time: 0.88s
172
+ Batch 0068 | Loss: 1.019943 | Time: 0.88s
173
+ Batch 0069 | Loss: 1.094197 | Time: 0.88s
174
+ Batch 0070 | Loss: 1.027255 | Time: 0.89s
175
+ Batch 0071 | Loss: 0.952136 | Time: 0.88s
176
+ Batch 0072 | Loss: 0.961200 | Time: 0.88s
177
+ Batch 0073 | Loss: 0.979057 | Time: 0.88s
178
+ Batch 0074 | Loss: 1.012684 | Time: 0.89s
179
+ Batch 0075 | Loss: 1.008096 | Time: 0.88s
180
+ Batch 0076 | Loss: 0.986181 | Time: 0.88s
181
+ Batch 0077 | Loss: 1.041546 | Time: 0.88s
182
+ Batch 0078 | Loss: 0.963106 | Time: 0.88s
183
+ Batch 0079 | Loss: 0.999396 | Time: 0.88s
184
+ Batch 0080 | Loss: 1.009895 | Time: 0.88s
185
+ Batch 0081 | Loss: 1.018401 | Time: 0.88s
186
+ Batch 0082 | Loss: 0.999254 | Time: 0.88s
187
+ Batch 0083 | Loss: 0.960079 | Time: 0.88s