English
Blinorot commited on
Commit
179eefc
·
verified ·
1 Parent(s): 80b8a67

Upload selected files from lensless_exps subfolders

Browse files
32x32_librispeech_mse_PSF_Unet4M_U5_Unet4M/checkpoint-epoch100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2809709a77d519fae6b2ba893d2d09748faf5373a2110cbb688fb1375fff2a6f
3
+ size 97418362
32x32_librispeech_mse_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_librispeech_mse_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: 1r070ti9
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.LibrispeechDataset
176
+ max_audio_length: 3
177
+ part: train-clean-100
178
+ roi_kwargs: ${reconstruction.roi_kwargs}
179
+ codec_name: ${codec.codec_name}
180
+ lensless_tag: measurement
181
+ instance_transforms: ${transforms.instance_transforms.train}
182
+ sim_psf_config: ${psf}
183
+ test:
184
+ _target_: src.datasets.LibrispeechDataset
185
+ limit: 1
186
+ max_audio_length: 3
187
+ part: test-clean
188
+ roi_kwargs: ${reconstruction.roi_kwargs}
189
+ codec_name: ${codec.codec_name}
190
+ lensless_tag: measurement
191
+ instance_transforms: ${transforms.instance_transforms.inference}
192
+ sim_psf_config: ${psf}
193
+ music_test:
194
+ _target_: src.datasets.SongDescriberDataset
195
+ limit: 1
196
+ part: test
197
+ roi_kwargs: ${reconstruction.roi_kwargs}
198
+ codec_name: ${codec.codec_name}
199
+ lensless_tag: measurement
200
+ instance_transforms: ${transforms.instance_transforms.inference}
201
+ sim_psf_config: ${psf}
202
+ dataloader:
203
+ train:
204
+ _target_: torch.utils.data.DataLoader
205
+ batch_size: 1
206
+ num_workers: 2
207
+ pin_memory: true
208
+ inference:
209
+ _target_: torch.utils.data.DataLoader
210
+ batch_size: 1
211
+ num_workers: 2
212
+ pin_memory: true
213
+ transforms:
214
+ instance_transforms:
215
+ train:
216
+ all:
217
+ _target_: torchvision.transforms.v2.Compose
218
+ transforms:
219
+ - _target_: src.transforms.PadCrop
220
+ length: 4
221
+ pad_format: replicated
222
+ random_crop: true
223
+ inference: null
224
+ batch_transforms:
225
+ train: null
226
+ inference: null
227
+ codec:
228
+ _target_: src.transforms.CodecEncoderDecoder
229
+ codec_cls: ${resolve_class:dac.DAC}
230
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
231
+ codec_add_root_path: true
232
+ codec_kwargs: null
233
+ codec_name: 32x32_120_16khz_original
234
+ eval_mode: true
235
+ freeze_weights: true
236
+ reconstruction:
237
+ roi_kwargs:
238
+ top_left:
239
+ - 65
240
+ - 118
241
+ height: 256
242
+ width: 256
243
+ resize_coef: 8
244
+ normalize_lensless: true
245
+ corners_list: null
246
+ psf:
247
+ slm: adafruit
248
+ sensor: rpi_hq
249
+ downsample: 8
250
+ rotate: -0.8
251
+ vertical_shift: -20
252
+ horizontal_shift: -20
253
+ flipud: true
254
+ use_waveprop: true
255
+ deadspace: true
256
+ scene2mask: 0.3
257
+ mask2sensor: 0.004
258
+ grayscale: true
259
+ lr_scheduler:
260
+ _target_: torch.optim.lr_scheduler.ConstantLR
261
+ factor: 1
262
+ optimizer:
263
+ _target_: torch.optim.Adam
264
+ lr: 0.0001
265
+ loss_function:
266
+ _target_: src.loss.ReconstructionLoss
267
+ codec_mse_coef: 1
268
+ codec_ssim_coef: 0
269
+ codec_gmsd_coef: 0
270
+ raw_codec_ssim_coef: 0
271
+ raw_codec_l1_coef: 0
272
+ audio_l1_coef: 0
273
+ audio_sisdr_coef: 0
274
+ audio_stft_coef: 0
275
+ audio_mel_coef: 0
276
+ audio_stft_config:
277
+ window_lengths:
278
+ - 2048
279
+ - 512
280
+ audio_mel_config:
281
+ n_mels:
282
+ - 5
283
+ - 10
284
+ - 20
285
+ - 40
286
+ window_lengths:
287
+ - 32
288
+ - 64
289
+ - 128
290
+ - 256
291
+ mel_fmin:
292
+ - 0
293
+ - 0
294
+ - 0
295
+ - 0
296
+ mel_fmax:
297
+ - null
298
+ - null
299
+ - null
300
+ - null
301
+ pow: 1.0
302
+ clamp_eps: 1.0e-05
303
+ mag_weight: 0.0
304
+ resize_coef: ${reconstruction.resize_coef}
305
+ ssim_kernel: 7
306
+ ssim_sigma: 0.5
307
+ raw_ssim_kernel: 11
308
+ trainer:
309
+ log_step: 50
310
+ n_epochs: 100
311
+ epoch_len: 500
312
+ device_tensors:
313
+ - lensless_codec_video
314
+ - lensed_codec_video
315
+ - lensless_psf
316
+ - audio
317
+ resume_from: null
318
+ device: auto
319
+ override: true
320
+ monitor: max test_PSNR
321
+ save_period: 5
322
+ early_stop: ${trainer.n_epochs}
323
+ save_dir: saved
324
+ seed: 1
32x32_librispeech_mse_ssim_PSF_Unet4M_U5_Unet4M/checkpoint-epoch100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31f9b2638fbac8a9a872573ee632c2dd2afa875958ad68a072898967b3543d08
3
+ size 97418426
32x32_librispeech_mse_ssim_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_librispeech_mse_ssim_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: y1ya028x
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.LibrispeechDataset
176
+ max_audio_length: 3
177
+ part: train-clean-100
178
+ roi_kwargs: ${reconstruction.roi_kwargs}
179
+ codec_name: ${codec.codec_name}
180
+ lensless_tag: measurement
181
+ instance_transforms: ${transforms.instance_transforms.train}
182
+ sim_psf_config: ${psf}
183
+ test:
184
+ _target_: src.datasets.LibrispeechDataset
185
+ limit: 1
186
+ max_audio_length: 3
187
+ part: test-clean
188
+ roi_kwargs: ${reconstruction.roi_kwargs}
189
+ codec_name: ${codec.codec_name}
190
+ lensless_tag: measurement
191
+ instance_transforms: ${transforms.instance_transforms.inference}
192
+ sim_psf_config: ${psf}
193
+ music_test:
194
+ _target_: src.datasets.SongDescriberDataset
195
+ limit: 1
196
+ part: test
197
+ roi_kwargs: ${reconstruction.roi_kwargs}
198
+ codec_name: ${codec.codec_name}
199
+ lensless_tag: measurement
200
+ instance_transforms: ${transforms.instance_transforms.inference}
201
+ sim_psf_config: ${psf}
202
+ dataloader:
203
+ train:
204
+ _target_: torch.utils.data.DataLoader
205
+ batch_size: 1
206
+ num_workers: 2
207
+ pin_memory: true
208
+ inference:
209
+ _target_: torch.utils.data.DataLoader
210
+ batch_size: 1
211
+ num_workers: 2
212
+ pin_memory: true
213
+ transforms:
214
+ instance_transforms:
215
+ train:
216
+ all:
217
+ _target_: torchvision.transforms.v2.Compose
218
+ transforms:
219
+ - _target_: src.transforms.PadCrop
220
+ length: 4
221
+ pad_format: replicated
222
+ random_crop: true
223
+ inference: null
224
+ batch_transforms:
225
+ train: null
226
+ inference: null
227
+ codec:
228
+ _target_: src.transforms.CodecEncoderDecoder
229
+ codec_cls: ${resolve_class:dac.DAC}
230
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
231
+ codec_add_root_path: true
232
+ codec_kwargs: null
233
+ codec_name: 32x32_120_16khz_original
234
+ eval_mode: true
235
+ freeze_weights: true
236
+ reconstruction:
237
+ roi_kwargs:
238
+ top_left:
239
+ - 65
240
+ - 118
241
+ height: 256
242
+ width: 256
243
+ resize_coef: 8
244
+ normalize_lensless: true
245
+ corners_list: null
246
+ psf:
247
+ slm: adafruit
248
+ sensor: rpi_hq
249
+ downsample: 8
250
+ rotate: -0.8
251
+ vertical_shift: -20
252
+ horizontal_shift: -20
253
+ flipud: true
254
+ use_waveprop: true
255
+ deadspace: true
256
+ scene2mask: 0.3
257
+ mask2sensor: 0.004
258
+ grayscale: true
259
+ lr_scheduler:
260
+ _target_: torch.optim.lr_scheduler.ConstantLR
261
+ factor: 1
262
+ optimizer:
263
+ _target_: torch.optim.Adam
264
+ lr: 0.0001
265
+ loss_function:
266
+ _target_: src.loss.ReconstructionLoss
267
+ codec_mse_coef: 1
268
+ codec_ssim_coef: 1
269
+ codec_gmsd_coef: 0
270
+ raw_codec_ssim_coef: 0
271
+ raw_codec_l1_coef: 0
272
+ audio_l1_coef: 0
273
+ audio_sisdr_coef: 0
274
+ audio_stft_coef: 0
275
+ audio_mel_coef: 0
276
+ audio_stft_config:
277
+ window_lengths:
278
+ - 2048
279
+ - 512
280
+ audio_mel_config:
281
+ n_mels:
282
+ - 5
283
+ - 10
284
+ - 20
285
+ - 40
286
+ window_lengths:
287
+ - 32
288
+ - 64
289
+ - 128
290
+ - 256
291
+ mel_fmin:
292
+ - 0
293
+ - 0
294
+ - 0
295
+ - 0
296
+ mel_fmax:
297
+ - null
298
+ - null
299
+ - null
300
+ - null
301
+ pow: 1.0
302
+ clamp_eps: 1.0e-05
303
+ mag_weight: 0.0
304
+ resize_coef: ${reconstruction.resize_coef}
305
+ ssim_kernel: 7
306
+ ssim_sigma: 0.5
307
+ raw_ssim_kernel: 11
308
+ trainer:
309
+ log_step: 50
310
+ n_epochs: 100
311
+ epoch_len: 500
312
+ device_tensors:
313
+ - lensless_codec_video
314
+ - lensed_codec_video
315
+ - lensless_psf
316
+ - audio
317
+ resume_from: null
318
+ device: auto
319
+ override: true
320
+ monitor: max test_PSNR
321
+ save_period: 5
322
+ early_stop: ${trainer.n_epochs}
323
+ save_dir: saved
324
+ seed: 1
32x32_librispeech_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M/checkpoint-epoch100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5c207fdf5ff4cc38d37d1a3a2117beba802b5f2f574920ac5808277688b281c
3
+ size 97416762
32x32_librispeech_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_librispeech_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: 3j7yioen
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.LibrispeechDataset
176
+ max_audio_length: 3
177
+ part: train-clean-100
178
+ roi_kwargs: ${reconstruction.roi_kwargs}
179
+ codec_name: ${codec.codec_name}
180
+ lensless_tag: measurement
181
+ instance_transforms: ${transforms.instance_transforms.train}
182
+ sim_psf_config: ${psf}
183
+ test:
184
+ _target_: src.datasets.LibrispeechDataset
185
+ limit: 1
186
+ max_audio_length: 3
187
+ part: test-clean
188
+ roi_kwargs: ${reconstruction.roi_kwargs}
189
+ codec_name: ${codec.codec_name}
190
+ lensless_tag: measurement
191
+ instance_transforms: ${transforms.instance_transforms.inference}
192
+ sim_psf_config: ${psf}
193
+ dataloader:
194
+ train:
195
+ _target_: torch.utils.data.DataLoader
196
+ batch_size: 1
197
+ num_workers: 2
198
+ pin_memory: true
199
+ inference:
200
+ _target_: torch.utils.data.DataLoader
201
+ batch_size: 1
202
+ num_workers: 2
203
+ pin_memory: true
204
+ transforms:
205
+ instance_transforms:
206
+ train:
207
+ all:
208
+ _target_: torchvision.transforms.v2.Compose
209
+ transforms:
210
+ - _target_: src.transforms.PadCrop
211
+ length: 4
212
+ pad_format: replicated
213
+ random_crop: true
214
+ inference: null
215
+ batch_transforms:
216
+ train: null
217
+ inference: null
218
+ codec:
219
+ _target_: src.transforms.CodecEncoderDecoder
220
+ codec_cls: ${resolve_class:dac.DAC}
221
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
222
+ codec_add_root_path: true
223
+ codec_kwargs: null
224
+ codec_name: 32x32_120_16khz_original
225
+ eval_mode: true
226
+ freeze_weights: true
227
+ reconstruction:
228
+ roi_kwargs:
229
+ top_left:
230
+ - 65
231
+ - 118
232
+ height: 256
233
+ width: 256
234
+ resize_coef: 8
235
+ normalize_lensless: true
236
+ corners_list: null
237
+ psf:
238
+ slm: adafruit
239
+ sensor: rpi_hq
240
+ downsample: 8
241
+ rotate: -0.8
242
+ vertical_shift: -20
243
+ horizontal_shift: -20
244
+ flipud: true
245
+ use_waveprop: true
246
+ deadspace: true
247
+ scene2mask: 0.3
248
+ mask2sensor: 0.004
249
+ grayscale: true
250
+ lr_scheduler:
251
+ _target_: torch.optim.lr_scheduler.ConstantLR
252
+ factor: 1
253
+ optimizer:
254
+ _target_: torch.optim.Adam
255
+ lr: 0.0001
256
+ loss_function:
257
+ _target_: src.loss.ReconstructionLoss
258
+ codec_mse_coef: 1
259
+ codec_ssim_coef: 1
260
+ codec_gmsd_coef: 0
261
+ raw_codec_ssim_coef: 1
262
+ raw_codec_l1_coef: 0
263
+ audio_l1_coef: 0
264
+ audio_sisdr_coef: 0
265
+ audio_stft_coef: 0
266
+ audio_mel_coef: 0
267
+ audio_stft_config:
268
+ window_lengths:
269
+ - 2048
270
+ - 512
271
+ audio_mel_config:
272
+ n_mels:
273
+ - 5
274
+ - 10
275
+ - 20
276
+ - 40
277
+ window_lengths:
278
+ - 32
279
+ - 64
280
+ - 128
281
+ - 256
282
+ mel_fmin:
283
+ - 0
284
+ - 0
285
+ - 0
286
+ - 0
287
+ mel_fmax:
288
+ - null
289
+ - null
290
+ - null
291
+ - null
292
+ pow: 1.0
293
+ clamp_eps: 1.0e-05
294
+ mag_weight: 0.0
295
+ resize_coef: ${reconstruction.resize_coef}
296
+ ssim_kernel: 7
297
+ ssim_sigma: 0.5
298
+ raw_ssim_kernel: 11
299
+ trainer:
300
+ log_step: 50
301
+ n_epochs: 100
302
+ epoch_len: 500
303
+ device_tensors:
304
+ - lensless_codec_video
305
+ - lensed_codec_video
306
+ - lensless_psf
307
+ - audio
308
+ resume_from: null
309
+ device: auto
310
+ override: true
311
+ monitor: max test_PSNR
312
+ save_period: 5
313
+ early_stop: ${trainer.n_epochs}
314
+ save_dir: saved
315
+ seed: 1
32x32_librispeech_mse_ssim_raw_ssim_l1_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_librispeech_mse_ssim_raw_ssim_l1_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: qsjm2uok
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.LibrispeechDataset
176
+ max_audio_length: 3
177
+ part: train-clean-100
178
+ roi_kwargs: ${reconstruction.roi_kwargs}
179
+ codec_name: ${codec.codec_name}
180
+ lensless_tag: measurement
181
+ instance_transforms: ${transforms.instance_transforms.train}
182
+ sim_psf_config: ${psf}
183
+ test:
184
+ _target_: src.datasets.LibrispeechDataset
185
+ limit: 1
186
+ max_audio_length: 3
187
+ part: test-clean
188
+ roi_kwargs: ${reconstruction.roi_kwargs}
189
+ codec_name: ${codec.codec_name}
190
+ lensless_tag: measurement
191
+ instance_transforms: ${transforms.instance_transforms.inference}
192
+ sim_psf_config: ${psf}
193
+ music_test:
194
+ _target_: src.datasets.SongDescriberDataset
195
+ limit: 1
196
+ part: test
197
+ roi_kwargs: ${reconstruction.roi_kwargs}
198
+ codec_name: ${codec.codec_name}
199
+ lensless_tag: measurement
200
+ instance_transforms: ${transforms.instance_transforms.inference}
201
+ sim_psf_config: ${psf}
202
+ dataloader:
203
+ train:
204
+ _target_: torch.utils.data.DataLoader
205
+ batch_size: 1
206
+ num_workers: 2
207
+ pin_memory: true
208
+ inference:
209
+ _target_: torch.utils.data.DataLoader
210
+ batch_size: 1
211
+ num_workers: 2
212
+ pin_memory: true
213
+ transforms:
214
+ instance_transforms:
215
+ train:
216
+ all:
217
+ _target_: torchvision.transforms.v2.Compose
218
+ transforms:
219
+ - _target_: src.transforms.PadCrop
220
+ length: 4
221
+ pad_format: replicated
222
+ random_crop: true
223
+ inference: null
224
+ batch_transforms:
225
+ train: null
226
+ inference: null
227
+ codec:
228
+ _target_: src.transforms.CodecEncoderDecoder
229
+ codec_cls: ${resolve_class:dac.DAC}
230
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
231
+ codec_add_root_path: true
232
+ codec_kwargs: null
233
+ codec_name: 32x32_120_16khz_original
234
+ eval_mode: true
235
+ freeze_weights: true
236
+ reconstruction:
237
+ roi_kwargs:
238
+ top_left:
239
+ - 65
240
+ - 118
241
+ height: 256
242
+ width: 256
243
+ resize_coef: 8
244
+ normalize_lensless: true
245
+ corners_list: null
246
+ psf:
247
+ slm: adafruit
248
+ sensor: rpi_hq
249
+ downsample: 8
250
+ rotate: -0.8
251
+ vertical_shift: -20
252
+ horizontal_shift: -20
253
+ flipud: true
254
+ use_waveprop: true
255
+ deadspace: true
256
+ scene2mask: 0.3
257
+ mask2sensor: 0.004
258
+ grayscale: true
259
+ lr_scheduler:
260
+ _target_: torch.optim.lr_scheduler.ConstantLR
261
+ factor: 1
262
+ optimizer:
263
+ _target_: torch.optim.Adam
264
+ lr: 0.0001
265
+ loss_function:
266
+ _target_: src.loss.ReconstructionLoss
267
+ codec_mse_coef: 1
268
+ codec_ssim_coef: 1
269
+ codec_gmsd_coef: 0
270
+ raw_codec_ssim_coef: 1
271
+ raw_codec_l1_coef: 0
272
+ audio_l1_coef: 1
273
+ audio_sisdr_coef: 0
274
+ audio_stft_coef: 0
275
+ audio_mel_coef: 0
276
+ audio_stft_config:
277
+ window_lengths:
278
+ - 2048
279
+ - 512
280
+ audio_mel_config:
281
+ n_mels:
282
+ - 5
283
+ - 10
284
+ - 20
285
+ - 40
286
+ window_lengths:
287
+ - 32
288
+ - 64
289
+ - 128
290
+ - 256
291
+ mel_fmin:
292
+ - 0
293
+ - 0
294
+ - 0
295
+ - 0
296
+ mel_fmax:
297
+ - null
298
+ - null
299
+ - null
300
+ - null
301
+ pow: 1.0
302
+ clamp_eps: 1.0e-05
303
+ mag_weight: 0.0
304
+ resize_coef: ${reconstruction.resize_coef}
305
+ ssim_kernel: 7
306
+ ssim_sigma: 0.5
307
+ raw_ssim_kernel: 11
308
+ trainer:
309
+ log_step: 50
310
+ n_epochs: 100
311
+ epoch_len: 500
312
+ device_tensors:
313
+ - lensless_codec_video
314
+ - lensed_codec_video
315
+ - lensless_psf
316
+ - audio
317
+ resume_from: null
318
+ device: auto
319
+ override: true
320
+ monitor: max test_PSNR
321
+ save_period: 5
322
+ early_stop: ${trainer.n_epochs}
323
+ save_dir: saved
324
+ seed: 1
32x32_librispeech_mse_ssim_raw_ssim_mel_PSF_Unet4M_U5_Unet4M/checkpoint-epoch100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1acf42d38d028f8911caa5385e1136da714f181b62d6c9e88455db8783ff547a
3
+ size 97418426
32x32_librispeech_mse_ssim_raw_ssim_mel_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_librispeech_mse_ssim_raw_ssim_mel_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: dfjjqazw
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.LibrispeechDataset
176
+ max_audio_length: 3
177
+ part: train-clean-100
178
+ roi_kwargs: ${reconstruction.roi_kwargs}
179
+ codec_name: ${codec.codec_name}
180
+ lensless_tag: measurement
181
+ instance_transforms: ${transforms.instance_transforms.train}
182
+ sim_psf_config: ${psf}
183
+ test:
184
+ _target_: src.datasets.LibrispeechDataset
185
+ limit: 1
186
+ max_audio_length: 3
187
+ part: test-clean
188
+ roi_kwargs: ${reconstruction.roi_kwargs}
189
+ codec_name: ${codec.codec_name}
190
+ lensless_tag: measurement
191
+ instance_transforms: ${transforms.instance_transforms.inference}
192
+ sim_psf_config: ${psf}
193
+ music_test:
194
+ _target_: src.datasets.SongDescriberDataset
195
+ limit: 1
196
+ part: test
197
+ roi_kwargs: ${reconstruction.roi_kwargs}
198
+ codec_name: ${codec.codec_name}
199
+ lensless_tag: measurement
200
+ instance_transforms: ${transforms.instance_transforms.inference}
201
+ sim_psf_config: ${psf}
202
+ dataloader:
203
+ train:
204
+ _target_: torch.utils.data.DataLoader
205
+ batch_size: 1
206
+ num_workers: 2
207
+ pin_memory: true
208
+ inference:
209
+ _target_: torch.utils.data.DataLoader
210
+ batch_size: 1
211
+ num_workers: 2
212
+ pin_memory: true
213
+ transforms:
214
+ instance_transforms:
215
+ train:
216
+ all:
217
+ _target_: torchvision.transforms.v2.Compose
218
+ transforms:
219
+ - _target_: src.transforms.PadCrop
220
+ length: 4
221
+ pad_format: replicated
222
+ random_crop: true
223
+ inference: null
224
+ batch_transforms:
225
+ train: null
226
+ inference: null
227
+ codec:
228
+ _target_: src.transforms.CodecEncoderDecoder
229
+ codec_cls: ${resolve_class:dac.DAC}
230
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
231
+ codec_add_root_path: true
232
+ codec_kwargs: null
233
+ codec_name: 32x32_120_16khz_original
234
+ eval_mode: true
235
+ freeze_weights: true
236
+ reconstruction:
237
+ roi_kwargs:
238
+ top_left:
239
+ - 65
240
+ - 118
241
+ height: 256
242
+ width: 256
243
+ resize_coef: 8
244
+ normalize_lensless: true
245
+ corners_list: null
246
+ psf:
247
+ slm: adafruit
248
+ sensor: rpi_hq
249
+ downsample: 8
250
+ rotate: -0.8
251
+ vertical_shift: -20
252
+ horizontal_shift: -20
253
+ flipud: true
254
+ use_waveprop: true
255
+ deadspace: true
256
+ scene2mask: 0.3
257
+ mask2sensor: 0.004
258
+ grayscale: true
259
+ lr_scheduler:
260
+ _target_: torch.optim.lr_scheduler.ConstantLR
261
+ factor: 1
262
+ optimizer:
263
+ _target_: torch.optim.Adam
264
+ lr: 0.0001
265
+ loss_function:
266
+ _target_: src.loss.ReconstructionLoss
267
+ codec_mse_coef: 1
268
+ codec_ssim_coef: 1
269
+ codec_gmsd_coef: 0
270
+ raw_codec_ssim_coef: 1
271
+ raw_codec_l1_coef: 0
272
+ audio_l1_coef: 0
273
+ audio_sisdr_coef: 0
274
+ audio_stft_coef: 0
275
+ audio_mel_coef: 0.1
276
+ audio_stft_config:
277
+ window_lengths:
278
+ - 2048
279
+ - 512
280
+ audio_mel_config:
281
+ n_mels:
282
+ - 5
283
+ - 10
284
+ - 20
285
+ - 40
286
+ window_lengths:
287
+ - 32
288
+ - 64
289
+ - 128
290
+ - 256
291
+ mel_fmin:
292
+ - 0
293
+ - 0
294
+ - 0
295
+ - 0
296
+ mel_fmax:
297
+ - null
298
+ - null
299
+ - null
300
+ - null
301
+ pow: 1.0
302
+ clamp_eps: 1.0e-05
303
+ mag_weight: 0.0
304
+ resize_coef: ${reconstruction.resize_coef}
305
+ ssim_kernel: 7
306
+ ssim_sigma: 0.5
307
+ raw_ssim_kernel: 11
308
+ trainer:
309
+ log_step: 50
310
+ n_epochs: 100
311
+ epoch_len: 500
312
+ device_tensors:
313
+ - lensless_codec_video
314
+ - lensed_codec_video
315
+ - lensless_psf
316
+ - audio
317
+ resume_from: null
318
+ device: auto
319
+ override: true
320
+ monitor: max test_PSNR
321
+ save_period: 5
322
+ early_stop: ${trainer.n_epochs}
323
+ save_dir: saved
324
+ seed: 1
32x32_random_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M/checkpoint-epoch100.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a894df0b3866eeecd507bedceefaf8a381d7644be1b5b1877321ac4b31c39ce1
3
+ size 97416570
32x32_random_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M/config.yaml ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ _target_: src.model.LenslessWrapper
3
+ use_loader: false
4
+ loader_kwargs: null
5
+ use_batch_video_version: false
6
+ freeze_weights: false
7
+ psf_path: data/digicam_psf/SIM_psf.png
8
+ psf_loader_kwargs:
9
+ downsample: 8
10
+ return_bg: false
11
+ grayscale_psf: true
12
+ recon_name: UnrolledADMM
13
+ recon_kwargs:
14
+ n_iter: 5
15
+ mu1: 0.0001
16
+ mu2: 0.0001
17
+ mu3: 0.0001
18
+ tau: 0.0002
19
+ pre_process:
20
+ _target_: lensless.recon.drunet.network_unet.UNetRes
21
+ in_nc: 2
22
+ out_nc: 1
23
+ nc:
24
+ - 32
25
+ - 64
26
+ - 112
27
+ - 128
28
+ nb: 4
29
+ act_mode: R
30
+ downsample_mode: strideconv
31
+ upsample_mode: convtranspose
32
+ post_process:
33
+ _target_: lensless.recon.drunet.network_unet.UNetRes
34
+ in_nc: 2
35
+ out_nc: 1
36
+ nc:
37
+ - 32
38
+ - 64
39
+ - 116
40
+ - 128
41
+ nb: 4
42
+ act_mode: R
43
+ downsample_mode: strideconv
44
+ upsample_mode: convtranspose
45
+ psf_network:
46
+ _target_: lensless.recon.drunet.network_unet.UNetRes
47
+ in_nc: 2
48
+ out_nc: 1
49
+ nc:
50
+ - 4
51
+ - 8
52
+ - 16
53
+ - 32
54
+ nb: 4
55
+ act_mode: R
56
+ downsample_mode: strideconv
57
+ upsample_mode: convtranspose
58
+ psf_residual: false
59
+ skip_unrolled: false
60
+ return_intermediate: false
61
+ writer:
62
+ _target_: src.logger.WandBWriter
63
+ project_name: lenslessmic
64
+ entity: null
65
+ run_name: 32x32_random_mse_ssim_raw_ssim_PSF_Unet4M_U5_Unet4M
66
+ mode: online
67
+ loss_names:
68
+ - loss
69
+ - codec_mse_loss
70
+ - codec_ssim_loss
71
+ - raw_codec_ssim_loss
72
+ - raw_codec_l1_loss
73
+ - audio_l1_loss
74
+ - audio_sisdr_loss
75
+ - audio_stft_loss
76
+ - audio_mel_loss
77
+ log_checkpoints: false
78
+ id_length: 8
79
+ names:
80
+ - input-1:frame
81
+ - input-2:frame
82
+ - input-3:frame
83
+ - input-4:frame
84
+ figsize:
85
+ - 15
86
+ - 15
87
+ sample_rate: 16000
88
+ run_id: u0zp8tee
89
+ metrics:
90
+ device: auto
91
+ train:
92
+ - _target_: src.metrics.SISDRMetric
93
+ name: SISDR
94
+ - _target_: src.metrics.PSNRMetric
95
+ name: PSNR
96
+ - _target_: src.metrics.QuantizationMatchMetric
97
+ name: QuantizationMatch-all
98
+ codebook_index: all
99
+ inference:
100
+ - _target_: src.metrics.SISDRMetric
101
+ name: SISDR
102
+ - _target_: src.metrics.STOIMetric
103
+ name: STOI
104
+ - _target_: src.metrics.WERMetric
105
+ name: WER
106
+ - _target_: src.metrics.PESQMetric
107
+ name: PESQ
108
+ - _target_: src.metrics.MelMetric
109
+ name: Mel
110
+ audio_mel_config:
111
+ n_mels:
112
+ - 5
113
+ - 10
114
+ - 20
115
+ - 40
116
+ - 80
117
+ - 160
118
+ - 320
119
+ window_lengths:
120
+ - 32
121
+ - 64
122
+ - 128
123
+ - 256
124
+ - 512
125
+ - 1024
126
+ - 2048
127
+ mel_fmin:
128
+ - 0
129
+ - 0
130
+ - 0
131
+ - 0
132
+ - 0
133
+ - 0
134
+ - 0
135
+ mel_fmax:
136
+ - null
137
+ - null
138
+ - null
139
+ - null
140
+ - null
141
+ - null
142
+ - null
143
+ pow: 1.0
144
+ clamp_eps: 1.0e-05
145
+ mag_weight: 0.0
146
+ - _target_: src.metrics.STFTMetric
147
+ name: STFT
148
+ audio_stft_config:
149
+ window_lengths:
150
+ - 2048
151
+ - 512
152
+ - _target_: src.metrics.QuantizationMatchMetric
153
+ name: QuantizationMatch-all
154
+ codebook_index: all
155
+ - _target_: src.metrics.QuantizationMatchMetric
156
+ name: QuantizationMatch-1
157
+ codebook_index: 1
158
+ - _target_: src.metrics.QuantizationMatchMetric
159
+ name: QuantizationMatch-2
160
+ codebook_index: 2
161
+ - _target_: src.metrics.PSNRMetric
162
+ name: PSNR
163
+ - _target_: src.metrics.SSIMMetric
164
+ name: SSIM
165
+ - _target_: src.metrics.GMSDMetric
166
+ name: GMSD
167
+ - _target_: src.metrics.MSEMetric
168
+ name: MSE
169
+ normalized: false
170
+ - _target_: src.metrics.MSEMetric
171
+ name: NormMSE
172
+ normalized: true
173
+ datasets:
174
+ train:
175
+ _target_: src.datasets.RandomDataset
176
+ part: train
177
+ roi_kwargs: ${reconstruction.roi_kwargs}
178
+ codec_name: ${codec.codec_name}
179
+ lensless_tag: measurement
180
+ instance_transforms: ${transforms.instance_transforms.train}
181
+ sim_psf_config: ${psf}
182
+ test:
183
+ _target_: src.datasets.LibrispeechDataset
184
+ limit: 1
185
+ max_audio_length: 3
186
+ part: test-clean
187
+ roi_kwargs: ${reconstruction.roi_kwargs}
188
+ codec_name: ${codec.codec_name}
189
+ lensless_tag: measurement
190
+ instance_transforms: ${transforms.instance_transforms.inference}
191
+ sim_psf_config: ${psf}
192
+ dataloader:
193
+ train:
194
+ _target_: torch.utils.data.DataLoader
195
+ batch_size: 1
196
+ num_workers: 2
197
+ pin_memory: true
198
+ inference:
199
+ _target_: torch.utils.data.DataLoader
200
+ batch_size: 1
201
+ num_workers: 2
202
+ pin_memory: true
203
+ transforms:
204
+ instance_transforms:
205
+ train:
206
+ all:
207
+ _target_: torchvision.transforms.v2.Compose
208
+ transforms:
209
+ - _target_: src.transforms.PadCrop
210
+ length: 4
211
+ pad_format: replicated
212
+ random_crop: true
213
+ inference: null
214
+ batch_transforms:
215
+ train: null
216
+ inference: null
217
+ codec:
218
+ _target_: src.transforms.CodecEncoderDecoder
219
+ codec_cls: ${resolve_class:dac.DAC}
220
+ codec_weights_path: data/dac_exps/${codec.codec_name}/latest/dac/weights.pth
221
+ codec_add_root_path: true
222
+ codec_kwargs: null
223
+ codec_name: 32x32_120_16khz_original
224
+ eval_mode: true
225
+ freeze_weights: true
226
+ reconstruction:
227
+ roi_kwargs:
228
+ top_left:
229
+ - 65
230
+ - 118
231
+ height: 256
232
+ width: 256
233
+ resize_coef: 8
234
+ normalize_lensless: true
235
+ corners_list: null
236
+ psf:
237
+ slm: adafruit
238
+ sensor: rpi_hq
239
+ downsample: 8
240
+ rotate: -0.8
241
+ vertical_shift: -20
242
+ horizontal_shift: -20
243
+ flipud: true
244
+ use_waveprop: true
245
+ deadspace: true
246
+ scene2mask: 0.3
247
+ mask2sensor: 0.004
248
+ grayscale: true
249
+ lr_scheduler:
250
+ _target_: torch.optim.lr_scheduler.ConstantLR
251
+ factor: 1
252
+ optimizer:
253
+ _target_: torch.optim.Adam
254
+ lr: 0.0001
255
+ loss_function:
256
+ _target_: src.loss.ReconstructionLoss
257
+ codec_mse_coef: 1
258
+ codec_ssim_coef: 1
259
+ codec_gmsd_coef: 0
260
+ raw_codec_ssim_coef: 1
261
+ raw_codec_l1_coef: 0
262
+ audio_l1_coef: 0
263
+ audio_sisdr_coef: 0
264
+ audio_stft_coef: 0
265
+ audio_mel_coef: 0
266
+ audio_stft_config:
267
+ window_lengths:
268
+ - 2048
269
+ - 512
270
+ audio_mel_config:
271
+ n_mels:
272
+ - 5
273
+ - 10
274
+ - 20
275
+ - 40
276
+ window_lengths:
277
+ - 32
278
+ - 64
279
+ - 128
280
+ - 256
281
+ mel_fmin:
282
+ - 0
283
+ - 0
284
+ - 0
285
+ - 0
286
+ mel_fmax:
287
+ - null
288
+ - null
289
+ - null
290
+ - null
291
+ pow: 1.0
292
+ clamp_eps: 1.0e-05
293
+ mag_weight: 0.0
294
+ resize_coef: ${reconstruction.resize_coef}
295
+ ssim_kernel: 7
296
+ ssim_sigma: 0.5
297
+ raw_ssim_kernel: 11
298
+ trainer:
299
+ log_step: 50
300
+ n_epochs: 100
301
+ epoch_len: 500
302
+ device_tensors:
303
+ - lensless_codec_video
304
+ - lensed_codec_video
305
+ - lensless_psf
306
+ - audio
307
+ resume_from: null
308
+ device: auto
309
+ override: true
310
+ monitor: max test_PSNR
311
+ save_period: 5
312
+ early_stop: ${trainer.n_epochs}
313
+ save_dir: saved
314
+ seed: 1