xuewu.lin committed on
Commit
51da6a1
·
1 Parent(s): 515272c
Files changed (4) hide show
  1. README.md +2 -0
  2. model.config.json +650 -0
  3. model.safetensors +3 -0
  4. training_log.txt +0 -0
README.md CHANGED
@@ -1,3 +1,5 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+
5
+ This is the single-task model for the RoboTwin "block_stack_three" task; it uses Grounding-DINO-Tiny as the base model.
model.config.json ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
3
+ "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
4
+ "backbone": {
5
+ "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
6
+ "embed_dims": 96,
7
+ "depths": [
8
+ 2,
9
+ 2,
10
+ 6,
11
+ 2
12
+ ],
13
+ "num_heads": [
14
+ 3,
15
+ 6,
16
+ 12,
17
+ 24
18
+ ],
19
+ "window_size": 7,
20
+ "mlp_ratio": 4,
21
+ "qkv_bias": true,
22
+ "qk_scale": null,
23
+ "drop_rate": 0.0,
24
+ "attn_drop_rate": 0.0,
25
+ "out_indices": [
26
+ 1,
27
+ 2,
28
+ 3
29
+ ],
30
+ "with_cp": true,
31
+ "convert_weights": false
32
+ },
33
+ "decoder": {
34
+ "type": "robo_orchard_lab.models.sem_modules.action_decoder:SEMActionDecoder",
35
+ "img_cross_attn": {
36
+ "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
37
+ "embed_dims": 256,
38
+ "num_heads": 8,
39
+ "max_position_embeddings": 32
40
+ },
41
+ "norm_layer": {
42
+ "type": "torch.nn.modules.normalization:RMSNorm",
43
+ "normalized_shape": 256
44
+ },
45
+ "ffn": {
46
+ "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
47
+ "embed_dims": 256,
48
+ "feedforward_channels": 2048,
49
+ "act_cfg": {
50
+ "type": "torch.nn.modules.activation:SiLU",
51
+ "inplace": true
52
+ }
53
+ },
54
+ "head": {
55
+ "type": "robo_orchard_lab.models.sem_modules.layers:UpsampleHead",
56
+ "upsample_sizes": [
57
+ 16,
58
+ 32,
59
+ 64
60
+ ],
61
+ "input_dim": 256,
62
+ "dims": [
63
+ 128,
64
+ 64,
65
+ 8
66
+ ],
67
+ "norm": {
68
+ "type": "torch.nn.modules.normalization:RMSNorm",
69
+ "normalized_shape": 256
70
+ },
71
+ "act": {
72
+ "type": "torch.nn.modules.activation:SiLU",
73
+ "inplace": true
74
+ },
75
+ "norm_act_idx": [
76
+ 0,
77
+ 1,
78
+ 2
79
+ ]
80
+ },
81
+ "training_noise_scheduler": {
82
+ "type": "diffusers.schedulers.scheduling_ddpm:DDPMScheduler",
83
+ "num_train_timesteps": 1000,
84
+ "beta_schedule": "squaredcos_cap_v2",
85
+ "prediction_type": "sample",
86
+ "clip_sample": false
87
+ },
88
+ "test_noise_scheduler": {
89
+ "type": "diffusers.schedulers.scheduling_dpmsolver_multistep:DPMSolverMultistepScheduler",
90
+ "num_train_timesteps": 1000,
91
+ "beta_schedule": "squaredcos_cap_v2",
92
+ "prediction_type": "sample"
93
+ },
94
+ "num_inference_timesteps": 10,
95
+ "joint_self_attn": {
96
+ "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
97
+ "embed_dims": 256,
98
+ "num_heads": 8
99
+ },
100
+ "temp_cross_attn": {
101
+ "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
102
+ "embed_dims": 256,
103
+ "num_heads": 8,
104
+ "max_position_embeddings": 32
105
+ },
106
+ "text_cross_attn": {
107
+ "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
108
+ "embed_dims": 256,
109
+ "num_heads": 8,
110
+ "max_position_embeddings": 256
111
+ },
112
+ "pred_steps": 64,
113
+ "timestep_norm_layer": {
114
+ "type": "robo_orchard_lab.models.sem_modules.layers:AdaRMSNorm",
115
+ "normalized_shape": 256,
116
+ "condition_dims": 256,
117
+ "zero": true
118
+ },
119
+ "operation_order": [
120
+ "t_norm",
121
+ "joint_self_attn",
122
+ "gate_msa",
123
+ "norm",
124
+ "temp_cross_attn",
125
+ "norm",
126
+ "img_cross_attn",
127
+ "norm",
128
+ null,
129
+ null,
130
+ "scale_shift",
131
+ "ffn",
132
+ "gate_mlp",
133
+ "t_norm",
134
+ "joint_self_attn",
135
+ "gate_msa",
136
+ "norm",
137
+ "temp_cross_attn",
138
+ "norm",
139
+ "img_cross_attn",
140
+ "norm",
141
+ null,
142
+ null,
143
+ "scale_shift",
144
+ "ffn",
145
+ "gate_mlp",
146
+ "t_norm",
147
+ "joint_self_attn",
148
+ "gate_msa",
149
+ "norm",
150
+ "temp_cross_attn",
151
+ "norm",
152
+ "img_cross_attn",
153
+ "norm",
154
+ null,
155
+ null,
156
+ "scale_shift",
157
+ "ffn",
158
+ "gate_mlp",
159
+ "t_norm",
160
+ "joint_self_attn",
161
+ "gate_msa",
162
+ "norm",
163
+ "temp_cross_attn",
164
+ "norm",
165
+ "img_cross_attn",
166
+ "norm",
167
+ null,
168
+ null,
169
+ "scale_shift",
170
+ "ffn",
171
+ "gate_mlp",
172
+ "t_norm",
173
+ "joint_self_attn",
174
+ "gate_msa",
175
+ "norm",
176
+ "temp_cross_attn",
177
+ "norm",
178
+ "img_cross_attn",
179
+ "norm",
180
+ null,
181
+ null,
182
+ "scale_shift",
183
+ "ffn",
184
+ "gate_mlp",
185
+ "t_norm",
186
+ "joint_self_attn",
187
+ "gate_msa",
188
+ "norm",
189
+ "temp_cross_attn",
190
+ "norm",
191
+ "img_cross_attn",
192
+ "norm",
193
+ null,
194
+ null,
195
+ "scale_shift",
196
+ "ffn",
197
+ "gate_mlp"
198
+ ],
199
+ "feature_level": [
200
+ 1,
201
+ 2
202
+ ],
203
+ "act_cfg": {
204
+ "type": "torch.nn.modules.activation:SiLU",
205
+ "inplace": true
206
+ },
207
+ "robot_encoder": {
208
+ "type": "robo_orchard_lab.models.sem_modules.robot_state_encoder:SEMRobotStateEncoder",
209
+ "embed_dims": 256,
210
+ "chunk_size": 1,
211
+ "joint_self_attn": {
212
+ "type": "robo_orchard_lab.models.sem_modules.layers:JointGraphAttention",
213
+ "embed_dims": 256,
214
+ "num_heads": 8
215
+ },
216
+ "norm_layer": {
217
+ "type": "torch.nn.modules.normalization:RMSNorm",
218
+ "normalized_shape": 256
219
+ },
220
+ "ffn": {
221
+ "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
222
+ "embed_dims": 256,
223
+ "feedforward_channels": 2048,
224
+ "act_cfg": {
225
+ "type": "torch.nn.modules.activation:SiLU",
226
+ "inplace": true
227
+ }
228
+ },
229
+ "temp_self_attn": {
230
+ "type": "robo_orchard_lab.models.sem_modules.layers:RotaryAttention",
231
+ "embed_dims": 256,
232
+ "num_heads": 8,
233
+ "max_position_embeddings": 32
234
+ },
235
+ "act_cfg": {
236
+ "type": "torch.nn.modules.activation:SiLU",
237
+ "inplace": true
238
+ },
239
+ "operation_order": [
240
+ "norm",
241
+ "joint_self_attn",
242
+ null,
243
+ null,
244
+ "norm",
245
+ "ffn",
246
+ "norm",
247
+ "joint_self_attn",
248
+ null,
249
+ null,
250
+ "norm",
251
+ "ffn",
252
+ "norm",
253
+ "joint_self_attn",
254
+ null,
255
+ null,
256
+ "norm",
257
+ "ffn",
258
+ "norm",
259
+ "joint_self_attn",
260
+ null,
261
+ null,
262
+ "norm",
263
+ "ffn",
264
+ "norm"
265
+ ],
266
+ "state_dims": 8
267
+ },
268
+ "state_loss_weights": [
269
+ [
270
+ 1.0,
271
+ 1.0,
272
+ 1.0,
273
+ 1.0,
274
+ 0.1,
275
+ 0.1,
276
+ 0.1,
277
+ 0.1
278
+ ],
279
+ [
280
+ 1.0,
281
+ 1.0,
282
+ 1.0,
283
+ 1.0,
284
+ 0.1,
285
+ 0.1,
286
+ 0.1,
287
+ 0.1
288
+ ],
289
+ [
290
+ 1.0,
291
+ 1.0,
292
+ 1.0,
293
+ 1.0,
294
+ 0.1,
295
+ 0.1,
296
+ 0.1,
297
+ 0.1
298
+ ],
299
+ [
300
+ 1.0,
301
+ 1.0,
302
+ 1.0,
303
+ 1.0,
304
+ 0.1,
305
+ 0.1,
306
+ 0.1,
307
+ 0.1
308
+ ],
309
+ [
310
+ 1.0,
311
+ 1.0,
312
+ 1.0,
313
+ 1.0,
314
+ 0.1,
315
+ 0.1,
316
+ 0.1,
317
+ 0.1
318
+ ],
319
+ [
320
+ 1.0,
321
+ 1.0,
322
+ 1.0,
323
+ 1.0,
324
+ 0.1,
325
+ 0.1,
326
+ 0.1,
327
+ 0.1
328
+ ],
329
+ [
330
+ 1.0,
331
+ 2.0,
332
+ 2.0,
333
+ 2.0,
334
+ 0.2,
335
+ 0.2,
336
+ 0.2,
337
+ 0.2
338
+ ],
339
+ [
340
+ 1.0,
341
+ 1.0,
342
+ 1.0,
343
+ 1.0,
344
+ 0.1,
345
+ 0.1,
346
+ 0.1,
347
+ 0.1
348
+ ],
349
+ [
350
+ 1.0,
351
+ 1.0,
352
+ 1.0,
353
+ 1.0,
354
+ 0.1,
355
+ 0.1,
356
+ 0.1,
357
+ 0.1
358
+ ],
359
+ [
360
+ 1.0,
361
+ 1.0,
362
+ 1.0,
363
+ 1.0,
364
+ 0.1,
365
+ 0.1,
366
+ 0.1,
367
+ 0.1
368
+ ],
369
+ [
370
+ 1.0,
371
+ 1.0,
372
+ 1.0,
373
+ 1.0,
374
+ 0.1,
375
+ 0.1,
376
+ 0.1,
377
+ 0.1
378
+ ],
379
+ [
380
+ 1.0,
381
+ 1.0,
382
+ 1.0,
383
+ 1.0,
384
+ 0.1,
385
+ 0.1,
386
+ 0.1,
387
+ 0.1
388
+ ],
389
+ [
390
+ 1.0,
391
+ 1.0,
392
+ 1.0,
393
+ 1.0,
394
+ 0.1,
395
+ 0.1,
396
+ 0.1,
397
+ 0.1
398
+ ],
399
+ [
400
+ 1.0,
401
+ 2.0,
402
+ 2.0,
403
+ 2.0,
404
+ 0.2,
405
+ 0.2,
406
+ 0.2,
407
+ 0.2
408
+ ]
409
+ ],
410
+ "fk_loss_weight": [
411
+ [
412
+ 1.0,
413
+ 1.0,
414
+ 1.0,
415
+ 1.0,
416
+ 0.1,
417
+ 0.1,
418
+ 0.1,
419
+ 0.1
420
+ ],
421
+ [
422
+ 1.0,
423
+ 1.0,
424
+ 1.0,
425
+ 1.0,
426
+ 0.1,
427
+ 0.1,
428
+ 0.1,
429
+ 0.1
430
+ ],
431
+ [
432
+ 1.0,
433
+ 1.0,
434
+ 1.0,
435
+ 1.0,
436
+ 0.1,
437
+ 0.1,
438
+ 0.1,
439
+ 0.1
440
+ ],
441
+ [
442
+ 1.0,
443
+ 1.0,
444
+ 1.0,
445
+ 1.0,
446
+ 0.1,
447
+ 0.1,
448
+ 0.1,
449
+ 0.1
450
+ ],
451
+ [
452
+ 1.0,
453
+ 1.0,
454
+ 1.0,
455
+ 1.0,
456
+ 0.1,
457
+ 0.1,
458
+ 0.1,
459
+ 0.1
460
+ ],
461
+ [
462
+ 1.0,
463
+ 1.0,
464
+ 1.0,
465
+ 1.0,
466
+ 0.1,
467
+ 0.1,
468
+ 0.1,
469
+ 0.1
470
+ ],
471
+ [
472
+ 1.0,
473
+ 2.0,
474
+ 2.0,
475
+ 2.0,
476
+ 0.2,
477
+ 0.2,
478
+ 0.2,
479
+ 0.2
480
+ ],
481
+ [
482
+ 1.0,
483
+ 1.0,
484
+ 1.0,
485
+ 1.0,
486
+ 0.1,
487
+ 0.1,
488
+ 0.1,
489
+ 0.1
490
+ ],
491
+ [
492
+ 1.0,
493
+ 1.0,
494
+ 1.0,
495
+ 1.0,
496
+ 0.1,
497
+ 0.1,
498
+ 0.1,
499
+ 0.1
500
+ ],
501
+ [
502
+ 1.0,
503
+ 1.0,
504
+ 1.0,
505
+ 1.0,
506
+ 0.1,
507
+ 0.1,
508
+ 0.1,
509
+ 0.1
510
+ ],
511
+ [
512
+ 1.0,
513
+ 1.0,
514
+ 1.0,
515
+ 1.0,
516
+ 0.1,
517
+ 0.1,
518
+ 0.1,
519
+ 0.1
520
+ ],
521
+ [
522
+ 1.0,
523
+ 1.0,
524
+ 1.0,
525
+ 1.0,
526
+ 0.1,
527
+ 0.1,
528
+ 0.1,
529
+ 0.1
530
+ ],
531
+ [
532
+ 1.0,
533
+ 1.0,
534
+ 1.0,
535
+ 1.0,
536
+ 0.1,
537
+ 0.1,
538
+ 0.1,
539
+ 0.1
540
+ ],
541
+ [
542
+ 1.0,
543
+ 2.0,
544
+ 2.0,
545
+ 2.0,
546
+ 0.2,
547
+ 0.2,
548
+ 0.2,
549
+ 0.2
550
+ ]
551
+ ],
552
+ "state_dims": 8
553
+ },
554
+ "neck": {
555
+ "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
556
+ "in_channels": [
557
+ 192,
558
+ 384,
559
+ 768
560
+ ],
561
+ "kernel_size": 1,
562
+ "out_channels": 256,
563
+ "act_cfg": null,
564
+ "bias": true,
565
+ "norm_cfg": {
566
+ "type": "torch.nn.modules.normalization:GroupNorm",
567
+ "num_groups": 32
568
+ },
569
+ "num_outs": 3
570
+ },
571
+ "text_encoder": null,
572
+ "feature_enhancer": null,
573
+ "spatial_enhancer": {
574
+ "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
575
+ "embed_dims": 256,
576
+ "feature_3d_dim": 32,
577
+ "num_depth_layers": 2,
578
+ "min_depth": 0.01,
579
+ "max_depth": 1.2,
580
+ "num_depth": 128,
581
+ "with_feature_3d": true,
582
+ "loss_depth_weight": 1.0
583
+ },
584
+ "data_preprocessor": {
585
+ "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
586
+ "mean": [
587
+ 123.675,
588
+ 116.28,
589
+ 103.53
590
+ ],
591
+ "std": [
592
+ 58.395,
593
+ 57.12,
594
+ 57.375
595
+ ],
596
+ "channel_flip": false,
597
+ "unsqueeze_depth_channel": true,
598
+ "batch_transforms": [
599
+ {
600
+ "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
601
+ "min_depth": 0.01,
602
+ "max_depth": 1.2,
603
+ "num_depth": 128,
604
+ "origin_stride": 2,
605
+ "valid_threshold": 0.5,
606
+ "stride": [
607
+ 8,
608
+ 16,
609
+ 32
610
+ ]
611
+ }
612
+ ]
613
+ },
614
+ "backbone_3d": {
615
+ "type": "robo_orchard_lab.models.modules.resnet:ResNet",
616
+ "depth": 34,
617
+ "in_channels": 1,
618
+ "base_channels": 4,
619
+ "num_stages": 4,
620
+ "out_indices": [
621
+ 1,
622
+ 2,
623
+ 3
624
+ ],
625
+ "bn_eval": true,
626
+ "with_cp": true,
627
+ "style": "pytorch"
628
+ },
629
+ "neck_3d": {
630
+ "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
631
+ "in_channels": [
632
+ 8,
633
+ 16,
634
+ 32
635
+ ],
636
+ "kernel_size": 1,
637
+ "out_channels": 32,
638
+ "act_cfg": null,
639
+ "bias": true,
640
+ "norm_cfg": {
641
+ "type": "torch.nn.modules.normalization:GroupNorm",
642
+ "num_groups": 4
643
+ },
644
+ "num_outs": 3
645
+ },
646
+ "input_2d": "imgs",
647
+ "input_3d": "depths",
648
+ "embed_dims": 256,
649
+ "pre_spatial_enhancer": false
650
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d5372e1ff59d339a7da28a6352f96dcfb3b0f4fed558f6371af8bfa88ea4e29
3
+ size 198329632
training_log.txt ADDED
The diff for this file is too large to render. See raw diff