omniaudio committed on
Commit
b8beb11
·
verified ·
1 Parent(s): e921b7c

Upload v2a_foa.json

Browse files
Files changed (1) hide show
  1. v2a_foa.json +120 -0
v2a_foa.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 441000,
4
+ "sample_rate": 44100,
5
+ "audio_channels": 4,
6
+ "model": {
7
+ "pretransform": {
8
+ "type": "autoencoder",
9
+ "iterate_batch": true,
10
+ "config": {
11
+ "encoder": {
12
+ "type": "oobleck",
13
+ "config": {
14
+ "in_channels": 4,
15
+ "channels": 128,
16
+ "c_mults": [1, 2, 4, 8, 16],
17
+ "strides": [2, 4, 4, 8, 8],
18
+ "latent_dim": 128,
19
+ "use_snake": true
20
+ }
21
+ },
22
+ "decoder": {
23
+ "type": "oobleck",
24
+ "config": {
25
+ "out_channels": 4,
26
+ "channels": 128,
27
+ "c_mults": [1, 2, 4, 8, 16],
28
+ "strides": [2, 4, 4, 8, 8],
29
+ "latent_dim": 64,
30
+ "use_snake": true,
31
+ "final_tanh": false
32
+ }
33
+ },
34
+ "bottleneck": {
35
+ "type": "vae"
36
+ },
37
+ "latent_dim": 64,
38
+ "downsampling_ratio": 2048,
39
+ "io_channels": 4
40
+ }
41
+ },
42
+ "conditioning": {
43
+ "configs": [
44
+ {
45
+ "id": "video_fov",
46
+ "type": "video_linear",
47
+ "config": {
48
+ "dim": 1024,
49
+ "output_dim": 1536
50
+ }
51
+ },
52
+ {
53
+ "id": "video_360",
54
+ "type": "video_global",
55
+ "config": {
56
+ "dim": 1024,
57
+ "output_dim": 1536
58
+ }
59
+ }
60
+ ],
61
+ "cond_dim": 768
62
+ },
63
+ "diffusion": {
64
+ "global_cond_ids": ["video_360"],
65
+ "add_cond_ids": ["video_fov"],
66
+ "type": "dit",
67
+ "diffusion_objective": "rectified_flow",
68
+ "config": {
69
+ "io_channels": 64,
70
+ "embed_dim": 1536,
71
+ "depth": 24,
72
+ "num_heads": 24,
73
+ "cond_token_dim": 768,
74
+ "global_cond_dim": 1536,
75
+ "project_cond_tokens": false,
76
+ "transformer_type": "continuous_transformer"
77
+ }
78
+ },
79
+ "io_channels": 64
80
+ },
81
+ "training": {
82
+ "use_ema": true,
83
+ "log_loss_info": false,
84
+ "cfg_dropout_prob": 0.2,
85
+ "optimizer_configs": {
86
+ "diffusion": {
87
+ "optimizer": {
88
+ "type": "AdamW",
89
+ "config": {
90
+ "lr": 5e-5,
91
+ "betas": [0.9, 0.999],
92
+ "weight_decay": 1e-3
93
+ }
94
+ },
95
+ "scheduler": {
96
+ "type": "InverseLR",
97
+ "config": {
98
+ "inv_gamma": 1000000,
99
+ "power": 0.5,
100
+ "warmup": 0.99
101
+ }
102
+ }
103
+ }
104
+ },
105
+ "demo": {
106
+ "demo_every": 2000,
107
+ "demo_steps": 64,
108
+ "num_demos": 6,
109
+ "demo_cond": [
110
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/dS7Ffvs2Evgl_4.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/dS7Ffvs2Evgl_4.npy"},
111
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/G8pABGosD38l_17.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/G8pABGosD38l_17.npy"},
112
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/NdE7uYVaynQl_0.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/NdE7uYVaynQl_0.npy"},
113
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/dS7Ffvs2Evgl_4.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/dS7Ffvs2Evgl_4.npy"},
114
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/tAiVUt5vE34l_30.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/tAiVUt5vE34l_30.npy"},
115
+ {"video_360": "dataset/foa/demos/metaclip-huge-eq/0B7ds6NmVBQl_80.npy", "video_fov": "dataset/foa/demos/metaclip-huge-front/0B7ds6NmVBQl_80.npy"}
116
+ ],
117
+ "demo_cfg_scales": [3,6,9]
118
+ }
119
+ }
120
+ }