thecollabagepatch committed on
Commit
0c76d34
·
verified ·
1 Parent(s): d17561d

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ jerry_encoded_bs128_HARD_epoch=19-step=60 filter=lfs diff=lfs merge=lfs -text
37
+ jerry_encoded_bs128_HARD_epoch=6-step=20 filter=lfs diff=lfs merge=lfs -text
38
+ jerry_encoded_bs16_HARDER_epoch=4-step=150 filter=lfs diff=lfs merge=lfs -text
39
+ jerry_encoded_bs32_HARDER_epoch=6-step=100 filter=lfs diff=lfs merge=lfs -text
40
+ jerry_encoded_bs64_epoch=14-step=100 filter=lfs diff=lfs merge=lfs -text
41
+ jerry_encoded_epoch=33-step=100 filter=lfs diff=lfs merge=lfs -text
base_model_config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "diffusion_cond",
3
+ "sample_size": 524288,
4
+ "sample_rate": 44100,
5
+ "audio_channels": 2,
6
+ "model": {
7
+ "pretransform": {
8
+ "type": "autoencoder",
9
+ "iterate_batch": false,
10
+ "model_half": true,
11
+ "config": {
12
+ "encoder": {
13
+ "type": "oobleck",
14
+ "requires_grad": false,
15
+ "config": {
16
+ "in_channels": 2,
17
+ "channels": 128,
18
+ "c_mults": [1, 2, 4, 8, 16],
19
+ "strides": [2, 4, 4, 8, 8],
20
+ "latent_dim": 128,
21
+ "use_snake": true
22
+ }
23
+ },
24
+ "decoder": {
25
+ "type": "oobleck",
26
+ "config": {
27
+ "out_channels": 2,
28
+ "channels": 128,
29
+ "c_mults": [1, 2, 4, 8, 16],
30
+ "strides": [2, 4, 4, 8, 8],
31
+ "latent_dim": 64,
32
+ "use_snake": true,
33
+ "final_tanh": false
34
+ }
35
+ },
36
+ "bottleneck": { "type": "vae" },
37
+ "latent_dim": 64,
38
+ "downsampling_ratio": 2048,
39
+ "io_channels": 2
40
+ }
41
+ },
42
+ "conditioning": {
43
+ "configs": [
44
+ { "id": "prompt", "type": "t5", "config": { "t5_model_name": "t5-base", "max_length": 64 } },
45
+ { "id": "seconds_total", "type": "number", "config": { "min_val": 0, "max_val": 256 } }
46
+ ],
47
+ "cond_dim": 768
48
+ },
49
+ "diffusion": {
50
+ "cross_attention_cond_ids": ["prompt", "seconds_total"],
51
+ "global_cond_ids": ["seconds_total"],
52
+ "diffusion_objective": "rectified_flow",
53
+ "distribution_shift_options": { "min_length": 256, "max_length": 4096 },
54
+ "type": "dit",
55
+ "config": {
56
+ "io_channels": 64,
57
+ "embed_dim": 1024,
58
+ "depth": 16,
59
+ "num_heads": 8,
60
+ "cond_token_dim": 768,
61
+ "global_cond_dim": 768,
62
+ "transformer_type": "continuous_transformer",
63
+ "attn_kwargs": { "qk_norm": "ln" }
64
+ }
65
+ },
66
+ "io_channels": 64
67
+ },
68
+ "training": {
69
+ "use_ema": true,
70
+ "log_loss_info": false,
71
+ "pre_encoded": false,
72
+ "timestep_sampler": "trunc_logit_normal",
73
+ "optimizer_configs": {
74
+ "diffusion": {
75
+ "optimizer": {
76
+ "type": "AdamW",
77
+ "config": {
78
+ "lr": 5e-5,
79
+ "betas": [0.9, 0.95],
80
+ "eps": 1e-8,
81
+ "weight_decay": 0.01,
82
+ "foreach": true
83
+ }
84
+ },
85
+ "scheduler": {
86
+ "type": "InverseLR",
87
+ "config": { "inv_gamma": 500000, "power": 0.5, "warmup": 0.999 }
88
+ }
89
+ }
90
+ },
91
+ "demo": {
92
+ "demo_every": 2000,
93
+ "demo_steps": 50,
94
+ "num_demos": 8,
95
+ "demo_cond": [
96
+ {"prompt": "Amen break 174 BPM", "seconds_total": 6},
97
+ {"prompt": "People talking in a crowded cafe", "seconds_total": 10},
98
+ {"prompt": "chillhop 91 bpm", "seconds_total": 6},
99
+ {"prompt": "trap 120bpm", "seconds_total": 12},
100
+ {"prompt": "A dog barking next to a waterfall", "seconds_total": 6},
101
+ {"prompt": "Glitchy bass design, I used Serum for this", "seconds_total": 4},
102
+ {"prompt": "chillhop 132 bpm", "seconds_total": 12},
103
+ {"prompt": "Birds singing in the forest", "seconds_total": 10}
104
+ ],
105
+ "demo_cfg_scales": [1, 4, 7]
106
+ }
107
+ }
108
+ }
jerry_encoded_bs128_HARD_epoch=19-step=60 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f24f224bba10e77e0dbc9589e33675d18ce63fef9d282d099fd396de21f6ca85
3
+ size 5769867602
jerry_encoded_bs128_HARD_epoch=6-step=20 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac49bf8935c0d8c64cade8c154704d78ab2368f5642d2baf6aa8eef219ea5a2
3
+ size 5769867602
jerry_encoded_bs16_HARDER_epoch=4-step=150 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f114ff0f518a23a3dcdee67c6901b8ad342920993e0d68b3888f59701aa6dc
3
+ size 5769867602
jerry_encoded_bs32_HARDER_epoch=6-step=100 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e738227f15765d4bb6fb70b34b9eb3434d0a131d1de87ede0b065e229c66da1f
3
+ size 5769867602
jerry_encoded_bs64_epoch=14-step=100 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0e8689afe38c1df786348cc9f5c0b553286c8710839f6537e0d8a33026ff07a
3
+ size 5769867538
jerry_encoded_epoch=33-step=100 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef79774cf86ec8b30aa7036d7f8214985da32fafe570c8922dee720c16f9dd83
3
+ size 5769867538
jerry_un-encoded_epoch=32-step=2000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4248b6b2004ee918b7e0a40a6cc7958e2e74b50e263dabaffe868ddc1dc67dc
3
+ size 5769867538