jhcodec committed on
Commit
9863b52
·
1 Parent(s): fa31caf

add config and fix readme

Browse files
Files changed (2) hide show
  1. README.md +2 -1
  2. config.json +120 -0
README.md CHANGED
@@ -1,7 +1,8 @@
1
  ---
2
  license: mit
 
3
  ---
4
- # Model Card for SW2V
5
 
6
  *Reconstruct! Don't Encode: Self-Supervised Representation Reconstruction Loss for High-Intelligibility and Low-Latency Streaming Neural Audio Codec*
7
 
 
1
  ---
2
  license: mit
3
+ pipeline_tag: audio-classification
4
  ---
5
+ # Model Card for SW2V (60k)
6
 
7
  *Reconstruct! Don't Encode: Self-Supervised Representation Reconstruction Loss for High-Intelligibility and Low-Latency Streaming Neural Audio Codec*
8
 
config.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "codec": {
3
+ "mlp_in":{
4
+ "in_features": 320,
5
+ "hidden_features": 768,
6
+ "out_features": 1024,
7
+ "compute_dtype": "float32"
8
+ },
9
+ "mlp_out":{
10
+ "in_features": 1024,
11
+ "hidden_features": 768,
12
+ "out_features": 320,
13
+ "compute_dtype": "float32"
14
+ },
15
+ "decoder": {
16
+ "n_layers": 8,
17
+ "n_embd": 1024,
18
+ "n_hidden": 4096,
19
+ "n_heads": 16,
20
+ "head_dim": 64,
21
+ "compute_dtype": "float32",
22
+ "window_size": 15,
23
+ "dropout_rate": 0.1
24
+ },
25
+ "encoder": {
26
+ "n_layers": 8,
27
+ "n_embd": 1024,
28
+ "n_hidden": 4096,
29
+ "n_heads": 16,
30
+ "head_dim": 64,
31
+ "compute_dtype": "float32",
32
+ "window_size": 15,
33
+ "dropout_rate": 0.1
34
+ },
35
+ "rvq": {
36
+ "num_codebooks": 8,
37
+ "codebook_size": 1024,
38
+ "embedding_dim": 16,
39
+ "latent_dim": 16,
40
+ "updown_linears": false,
41
+ "codebook_weight_dtype": "float32"
42
+ }
43
+ },
44
+ "w2v":{
45
+ "mlp_in":{
46
+ "in_features": 320,
47
+ "hidden_features": 768,
48
+ "out_features": 1024,
49
+ "compute_dtype": "float32"
50
+ },
51
+ "encoder": {
52
+ "n_layers": 8,
53
+ "n_embd": 1024,
54
+ "n_hidden": 4096,
55
+ "n_heads": 16,
56
+ "head_dim": 64,
57
+ "compute_dtype": "float32",
58
+ "window_size": 15,
59
+ "dropout_rate": 0.1
60
+ },
61
+ "rvq": {
62
+ "num_codebooks": 8,
63
+ "codebook_size": 1024,
64
+ "embedding_dim": 1024,
65
+ "latent_dim": 1024,
66
+ "updown_linears": false,
67
+ "codebook_weight_dtype": "float32"
68
+ },
69
+ "training": {
70
+ "noise_masking": 0.1,
71
+ "noise_augmentation": 0.1
72
+ }
73
+ },
74
+ "training":{
75
+ "resume": false,
76
+ "loss_type": "cossim",
77
+ "strict_model": true,
78
+ "load_discriminator": false,
79
+ "learning_rate": 1e-4,
80
+ "weight_decay": 1e-2,
81
+ "discriminator_start_steps": 100,
82
+ "discriminator_segment_duration": 1.28,
83
+ "apply_apa": true,
84
+ "warmup_steps": 1000,
85
+ "min_lr": 1e-6,
86
+ "num_epochs": 100000,
87
+ "use_continuous": 0.1,
88
+ "max_grad_norm": 1000.0,
89
+ "batch_size": 300,
90
+ "gradient_accumulation_steps": 1,
91
+ "num_workers": 6,
92
+ "use_phaseaug": true,
93
+ "init_dataset": false,
94
+ "profile": false,
95
+ "verbose_grad_norm": false,
96
+ "verbose_norm_threshold_max": 5.0,
97
+ "verbose_norm_threshold_min": 0.001,
98
+ "verbose_paramter_norm": false,
99
+ "use_discriminator": false,
100
+ "codebook_reset_interval": 1000
101
+ },
102
+ "loss":{
103
+ "recon_loss_weight": 1
104
+ },
105
+ "data": {
106
+ "audio_dir": "/data",
107
+ "sample_rate": 16000,
108
+ "segment_duration": 10.24,
109
+ "cache_dir": "/data/dataloader/v9"
110
+ },
111
+ "logging": {
112
+ "log_interval": 100,
113
+ "save_interval": 500,
114
+ "eval_interval": 3000,
115
+ "experiment_dir": "/data/jhcodec/sw2v/{experiment_name}",
116
+ "checkpoint_dir": "/data/jhcodec/sw2v/{experiment_name}/checkpoints",
117
+ "tensorboard_dir": "/data/jhcodec/sw2v/{experiment_name}/tensorboard",
118
+ "n_samples": 3
119
+ }
120
+ }