ZDisket commited on
Commit
7fa3276
·
verified ·
1 Parent(s): f68a8ef

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ brontes_00840000 filter=lfs diff=lfs merge=lfs -text
brontes_00840000 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dabee31e427555b53757c8626a6687740e6f8682a8d0461eb238e6d0d8a219ce
3
+ size 864451583
config.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for TransAudio model with ~30M parameters for 44.1kHz audio
2
+ #
3
+ # This configuration uses STFT preprocessing with U-Net architecture for audio processing
4
+ # STFT parameters are optimized for 44.1kHz audio to balance frequency and temporal resolution
5
+ # UNet parameters are tuned to achieve approximately 30M total parameters
6
+
7
+ model:
8
+ # STFT Configuration for 44.1kHz audio processing
9
+ stft_config:
10
+ n_fft: 16 # FFT size - provides good frequency resolution for 44.1kHz
11
+ hop_length: 8 # Hop size - 1/4 of n_fft for good temporal resolution
12
+ win_length: 16 # Window length - same as n_fft for standard hann window
13
+
14
+ # UNet Configuration targeting ~30M parameters
15
+ unet_config:
16
+ in_ch: null # Will be automatically calculated as n_audio_channels
17
+ out_ch: null # Will be automatically calculated as n_audio_channels
18
+ base_ch: 64 # Base channel count to keep parameter count in check
19
+ depth: 6 # Depth of U-Net (with downsampling/upsampling)
20
+ ch_mults: [2, 3, 4, 6, 6, 8]
21
+ k: 8 # Kernel size for convolutions
22
+ decoder_k: null
23
+ stride: 4
24
+ norm: 'weight' # Normalization type: group norm for stability
25
+ act: 'snake'
26
+ separable: false # Use standard convolutions rather than depthwise separable
27
+ use_deconv: true # Use transposed convolutions for upsampling
28
+ bottleneck_dilations: [1, 2, 4, 8] # Dilated convolutions in bottleneck
29
+ learnable_alpha: false # Learnable residual scaling parameter
30
+ alpha_init: 1.0 # Initial value for residual scaling
31
+ use_lstm_bottleneck: true
32
+ lstm_layers: 2
33
+ skip_layer_indexes: [-1, -2]
34
+ skip_residual_scales: [1.0, 0.1]
35
+ # iSTFT output head configuration (iSTFTNet-style synthesis)
36
+ use_istft_head: false # Enable iSTFT output head instead of direct waveform
37
+ istft_n_fft: 32 # FFT size for iSTFT synthesis
38
+ istft_hop_length: 16 # Hop length for iSTFT
39
+ istft_win_length: null # Window length (null = same as n_fft)
40
+ phase_eps: 1.0e-8 # Epsilon for safe atan2 phase recovery
41
+
42
+
43
+ # Dataset configuration
44
+ dataset:
45
+ sample_rate: 48000 # Target sample rate for audio
46
+ chunk_size: 16384 # Audio chunk size in samples (about 0.34 s at the 48 kHz sample rate above; 0.37 s at 44.1 kHz)
47
+ mono: true # Convert to mono
48
+ normalize: true # Normalize to [-1, 1]
49
+ file_extensions: [".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg"]
50
+ cache_dir: "./audio_cache" # Directory to cache resampled files
51
+ min_samples: 12000 # Minimum number of samples required for a file to be included
52
+
53
+ # Training Configuration
54
+ training:
55
+ # Basic training parameters
56
+ batch_size: 16
57
+ num_epochs: 9999
58
+ learning_rate: 0.0001 # Keep at 1e-4 as in original
59
+ discriminator_lr_multiplier: 1.0 # Multiplier on the generator LR for the discriminator (1.0 = same LR as G)
60
+ lr_warmup_steps: 2000 # Linear learning rate warmup over 2000 steps
61
+ adam_b1: 0.8 # Adam beta1, lowered from the usual 0.9 default for GAN training stability
62
+ adam_b2: 0.99 # Adam beta2
63
+ lr_decay: 0.999 # Learning rate decay per epoch
64
+ seed: 1234 # Random seed for reproducibility
65
+ fp16_run: false # Use mixed precision training (FP16)
66
+ bf16_run: true # Use BF16 training (mutually exclusive with fp16_run)
67
+ gradient_clip: 1.5 # Default gradient clipping value (0.0 to disable)
68
+ generator_gradient_clip: 1.0 # Generator gradient clip (defaults to gradient_clip if not set)
69
+ discriminator_gradient_clip: 4.0 # Discriminator gradient clip (defaults to gradient_clip if not set)
70
+ disc_loss_type: "hinge"
71
+
72
+
73
+ # Adversarial training parameters
74
+ use_adversarial: true # Enable adversarial training
75
+ pretrain_steps: 10000 # Number of steps to pretrain generator before adversarial training
76
+ pretrain_reset: true # Reset generator optimizer when switching to adversarial training
77
+ use_se_blocks: false # Enable Squeeze-and-Excitation blocks in discriminators
78
+ enable_mpd: true # Enable Multi-Period Discriminator
79
+ enable_msd: false # Enable Multi-Scale Discriminator
80
+ enable_mbsd: true # Enable Multi-Band Spectral Discriminators
81
+ feature_matching_weight: 1.5 # Weight for feature matching loss
82
+ disc_instance_noise_std: 0 # Gaussian noise std added to D inputs (prevents D overpowering, 0 to disable)
83
+ gen_s_weight: 1.0 # Weight for multi-scale generator loss
84
+ gen_f_weight: 1.0 # Weight for multi-period generator loss
85
+ disc_loss_weight: 1.0 # Weight for discriminator loss
86
+
87
+ # MultiBandSpec Discriminator parameters (part of unified Discriminator)
88
+ mbsd_window_lengths: [2048, 1024, 512] # Window lengths for each MBSD instance
89
+ mbsd_hop_factor: 0.25 # Hop factor as fraction of window length
90
+
91
+
92
+ # Audio loss parameters (adapted for 44.1kHz)
93
+ sampling_rate: 48000 # 48 kHz — keep in sync with dataset.sample_rate above
94
+ n_fft: 2048 # Increased for 44.1kHz audio
95
+ win_size: 2048 # Window size matching n_fft
96
+ hop_size: 512 # Hop size - 1/4 of window size
97
+ num_mels: 80 # Number of mel bands
98
+ fmin: 0.0 # Minimum frequency for mel
99
+ fmax_for_loss: 22050.0 # Max frequency for loss (22.05 kHz = Nyquist of 44.1 kHz; Nyquist at the configured 48 kHz is 24 kHz — TODO confirm intended value)
100
+
101
+ # Mel loss weight
102
+ mel_loss_weight: 15.0
103
+
104
+ # Multi-scale mel loss parameters
105
+ use_multi_scale_mel_loss: true
106
+ multi_scale_mel_win_lengths: [512, 1024, 2048]
107
+ multi_scale_mel_n_mels: [40, 80, 128]
108
+ multi_scale_mel_hop_divisor: 4
109
+ multi_scale_mel_loss_mode: "charbonnier"
110
+ multi_scale_mel_log_eps: 0.00001
111
+ multi_scale_mel_l2_weight: 1.0
112
+ multi_scale_mel_charbonnier_eps: 0.000001
113
+ multi_scale_mel_f_min: 0.0
114
+ multi_scale_mel_f_max: null
115
+ multi_scale_mel_power: 1.0
116
+ multi_scale_mel_scale: "htk"
117
+ multi_scale_mel_norm: null
118
+ multi_scale_mel_clamp_min: null
119
+
120
+ # MR-STFT loss parameters (updated for 44.1kHz)
121
+ use_mr_stft_loss: false
122
+ mr_stft_n_ffts: [1024, 512, 256, 128] # Updated for 44.1kHz
123
+ mr_stft_hop_sizes: [256, 128, 64, 32] # Updated for 44.1kHz
124
+ mr_stft_win_sizes: [1024, 512, 256, 128] # Updated for 44.1kHz
125
+ mr_stft_use_charbonnier: true
126
+ mr_stft_charbonnier_eps: 0.000001
127
+ mr_stft_loss_weight: 1.0
128
+
129
+ # Waveform-domain loss parameters
130
+ use_waveform_loss: false # Enable direct waveform loss
131
+ waveform_loss_type: "mae" # Loss type: "mse", "mae", or "charbonnier"
132
+ waveform_loss_weight: 1.0 # Weight for waveform loss
133
+ waveform_loss_charbonnier_eps: 0.000001 # Epsilon for Charbonnier loss
134
+
135
+ # Pitch loss parameters
136
+ use_pitch_loss: true
137
+ pitch_loss_use_activation_loss: false
138
+ pitch_loss_act_weight: 0.1
139
+ pitch_loss_use_charbonnier: false
140
+ pitch_loss_charbonnier_eps: 0.000001
141
+ pitch_loss_tau: 0.7
142
+ pitch_loss_wmin: 0.15
143
+ pitch_loss_conf_clip_min: 0.05
144
+ pitch_loss_conf_clip_max: 0.95
145
+ pitch_loss_vuv_thresh: 0.5
146
+ pitch_loss_weight: 2.0
147
+ pitch_loss_model: "mir-1k_g7"
148
+ pitch_loss_step_size: 20.0
149
+
150
+
151
+
152
+ # Loss configuration (specific to STFT domain processing)
153
+ loss:
154
+ # Add different loss functions as needed
155
+ mse_weight: 1.0
156
+ recon_weight: 2.0
157
+ log_mag_weight: 1.0
158
+ cos_phase_weight: 0.5
159
+ sin_phase_weight: 0.5
160
+ # Could add other losses like:
161
+ # stft_loss_weight: 0.5
162
+ # perceptual_loss_weight: 0.1
163
+
164
+ # Data loading
165
+ num_workers: 4
166
+ shuffle: true
167
+ pin_memory: true
168
+
169
+ # Checkpointing
170
+ checkpoint_interval: 20000 # Save checkpoint every N steps
171
+ validation_interval: 5000 # Run validation every N steps
172
+ save_best_only: true # Only save checkpoints when validation loss improves
173
+
174
+ # Logging
175
+ log_interval: 100 # Log training progress every N steps
176
+ tensorboard_log_dir: "./logs/transaudio_44khz"
177
+
178
+ # Early stopping
179
+ early_stopping_patience: 10 # Stop if validation loss doesn't improve for N validations
180
+ early_stopping_min_delta: 0.001 # Minimum change to qualify as improvement
181
+
182
+ # Hardware configuration
183
+ hardware:
184
+ num_gpus: 1 # Number of GPUs to use (0 for CPU only)
185
+ cuda_visible_devices: "0" # Which GPUs to use (comma separated)
186
+
187
+ # Paths
188
+ paths:
189
+ audio_dir: "./audio_files" # Directory containing training audio
190
+ checkpoint_dir: "./checkpoints/transaudio_44khz" # Directory to save checkpoints
191
+ output_dir: "./outputs" # Directory for output files
192
+ log_dir: "./logs/transaudio_44khz" # Directory for logs
193
+
194
+ # Validation configuration
195
+ validation:
196
+ batch_size: 2 # Reduced for 44.1kHz audio processing
197
+ num_workers: 2
198
+
199
+ # Inference configuration
200
+ inference:
201
+ chunk_size: 16384 # Increased for 44.1kHz audio
202
+ overlap: 8192 # Overlap between chunks for seamless reconstruction (1/2 of chunk)
203
+ batch_size: 1 # Usually 1 for inference
204
+
205
+ # Architecture notes (NOTE(review): stale — the figures below assume n_fft=2048, depth=5, base_ch=96, but this config sets stft_config.n_fft=16, depth=6, base_ch=64; recompute before relying on them):
206
+ # Input channels after STFT: 3 * (2048//2 + 1) = 3 * 1025 = 3075
207
+ # With depth=5 and base_ch=96, the channel progression is:
208
+ # Encoder: 3075 -> 96 -> 192 -> 384 -> 768 -> 1536 -> 3072
209
+ # Bottleneck: 3072 -> 3072 with dilated convolutions
210
+ # Decoder: 3072 -> 1536 -> 768 -> 384 -> 192 -> 96 -> 3075
211
+ # This configuration should provide approximately 30M parameters while being more manageable
torchscript/exported_cpu.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:685c0475651cb1c67dd6d63a2349f60fe1e18bf255409c020c727c96d79a10ff
3
+ size 117855169
torchscript/exported_gpu.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b45a17f4de3cd9c9e4f1f3f31b6b52a4f079d2d2cade65790fa1171403587d79
3
+ size 117855233