Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- brontes_00840000 +3 -0
- config.yaml +211 -0
- torchscript/exported_cpu.pt +3 -0
- torchscript/exported_gpu.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
brontes_00840000 filter=lfs diff=lfs merge=lfs -text
|
brontes_00840000
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dabee31e427555b53757c8626a6687740e6f8682a8d0461eb238e6d0d8a219ce
|
| 3 |
+
size 864451583
|
config.yaml
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for TransAudio model with ~30M parameters for 48kHz audio (dataset.sample_rate below is 48000)
|
| 2 |
+
#
|
| 3 |
+
# This configuration uses STFT preprocessing with U-Net architecture for audio processing
|
| 4 |
+
# STFT parameters are optimized for 44.1kHz audio to balance frequency and temporal resolution
|
| 5 |
+
# UNet parameters are tuned to achieve approximately 30M total parameters
|
| 6 |
+
|
| 7 |
+
model:
|
| 8 |
+
# STFT Configuration for 44.1kHz audio processing
|
| 9 |
+
stft_config:
|
| 10 |
+
n_fft: 16 # FFT size - provides good frequency resolution for 44.1kHz
|
| 11 |
+
hop_length: 8 # Hop size - 1/4 of n_fft for good temporal resolution
|
| 12 |
+
win_length: 16 # Window length - same as n_fft for standard hann window
|
| 13 |
+
|
| 14 |
+
# UNet Configuration targeting ~30M parameters
|
| 15 |
+
unet_config:
|
| 16 |
+
in_ch: null # Will be automatically calculated as n_audio_channels
|
| 17 |
+
out_ch: null # Will be automatically calculated as n_audio_channels
|
| 18 |
+
base_ch: 64 # Base channel count to keep parameter count in check
|
| 19 |
+
depth: 6 # Depth of U-Net (with downsampling/upsampling)
|
| 20 |
+
ch_mults: [2, 3, 4, 6, 6, 8]
|
| 21 |
+
k: 8 # Kernel size for convolutions
|
| 22 |
+
decoder_k: null
|
| 23 |
+
stride: 4
|
| 24 |
+
norm: 'weight' # Normalization type: group norm for stability
|
| 25 |
+
act: 'snake'
|
| 26 |
+
separable: false # Use standard convolutions rather than depthwise separable
|
| 27 |
+
use_deconv: true # Use transposed convolutions for upsampling
|
| 28 |
+
bottleneck_dilations: [1, 2, 4, 8] # Dilated convolutions in bottleneck
|
| 29 |
+
learnable_alpha: false # Learnable residual scaling parameter
|
| 30 |
+
alpha_init: 1.0 # Initial value for residual scaling
|
| 31 |
+
use_lstm_bottleneck: true
|
| 32 |
+
lstm_layers: 2
|
| 33 |
+
skip_layer_indexes: [-1, -2]
|
| 34 |
+
skip_residual_scales: [1.0, 0.1]
|
| 35 |
+
# iSTFT output head configuration (iSTFTNet-style synthesis)
|
| 36 |
+
use_istft_head: false # Enable iSTFT output head instead of direct waveform
|
| 37 |
+
istft_n_fft: 32 # FFT size for iSTFT synthesis
|
| 38 |
+
istft_hop_length: 16 # Hop length for iSTFT
|
| 39 |
+
istft_win_length: null # Window length (null = same as n_fft)
|
| 40 |
+
phase_eps: 1.0e-8 # Epsilon for safe atan2 phase recovery
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Dataset configuration
|
| 44 |
+
dataset:
|
| 45 |
+
sample_rate: 48000 # Target sample rate for audio
|
| 46 |
+
chunk_size: 16384               # Audio chunk size in samples (about 0.34 seconds at 48kHz)
|
| 47 |
+
mono: true # Convert to mono
|
| 48 |
+
normalize: true # Normalize to [-1, 1]
|
| 49 |
+
file_extensions: [".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg"]
|
| 50 |
+
cache_dir: "./audio_cache" # Directory to cache resampled files
|
| 51 |
+
min_samples: 12000 # Minimum number of samples required for a file to be included
|
| 52 |
+
|
| 53 |
+
# Training Configuration
|
| 54 |
+
training:
|
| 55 |
+
# Basic training parameters
|
| 56 |
+
batch_size: 16
|
| 57 |
+
num_epochs: 9999
|
| 58 |
+
learning_rate: 0.0001 # Keep at 1e-4 as in original
|
| 59 |
+
discriminator_lr_multiplier: 1.0  # Multiplier applied to the generator LR for the discriminator (1.0 = same LR as G)
|
| 60 |
+
lr_warmup_steps: 2000           # Linear learning rate warmup over 2000 steps
|
| 61 |
+
adam_b1: 0.8 # Adam beta1, slightly higher for stability
|
| 62 |
+
adam_b2: 0.99 # Adam beta2
|
| 63 |
+
lr_decay: 0.999 # Learning rate decay per epoch
|
| 64 |
+
seed: 1234 # Random seed for reproducibility
|
| 65 |
+
fp16_run: false # Use mixed precision training (FP16)
|
| 66 |
+
bf16_run: true # Use BF16 training (mutually exclusive with fp16_run)
|
| 67 |
+
gradient_clip: 1.5 # Default gradient clipping value (0.0 to disable)
|
| 68 |
+
generator_gradient_clip: 1.0 # Generator gradient clip (defaults to gradient_clip if not set)
|
| 69 |
+
discriminator_gradient_clip: 4.0 # Discriminator gradient clip (defaults to gradient_clip if not set)
|
| 70 |
+
disc_loss_type: "hinge"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# Adversarial training parameters
|
| 74 |
+
use_adversarial: true # Enable adversarial training
|
| 75 |
+
pretrain_steps: 10000 # Number of steps to pretrain generator before adversarial training
|
| 76 |
+
pretrain_reset: true # Reset generator optimizer when switching to adversarial training
|
| 77 |
+
use_se_blocks: false # Enable Squeeze-and-Excitation blocks in discriminators
|
| 78 |
+
enable_mpd: true # Enable Multi-Period Discriminator
|
| 79 |
+
enable_msd: false # Enable Multi-Scale Discriminator
|
| 80 |
+
enable_mbsd: true # Enable Multi-Band Spectral Discriminators
|
| 81 |
+
feature_matching_weight: 1.5 # Weight for feature matching loss
|
| 82 |
+
disc_instance_noise_std: 0 # Gaussian noise std added to D inputs (prevents D overpowering, 0 to disable)
|
| 83 |
+
gen_s_weight: 1.0 # Weight for multi-scale generator loss
|
| 84 |
+
gen_f_weight: 1.0 # Weight for multi-period generator loss
|
| 85 |
+
disc_loss_weight: 1.0 # Weight for discriminator loss
|
| 86 |
+
|
| 87 |
+
# MultiBandSpec Discriminator parameters (part of unified Discriminator)
|
| 88 |
+
mbsd_window_lengths: [2048, 1024, 512] # Window lengths for each MBSD instance
|
| 89 |
+
mbsd_hop_factor: 0.25 # Hop factor as fraction of window length
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# Audio loss parameters (adapted for 44.1kHz)
|
| 93 |
+
sampling_rate: 48000            # Sample rate used for loss computation (48kHz)
|
| 94 |
+
n_fft: 2048 # Increased for 44.1kHz audio
|
| 95 |
+
win_size: 2048 # Window size matching n_fft
|
| 96 |
+
hop_size: 512 # Hop size - 1/4 of window size
|
| 97 |
+
num_mels: 80 # Number of mel bands
|
| 98 |
+
fmin: 0.0 # Minimum frequency for mel
|
| 99 |
+
fmax_for_loss: 22050.0          # Maximum frequency for loss (NOTE(review): 22050 is half of 44.1kHz, not of the configured sampling_rate 48000 — confirm intended value)
|
| 100 |
+
|
| 101 |
+
# Mel loss weight
|
| 102 |
+
mel_loss_weight: 15.0
|
| 103 |
+
|
| 104 |
+
# Multi-scale mel loss parameters
|
| 105 |
+
use_multi_scale_mel_loss: true
|
| 106 |
+
multi_scale_mel_win_lengths: [512, 1024, 2048]
|
| 107 |
+
multi_scale_mel_n_mels: [40, 80, 128]
|
| 108 |
+
multi_scale_mel_hop_divisor: 4
|
| 109 |
+
multi_scale_mel_loss_mode: "charbonnier"
|
| 110 |
+
multi_scale_mel_log_eps: 0.00001
|
| 111 |
+
multi_scale_mel_l2_weight: 1.0
|
| 112 |
+
multi_scale_mel_charbonnier_eps: 0.000001
|
| 113 |
+
multi_scale_mel_f_min: 0.0
|
| 114 |
+
multi_scale_mel_f_max: null
|
| 115 |
+
multi_scale_mel_power: 1.0
|
| 116 |
+
multi_scale_mel_scale: "htk"
|
| 117 |
+
multi_scale_mel_norm: null
|
| 118 |
+
multi_scale_mel_clamp_min: null
|
| 119 |
+
|
| 120 |
+
# MR-STFT loss parameters (updated for 44.1kHz)
|
| 121 |
+
use_mr_stft_loss: false
|
| 122 |
+
mr_stft_n_ffts: [1024, 512, 256, 128] # Updated for 44.1kHz
|
| 123 |
+
mr_stft_hop_sizes: [256, 128, 64, 32] # Updated for 44.1kHz
|
| 124 |
+
mr_stft_win_sizes: [1024, 512, 256, 128] # Updated for 44.1kHz
|
| 125 |
+
mr_stft_use_charbonnier: true
|
| 126 |
+
mr_stft_charbonnier_eps: 0.000001
|
| 127 |
+
mr_stft_loss_weight: 1.0
|
| 128 |
+
|
| 129 |
+
# Waveform-domain loss parameters
|
| 130 |
+
use_waveform_loss: false # Enable direct waveform loss
|
| 131 |
+
waveform_loss_type: "mae" # Loss type: "mse", "mae", or "charbonnier"
|
| 132 |
+
waveform_loss_weight: 1.0 # Weight for waveform loss
|
| 133 |
+
waveform_loss_charbonnier_eps: 0.000001 # Epsilon for Charbonnier loss
|
| 134 |
+
|
| 135 |
+
# Pitch loss parameters
|
| 136 |
+
use_pitch_loss: true
|
| 137 |
+
pitch_loss_use_activation_loss: false
|
| 138 |
+
pitch_loss_act_weight: 0.1
|
| 139 |
+
pitch_loss_use_charbonnier: false
|
| 140 |
+
pitch_loss_charbonnier_eps: 0.000001
|
| 141 |
+
pitch_loss_tau: 0.7
|
| 142 |
+
pitch_loss_wmin: 0.15
|
| 143 |
+
pitch_loss_conf_clip_min: 0.05
|
| 144 |
+
pitch_loss_conf_clip_max: 0.95
|
| 145 |
+
pitch_loss_vuv_thresh: 0.5
|
| 146 |
+
pitch_loss_weight: 2.0
|
| 147 |
+
pitch_loss_model: "mir-1k_g7"
|
| 148 |
+
pitch_loss_step_size: 20.0
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Loss configuration (specific to STFT domain processing)
|
| 153 |
+
loss:
|
| 154 |
+
# Add different loss functions as needed
|
| 155 |
+
mse_weight: 1.0
|
| 156 |
+
recon_weight: 2.0
|
| 157 |
+
log_mag_weight: 1.0
|
| 158 |
+
cos_phase_weight: 0.5
|
| 159 |
+
sin_phase_weight: 0.5
|
| 160 |
+
# Could add other losses like:
|
| 161 |
+
# stft_loss_weight: 0.5
|
| 162 |
+
# perceptual_loss_weight: 0.1
|
| 163 |
+
|
| 164 |
+
# Data loading
|
| 165 |
+
num_workers: 4
|
| 166 |
+
shuffle: true
|
| 167 |
+
pin_memory: true
|
| 168 |
+
|
| 169 |
+
# Checkpointing
|
| 170 |
+
checkpoint_interval: 20000 # Save checkpoint every N steps
|
| 171 |
+
validation_interval: 5000 # Run validation every N steps
|
| 172 |
+
save_best_only: true # Only save checkpoints when validation loss improves
|
| 173 |
+
|
| 174 |
+
# Logging
|
| 175 |
+
log_interval: 100 # Log training progress every N steps
|
| 176 |
+
tensorboard_log_dir: "./logs/transaudio_44khz"
|
| 177 |
+
|
| 178 |
+
# Early stopping
|
| 179 |
+
early_stopping_patience: 10 # Stop if validation loss doesn't improve for N validations
|
| 180 |
+
early_stopping_min_delta: 0.001 # Minimum change to qualify as improvement
|
| 181 |
+
|
| 182 |
+
# Hardware configuration
|
| 183 |
+
hardware:
|
| 184 |
+
num_gpus: 1 # Number of GPUs to use (0 for CPU only)
|
| 185 |
+
cuda_visible_devices: "0" # Which GPUs to use (comma separated)
|
| 186 |
+
|
| 187 |
+
# Paths
|
| 188 |
+
paths:
|
| 189 |
+
audio_dir: "./audio_files" # Directory containing training audio
|
| 190 |
+
checkpoint_dir: "./checkpoints/transaudio_44khz" # Directory to save checkpoints
|
| 191 |
+
output_dir: "./outputs" # Directory for output files
|
| 192 |
+
log_dir: "./logs/transaudio_44khz" # Directory for logs
|
| 193 |
+
|
| 194 |
+
# Validation configuration
|
| 195 |
+
validation:
|
| 196 |
+
batch_size: 2 # Reduced for 44.1kHz audio processing
|
| 197 |
+
num_workers: 2
|
| 198 |
+
|
| 199 |
+
# Inference configuration
|
| 200 |
+
inference:
|
| 201 |
+
chunk_size: 16384 # Increased for 44.1kHz audio
|
| 202 |
+
overlap: 8192 # Overlap between chunks for seamless reconstruction (1/8 of chunk)
|
| 203 |
+
batch_size: 1 # Usually 1 for inference
|
| 204 |
+
|
| 205 |
+
# Architecture notes:
|
| 206 |
+
# Input channels after STFT: 3 * (2048//2 + 1) = 3 * 1025 = 3075
|
| 207 |
+
# NOTE(review): the notes below assume depth=5 and base_ch=96, but unet_config above sets depth=6 and base_ch=64 — stale; update before relying on them
|
| 208 |
+
# Encoder: 3075 -> 96 -> 192 -> 384 -> 768 -> 1536 -> 3072
|
| 209 |
+
# Bottleneck: 3072 -> 3072 with dilated convolutions
|
| 210 |
+
# Decoder: 3072 -> 1536 -> 768 -> 384 -> 192 -> 96 -> 3075
|
| 211 |
+
# This configuration should provide approximately 30M parameters while being more manageable
|
torchscript/exported_cpu.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:685c0475651cb1c67dd6d63a2349f60fe1e18bf255409c020c727c96d79a10ff
|
| 3 |
+
size 117855169
|
torchscript/exported_gpu.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b45a17f4de3cd9c9e4f1f3f31b6b52a4f079d2d2cade65790fa1171403587d79
|
| 3 |
+
size 117855233
|