# Pro-TeVA / inference.yaml
# (HuggingFace page header preserved as a comment: Obiang — first commit 62cb0ac)
# ################################
# ProTeVa Inference Configuration
# Simplified YAML for deployment
# ################################
# Basic settings
# RNG seed for reproducibility
seed: 200
device: cpu # Change to cuda if GPU available
# Audio sampling rate in Hz expected by the feature extractor
sample_rate: 16000
# Output neurons (4 classes: blank, high, low, mid)
# Based on labelencoder.txt: 0=blank, 1=H, 2=B, 3=M
# Space (4) is added via post-processing
output_neurons: 4
blank_index: 0
# Number of prototypes
n_prototypes: 10
# Feature dimension from HuBERT
# (matches the output size of the <wav2vec2> extractor defined below)
emb_dim: 768
# Encoder settings
rnn_layers: 2
rnn_neurons: 512
# Decoder settings
dnn_blocks: 2
dnn_neurons: 512
# Pitch decoder settings
# NOTE(review): lists (one entry per decoder stage) — kept as given; confirm
# modules.PitchDecoderLayer expects list-valued hyperparameters
dec_dnn_blocks: [1]
dec_dnn_neurons: [128]
# Activation function
# !name: passes the class itself (uninstantiated) to consumers
activation: !name:torch.nn.LeakyReLU
# ============ MODULES ============
# NOTE(review): nesting indentation restored — the scraped copy had all
# child keys flush-left, which HyperPyYAML would reject / misparse.

# HuBERT feature extractor
wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
  source: "Orange/SSA-HuBERT-base-60k"
  output_norm: True
  # NOTE(review): freeze: False is unusual for inference-only use — confirm
  freeze: False
  save_path: whubert_checkpoint

# F0 extractor (requires custom module)
f0Compute: !new:modules.F0Extractor
  device: !ref <device>
  sample_rate: !ref <sample_rate>

# BiGRU Encoder over HuBERT features
enc: !new:speechbrain.nnet.RNN.GRU
  input_shape: [null, null, !ref <emb_dim>]
  hidden_size: !ref <rnn_neurons>
  num_layers: !ref <rnn_layers>
  bidirectional: True
  dropout: 0.15

# VanillaNN Decoder
dec: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
  # rnn_neurons * 2 (= 1024): bidirectional GRU concatenates both directions.
  # Derived via !ref arithmetic instead of a hard-coded constant so it tracks
  # any change to rnn_neurons above.
  input_shape: [null, null, !ref <rnn_neurons> * 2]
  activation: !ref <activation>
  dnn_blocks: !ref <dnn_blocks>
  dnn_neurons: !ref <dnn_neurons>

# Pitch Decoder (requires custom module)
pitch_dec: !new:modules.PitchDecoderLayer
  input_shape: [null, null, !ref <dnn_neurons>]
  dnn_blocks: !ref <dec_dnn_blocks>
  dnn_neurons: !ref <dec_dnn_neurons>

# Prototype Layer (requires custom module)
proto: !new:modules.PrototypeLayer
  n_prototypes: !ref <n_prototypes>
  latent_dims: !ref <dnn_neurons>

# Output linear layer: prototype similarities -> class logits
output_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <n_prototypes>
  n_neurons: !ref <output_neurons>
  bias: True

# Log softmax over class logits
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

# Label encoder (CTC tokenizer; its state is loaded by the pretrainer)
label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder
# ============ MODULES DICT ============
# Dict of modules exposed to the inference code (indentation restored:
# in the scraped copy these entries were flush-left, i.e. top-level keys).
modules:
  wav2vec2: !ref <wav2vec2>
  enc: !ref <enc>
  dec: !ref <dec>
  pitch_dec: !ref <pitch_dec>
  proto: !ref <proto>
  output_lin: !ref <output_lin>

# Model container for all trainable modules (single ModuleList entry holding
# the list of sub-modules, per SpeechBrain convention)
model: !new:torch.nn.ModuleList
  - [!ref <enc>, !ref <dec>, !ref <proto>, !ref <output_lin>, !ref <pitch_dec>]
# ============ PRETRAINER ============
# Checkpoint folder — defined BEFORE the pretrainer so the !ref
# placeholders below are guaranteed to resolve (HyperPyYAML convention:
# define a key before referencing it; the original placed this at the
# bottom of the file, after its uses).
# To change the checkpoint folder, update CHECKPOINT_FOLDER in config.py.
save_folder: ./CKPT+2025-10-20+08-19-07+00

# Loads the trained checkpoints from <save_folder>
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>
    wav2vec2: !ref <wav2vec2>
    tokenizer: !ref <label_encoder>
  paths:
    model: !ref <save_folder>/model.ckpt
    wav2vec2: !ref <save_folder>/wav2vec2.ckpt
    tokenizer: !ref <save_folder>/tokenizer.ckpt