lucadellalib
/

dycast

Model card Files Files and versions

dycast / config.json

lucadellalib's picture

Upload 3 files

d8a2953 verified 23 days ago

history blame contribute delete

3.97 kB

	{
	"encoder_name": "WavLM",
	"encoder_config": {
	"hidden_dims": [
	512,
	512,
	512,
	512,
	512,
	512,
	512
	],
	"kernel_sizes": [
	10,
	3,
	3,
	3,
	3,
	2,
	2
	],
	"strides": [
	5,
	2,
	2,
	2,
	2,
	2,
	2
	],
	"num_layers": 6,
	"dim": 1024,
	"ffn_dim": 4096,
	"num_heads": 16,
	"num_buckets": 320,
	"max_distance": 800,
	"max_cached_steps": 2048,
	"dropout": 0.0,
	"conv_pos": 128,
	"conv_pos_groups": 16,
	"causal": false,
	"window_size": 512,
	"lookahead_size": 3,
	"use_flex_attention": false
	},
	"compressor_name": "FocalEncoder",
	"compressor_config": {
	"input_dim": 1024,
	"output_dim": 32,
	"hidden_dims": [
	1024,
	1024,
	1024
	],
	"downscale_factors": [
	1,
	1,
	1
	],
	"focal_window": 14,
	"focal_level": 2,
	"focal_factor": 4,
	"dropout": 0.0,
	"use_post_norm": false,
	"use_layerscale": false,
	"layerscale_init": 0.0001,
	"tanhscale_init": 0.5,
	"normalize_modulator": false,
	"causal": false,
	"window_size": 512
	},
	"boundary_predictor_name": "HazardModel",
	"boundary_predictor_config": {
	"input_dim": 1024,
	"hidden_dims": [
	1024,
	1024,
	1024
	],
	"downscale_factors": [
	1,
	1,
	1
	],
	"focal_window": 14,
	"focal_level": 2,
	"focal_factor": 4,
	"dropout": 0.0,
	"use_post_norm": false,
	"use_layerscale": false,
	"layerscale_init": 0.0001,
	"tanhscale_init": 0.5,
	"normalize_modulator": false,
	"causal": false,
	"window_size": 512
	},
	"downsampler_name": "SelectLastPool",
	"downsampler_config": {},
	"quantizer_name": "ScalarSphericalQuantizer",
	"quantizer_config": {
	"dim": 32,
	"n_levels": 4
	},
	"duration_predictor_name": "NegBinModel",
	"duration_predictor_config": {
	"input_dim": 32,
	"hidden_dims": [
	1024,
	1024,
	1024
	],
	"downscale_factors": [
	1,
	1,
	1
	],
	"focal_window": 14,
	"focal_level": 2,
	"focal_factor": 4,
	"dropout": 0.0,
	"use_post_norm": false,
	"use_layerscale": false,
	"layerscale_init": 0.0001,
	"tanhscale_init": 0.5,
	"normalize_modulator": false,
	"causal": false,
	"window_size": 512,
	"min_duration": 1,
	"eps": 0.0001
	},
	"upsampler_name": "RepeatInterleaveUnpool",
	"upsampler_config": {},
	"decompressor_name": "FocalDecoder",
	"decompressor_config": {
	"input_dim": 32,
	"output_dim": 1024,
	"hidden_dims": [
	1024,
	1024,
	1024
	],
	"upscale_factors": [
	1,
	1,
	1
	],
	"focal_window": 14,
	"focal_level": 2,
	"focal_factor": 4,
	"dropout": 0.0,
	"use_post_norm": false,
	"use_layerscale": false,
	"layerscale_init": 0.0001,
	"tanhscale_init": 0.5,
	"normalize_modulator": false,
	"causal": false,
	"window_size": 512,
	"last_window_size": 512,
	"lookahead_size": 3
	},
	"decoder_name": "Vocos",
	"decoder_config": {
	"input_dim": 1024,
	"num_layers": 8,
	"dim": 512,
	"ffn_dim": 1536,
	"kernel_size": 7,
	"layerscale_init": 0.125,
	"n_fft": 1024,
	"hop_length": 320,
	"causal": false
	},
	"char_aligner_name": "MMS",
	"char_aligner_config": {
	"checkpoint": "facebook/mms-1b-all"
	},
	"retriever_name": "LatentIVF",
	"retriever_config": {
	"input_dim": 1024,
	"latent_dim": 32,
	"hidden_dims": [
	1024,
	1024,
	1024
	],
	"downscale_factors": [
	1,
	1,
	1
	],
	"focal_window": 14,
	"focal_level": 2,
	"focal_factor": 4,
	"dropout": 0.0,
	"use_post_norm": false,
	"use_layerscale": false,
	"layerscale_init": 0.0001,
	"tanhscale_init": 0.5,
	"normalize_modulator": false,
	"causal": false,
	"window_size": 512,
	"nlist": 4096,
	"nprobe": 16
	}
	}