Spaces:

xiaoxuezi
/

speaker_verification

Runtime error

App Files Files Community

speaker_verification / config.py

xiaoxuezi

app.py

ce7b81a over 3 years ago

raw

history blame contribute delete

11.6 kB

	class Config(object):
	    """Attribute-style container for configuration values.

	    Each non-None entry of the mapping passed to the constructor becomes an
	    instance attribute named after its key. Entries whose value is None are
	    skipped, so None can never be stored through this class's own methods.
	    """

	    def __init__(self, config_dict: dict):
	        """Populate the instance from the non-None items of *config_dict*."""
	        for key, val in config_dict.items():
	            if val is not None:
	                setattr(self, key, val)

	    def copy(self, new_config_dict=None):
	        """Return a new Config holding this one's values plus overrides.

	        Args:
	            new_config_dict: Optional dict or Config of overrides. Entries
	                whose value is None are ignored. Defaults to no overrides.

	        Returns:
	            A new Config. The copy is shallow: attribute values (nested
	            Config objects, dicts, lists) are shared with the original.
	        """
	        # A None sentinel replaces the former mutable ``{}`` default, and
	        # delegating to replace() lets copy() accept a Config as well as a
	        # dict, exactly like replace() does.
	        ret = Config(vars(self))
	        if new_config_dict is not None:
	            ret.replace(new_config_dict)
	        return ret

	    def replace(self, new_config_dict):
	        """Overwrite attributes in place from a dict or another Config.

	        Entries whose value is None are skipped, so existing values are kept.
	        """
	        if isinstance(new_config_dict, Config):
	            new_config_dict = vars(new_config_dict)

	        for key, val in new_config_dict.items():
	            if val is not None:
	                setattr(self, key, val)

	    def print(self):
	        """Print every attribute as ``key = value``, one per line."""
	        for k, v in vars(self).items():
	            print(k, '=', v)

	    # NOTE: disabled draft kept from the original file; it would wrap nested
	    # dicts (and dicts inside lists) in Config objects.
	    # def parser_val(self, val):
	    #     if isinstance(val, dict):
	    #         return Config(val)
	    #     elif isinstance(val, list):
	    #         for i in range(len(val)):
	    #             if val is not None:
	    #                 val[i] = self.parser_val(val[i])
	    #         return val
	    #     else:
	    #         return val

	    def __str__(self):
	        """Return the string form of the attribute dictionary."""
	        return str(vars(self))


	# Default settings; the model-specific Config objects in this file list
	# only the keys they override.
	base_config = Config({
	    "project": "speaker_verification",
	    "name": "VGGVox",
	    "save_dir": "train_models/",
	    "resume": "",

	    # Training and test data
	    "dataset": Config({
	        "name": "voxceleb2_wav",
	        "train_list": "data/train_list.txt",
	        "test_list": "data/veri_list.txt",
	        "train_path": "data/voxceleb2",
	        "test_path": "data/voxceleb1",
	        "musan_path": "data/musan_split",  # noise files
	        "rir_path": "data/RIRS_NOISES/simulated_rirs",  # reverberation files
	    }),

	    # Data loader
	    "max_frames": 300,  # frame length during training
	    "eval_frames": 300,
	    "batch_size": 64,
	    "max_seg_per_spk": 500,  # maximum number of speech segments per speaker
	    "nDataLoaderThread": 16,  # multi-threaded loading
	    "augment": True,  # whether to apply data augmentation
	    "seed": 10,
	    "segment": 1,

	    # Training details
	    "test_interval": 1,  # interval between tests
	    "max_epoch": 500,

	    # Model definition
	    "n_mels": 40,
	    "log_input": False,
	    "model": "Vgg",
	    "encoder_type": "SAP",
	    "nOut": 512,

	    # Loss functions
	    "loss": "SoftmaxProto",  # loss function
	    "hard_prob": 0.5,
	    "hard_rank": 10,
	    "margin": 0.2,
	    "scale": 30,
	    "nPerSpeaker": 2,  # how many groups are taken from the same utterance
	    "nClasses": 5994,

	    # Optimizer
	    "optimizer": "adam",
	    "scheduler": "steplr",
	    "lr": 0.001,
	    "lr_decay": 0.95,
	    "weight_decay": 0,

	    # Evaluation parameters
	    "dcf_p_target": 0.05,
	    "dcf_c_miss": 1,
	    "dcf_c_fa": 1,

	    # eval
	    "eval": False,
	})

	# The active configuration. This is the same object as base_config (an
	# alias, not a copy), so in-place updates made through cfg also change
	# base_config.
	cfg = base_config

	# Override set for the "vgg" model.
	vgg_cfg = Config({
	    "name": "vgg_spectrogram",
	    "model": "vgg",
	    "batch_size": 64,
	    "nPerSpeaker": 2,
	})

	# Override set for the "UNetVgg" model with its own loss.
	Unet_cfg = Config({
	    "name": "Unet",
	    "model": "UNetVgg",
	    "batch_size": 48,
	    "nPerSpeaker": 2,
	    "loss": "Unetloss",
	})

	# Override set for the "UNetVggMask" model; also changes "segment".
	UnetMask_cfg = Config({
	    "name": "UnetMask",
	    "model": "UNetVggMask",
	    "batch_size": 16,
	    "nPerSpeaker": 2,
	    "segment": 3,
	    "loss": "UnetMaskloss",
	})

	# ECAPA_TDNN with the AamSoftmaxProto loss.
	# NOTE(review): "name" is identical to the one in ECAPA_TDNNm_cfg below;
	# confirm this is intended and not a copy-paste leftover.
	ECAPA_TDNN_cfg = Config({
	    "name": "ECAPA_TDNNm",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmaxProto",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	})

	# ECAPA_TDNN without a loss override.
	ECAPA_TDNNm_cfg = Config({
	    "name": "ECAPA_TDNNm",
	    "model": "ECAPA_TDNN",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	})

	# ECAPA_TDNN with "channels" set to 1024 and a smaller batch.
	ECAPA_TDNN1024_cfg = Config({
	    "name": "ECAPA_TDNN1024",
	    "model": "ECAPA_TDNN",
	    "batch_size": 80,
	    "nPerSpeaker": 2,
	    "channels": 1024,
	    "nOut": 192,
	})

	ECAPA_TDNN_ks5_cfg = Config({
	    "name": "ECAPA_TDNN_ks5",
	    "model": "ECAPA_TDNN_ks5",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	})

	# ECAPA_TDNN_L2 with a "resume" path set.
	ECAPA_TDNN_L2_cfg = Config({
	    "name": "ECAPA_TDNN_L2_pre",
	    "model": "ECAPA_TDNN_L2",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "resume": "train_models/speaker_verification_ECAPA_TDNN/20210725/epoch:47,EER:2.5981,MinDCF:0.1912",
	})

	ECAPA_TDNN_br_cfg = Config({
	    "name": "ECAPA_TDNN_br",
	    "model": "ECAPA_TDNN_br",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	})

	# "ECAPATDNN" model; the only entry in this group that sets "input_size".
	ECAPATDNN_cfg = Config({
	    "name": "ECAPATDNN",
	    "model": "ECAPATDNN",
	    "batch_size": 110,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "input_size": 80,
	})

	# Override set for the "hrnet" model. The per-stage settings live in a
	# nested Config under "model_cfg"; each STAGE entry is a plain dict.
	HRNet_cfg = Config({
	    "name": "hrnet",
	    "model": "hrnet",
	    "max_frames": 224,
	    "eval_frames": 224,
	    "batch_size": 48,
	    "nPerSpeaker": 2,
	    "nOut": 1024,
	    "input_size": 224 * 224,

	    "model_cfg": Config({
	        "hrnet_name": "w48",
	        "STAGE1": {
	            "NUM_MODULES": 1,
	            "NUM_BRANCHES": 1,
	            "BLOCK": "BOTTLENECK",
	            "NUM_BLOCKS": [4],
	            "NUM_CHANNELS": [64],
	            "FUSE_METHOD": "SUM",
	        },
	        "STAGE2": {
	            "NUM_MODULES": 1,
	            "NUM_BRANCHES": 2,
	            "BLOCK": "BASIC",
	            "NUM_BLOCKS": [4, 4],
	            "NUM_CHANNELS": [18, 36],
	            "FUSE_METHOD": "SUM",
	        },
	        "STAGE3": {
	            "NUM_MODULES": 4,
	            "NUM_BRANCHES": 3,
	            "BLOCK": "BASIC",
	            "NUM_BLOCKS": [4, 4, 4],
	            "NUM_CHANNELS": [18, 36, 72],
	            "FUSE_METHOD": "SUM",
	        },
	        "STAGE4": {
	            "NUM_MODULES": 3,
	            "NUM_BRANCHES": 4,
	            "BLOCK": "BASIC",
	            "NUM_BLOCKS": [4, 4, 4, 4],
	            "NUM_CHANNELS": [18, 36, 72, 144],
	            "FUSE_METHOD": "SUM",
	        },
	    }),
	})

	# Override set for the "Vggtdnn" model.
	VGG_TDNN_cfg = Config({
	    "name": "Vggtdnn1",
	    "model": "Vggtdnn",
	    "batch_size": 256,
	    "nOut": 512,
	    "nDataLoaderThread": 16,
	})

	# Override set for the "ResNetSE34V2" model.
	ResNetSE34V2_cfg = Config({
	    "name": "ResNetSE34V2",
	    "model": "ResNetSE34V2",
	    "batch_size": 128,
	    "nOut": 512,
	    "nDataLoaderThread": 16,
	})

	# Override set for the "hrtdnn" model. "model_cfg" is a nested Config
	# describing three stages, each given as a plain dict.
	HRTDNN_cfg = Config({
	    "name": "hrtdnn",
	    "model": "hrtdnn",
	    "max_frames": 300,
	    "eval_frames": 300,
	    "batch_size": 96,
	    "nPerSpeaker": 2,
	    "nOut": 256,

	    "model_cfg": Config({
	        "hrnet_name": "hrtdnn",
	        "STAGE1": {
	            "NUM_BRANCHES": 1,
	            "BLOCK": "TDNNBlock",
	            "NUM_CHANNELS": [128],
	            "FUSE_METHOD": "SUM",
	        },
	        "STAGE2": {
	            "NUM_BRANCHES": 2,
	            "BLOCK": "TDNNBlock",
	            "NUM_CHANNELS": [128, 512],
	            "FUSE_METHOD": "SUM",
	        },
	        "STAGE3": {
	            "NUM_BRANCHES": 3,
	            "BLOCK": "TDNNBlock",
	            "NUM_CHANNELS": [128, 512, 1024],
	            "FUSE_METHOD": "SUM",
	        },
	    }),
	})

	# Override sets for the ResNet/TDNN hybrid models. Each one names the
	# model and sets batch size, output size and loader thread count.
	ResTDNN_cfg = Config({
	    "name": "ResTDNN",
	    "model": "ResTDNN",
	    "batch_size": 110,
	    "nOut": 256,
	    "nDataLoaderThread": 16,
	})

	TDNN_VGG_cfg = Config({
	    "name": "TDNN_VGG",
	    "model": "TDNN_VGG",
	    "batch_size": 64,
	    "nOut": 256,
	    "nDataLoaderThread": 16,
	})

	ResNet_TDNN_cfg = Config({
	    "name": "ResNet_TDNN",
	    "model": "ResNet_TDNN",
	    "batch_size": 96,
	    "nOut": 192,
	    "nDataLoaderThread": 16,
	})

	# Same keys as ResNet_TDNN_cfg; only "name" differs.
	ResNet_TDNNa_cfg = Config({
	    "name": "ResNet_TDNNa",
	    "model": "ResNet_TDNN",
	    "batch_size": 96,
	    "nOut": 192,
	    "nDataLoaderThread": 16,
	})

	# ResNet_TDNN with the AamSoftmaxProto loss and explicit margin/scale.
	ResNet_TDNNaam_cfg = Config({
	    "name": "ResNet_TDNNaam",
	    "model": "ResNet_TDNN",
	    "loss": "AamSoftmaxProto",
	    "margin": 0.2,
	    "scale": 30,
	    "batch_size": 96,
	    "nOut": 192,
	    "nDataLoaderThread": 16,
	    "augment": True,
	})

	TDNN_ResNet_cfg = Config({
	    "name": "TDNN_ResNet",
	    "model": "TDNN_ResNet",
	    "batch_size": 48,
	    "nOut": 256,
	    "nDataLoaderThread": 16,
	})

	hr_tdnn_cfg = Config({
	    "name": "hr_tdnn",
	    "model": "hr_tdnn",
	    "batch_size": 46,
	    "nOut": 192,
	    "nDataLoaderThread": 16,
	})


	# ECAPA_TDNN variants that differ in loss, margin, batch size and
	# samples per speaker.
	ECAPA_TDNNma_cfg = Config({
	    "name": "ECAPA_TDNNma",
	    "model": "ECAPA_TDNN",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "augment": True,
	})

	ECAPA_TDNNaam_cfg = Config({
	    "name": "ECAPA_TDNNaam",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	ECAPA_TDNNaam1_cfg = Config({
	    "name": "ECAPA_TDNNaam1",
	    "model": "ECAPA_TDNN",
	    "loss": "AdditiveAngularMargin",
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	# AamSoftmax with margin 0.2.
	ECAPA_TDNNaam2_cfg = Config({
	    "name": "ECAPA_TDNNaam2",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "margin": 0.2,
	    "scale": 30,
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	# AamSoftmax with margin 0.1.
	ECAPA_TDNNaam3_cfg = Config({
	    "name": "ECAPA_TDNNaam3",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "margin": 0.1,
	    "scale": 30,
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	ECAPA_TDNN_aamproto_cfg = Config({
	    "name": "ECAPA_TDNN_aamproto",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmaxProto",
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "augment": True,
	})

	# Same as ECAPA_TDNN_aamproto_cfg but with margin and scale spelled out.
	ECAPA_TDNN_aamproto1_cfg = Config({
	    "name": "ECAPA_TDNN_aamproto1",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmaxProto",
	    "margin": 0.2,
	    "scale": 30,
	    "batch_size": 180,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "augment": True,
	})

	# AamSoftmax run with a "resume" path set.
	ECAPA_TDNN0_cfg = Config({
	    "name": "ECAPA_TDNN-1lr0.001",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "batch_size": 360,
	    "nOut": 192,
	    "nPerSpeaker": 1,
	    "resume": "train_models/speaker_verification_ECAPA_TDNN0/20210928/epoch:25,EER:2.4125,MinDCF:0.1537",
	})

	# Override set for the "SwinTransformer" model: frame counts and n_mels
	# are all 224, and the learning rate is lowered to 5e-5.
	SwinTransformer_cfg = Config({
	    "name": "SwinTransformer",
	    "model": "SwinTransformer",
	    "loss": "SoftmaxProto",
	    "max_frames": 224,
	    "eval_frames": 224,
	    "n_mels": 224,
	    "batch_size": 90,
	    "nPerSpeaker": 2,
	    "nOut": 192,
	    "augment": True,
	    "lr": 5e-5,
	})

	# AamSoftmaxProto run with a "resume" path set.
	ECAPA_TDNN_aampre_cfg = Config({
	    "name": "ECAPA_TDNN_aampre",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmaxProto",
	    "batch_size": 180,
	    "nOut": 192,
	    "nPerSpeaker": 2,
	    "resume": "train_models/speaker_verification_ECAPA_TDNNma/20210908/epoch:67,EER:2.3224,MinDCF:0.1658",
	})

	# Switched dataloader
	ECAPA_TDNN_data_cfg = Config({
	    "name": "ECAPA_TDNN_data",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	# Standard ECAPA_TDNN, CyclicLR learning rate
	# NOTE(review): no "scheduler" key is overridden here; confirm where the
	# cyclic schedule is selected.
	ECAPA_TDNNaam_cyclr_cfg = Config({
	    "name": "ECAPA_TDNNaam_cyclr",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "margin": 0.2,
	    "scale": 30,
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	# ResNet_TDNN with the switched data loading, using softmax only
	ResNet_TDNNaam_data_cfg = Config({
	    "name": "ResNet_TDNNaam_data",
	    "model": "ResNet_TDNN",
	    "loss": "AamSoftmax",
	    "margin": 0.2,
	    "scale": 30,
	    "batch_size": 192,
	    "nOut": 192,
	    "nDataLoaderThread": 16,
	    "nPerSpeaker": 1,
	    "augment": True,
	})

	# Switched dataloader, and cyclical lr
	# NOTE(review): no "scheduler" key is overridden here either.
	ECAPA_TDNN_dataClr_cfg = Config({
	    "name": "ECAPA_TDNN_dataClr",
	    "model": "ECAPA_TDNN",
	    "loss": "AamSoftmax",
	    "batch_size": 360,
	    "nPerSpeaker": 1,
	    "nOut": 192,
	    "augment": True,
	})

	def set_cfg(config_name: str):
	    """Apply the configuration named by *config_name* to the global ``cfg``.

	    The string is evaluated as a Python expression, so it may be a plain
	    name such as ``"ECAPA_TDNN_cfg"`` or an expression such as
	    ``"ECAPA_TDNN_cfg.copy({'batch_size': 64})"``. The result is merged into
	    ``cfg`` in place, so modules that already imported ``cfg`` see the new
	    values.

	    Args:
	        config_name: Expression that evaluates to a Config or a dict.
	    """
	    global cfg
	    # SECURITY: eval() executes arbitrary code. Only pass trusted strings
	    # here, never raw user or network input.
	    selected = eval(config_name)
	    cfg.replace(selected)