Spaces:
No application file
No application file
# Warning: This config is under development and subject to change.

# Base config files this config builds on.
# NOTE(review): presumably merged by an mmengine-style config loader, with
# the keys defined below overriding the inherited ones -- confirm.
_base_ = [
    "./_base_/archs/diff_svc_v2.py",
    "./_base_/trainers/base.py",
    "./_base_/schedulers/warmup_cosine.py",
    "./_base_/datasets/audio_folder.py",
]

# Phoneme inventory shared by the text feature extractor and the text
# encoder below. The list length feeds the encoder's ``input_size``, so
# adding or removing an entry changes the model's input dimension.
phonemes = [
    "AP",
    "SP",
    "E",
    "En",
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "b",
    "c",
    "ch",
    "d",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "f",
    "g",
    "h",
    "i",
    "i0",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "in",
    "ing",
    "iong",
    "ir",
    "iu",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "ong",
    "ou",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "ui",
    "un",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
    "w",
    "x",
    "y",
    "z",
    "zh",
]

# Feature-extraction settings: a text feature extractor that receives the
# phoneme inventory and a transcription file path, and a pitch extractor.
preprocessing = dict(
    text_features_extractor=dict(
        type="OpenCpopTranscriptionToPhonemesDuration",
        phonemes=phonemes,
        transcription_path="dataset/transcriptions.txt",
    ),
    pitch_extractor=dict(
        type="ParselMouthPitchExtractor",
    ),
)

# Model settings layered on top of the base architecture config.
model = dict(
    type="DiffSinger",
    text_encoder=dict(
        # NOTE(review): ``_delete_=True`` presumably tells the config loader
        # to replace the inherited ``text_encoder`` entry instead of merging
        # into it -- confirm against the loader in use.
        _delete_=True,
        type="NaiveProjectionEncoder",
        # Derived from the phoneme inventory so the two stay in sync.
        # NOTE(review): the rationale for ``* 2 + 2`` is not visible in this
        # file; verify against the text feature extractor's output layout.
        input_size=len(phonemes) * 2 + 2,
        output_size=256,
    ),
    diffusion=dict(
        max_beta=0.02,
    ),
)

# Train/validation data locations; both splits use ``speaker_id=0``.
# NOTE(review): ``_delete_=True`` presumably discards the inherited
# ``dataset`` entry from the base configs -- confirm.
dataset = dict(
    _delete_=True,
    train=dict(
        type="AudioFolderDataset",
        path="dataset/diff-singer/train",
        speaker_id=0,
    ),
    valid=dict(
        type="AudioFolderDataset",
        path="dataset/diff-singer/valid",
        speaker_id=0,
    ),
)