File size: 4,645 Bytes
af11ce4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/bin/bash

# Example script: evaluate TTS models with the objective metrics reported in
# the ZipVoice paper.
#
# The work is split into numbered stages. A stage N runs only when
# stage <= N <= stop_stage.

# Strict mode, the script exits on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# (Command tracing, -x, is intentionally NOT enabled.)
set -euo pipefail

# Add project root to PYTHONPATH.
# NOTE: ../../ is resolved against the current working directory, so this
# assumes the script is launched from its own directory.
# The previous value is appended only when it is non-empty; otherwise the
# result would end in ':' and carry an empty path entry.
export PYTHONPATH="../../${PYTHONPATH:+:${PYTHONPATH}}"

# First and last stage to execute (both inclusive).
stage=1
stop_stage=7

# Directory that receives everything fetched by the download stages.
download_dir=download/

# Uncomment this line to use HF mirror
# export HF_ENDPOINT=https://hf-mirror.com

# Stage 1: download the two test-set archives from the Hugging Face dataset
# repo k2-fsa/TTS_eval_datasets into ${download_dir} and unpack each archive
# in place. All expansions are quoted so that a download directory containing
# whitespace is passed to every command as a single argument.
if [[ "${stage}" -le 1 && "${stop_stage}" -ge 1 ]]; then
  echo "Stage 1: Download test sets (LibriSpeech-PC and Seed-TTS)"

  hf_repo=k2-fsa/TTS_eval_datasets
  mkdir -p "${download_dir}/"
  for file in librispeech_pc_testset.tar.gz seedtts_testset.tar.gz; do
    echo "Downloading ${file}..."
    huggingface-cli download \
      --repo-type dataset \
      --local-dir "${download_dir}/" \
      "${hf_repo}" \
      "${file}"
    echo "Extracting ${file}..."
    # Extract next to the archive, inside the download directory.
    tar -xzf "${download_dir}/${file}" -C "${download_dir}/"
  done
fi


# Stage 2: download the Hugging Face repo k2-fsa/TTS_eval_models into
# ${download_dir}/tts_eval_models. Expansions are quoted so that a download
# directory containing whitespace stays a single argument.
if [[ "${stage}" -le 2 && "${stop_stage}" -ge 2 ]]; then
  echo "Stage 2: Download all required evaluation models"
  hf_repo=k2-fsa/TTS_eval_models
  mkdir -p "${download_dir}/tts_eval_models"
  huggingface-cli download \
    --local-dir "${download_dir}/tts_eval_models" \
    "${hf_repo}"
fi


# Stage 3: run zipvoice.bin.infer_zipvoice once per test set, reading the test
# list from the downloaded test sets and writing to results/<testset>.
if [[ "${stage}" -le 3 && "${stop_stage}" -ge 3 ]]; then
  echo "Stage 3: Inference with the pre-trained ZipVoice model from huggingface"

  for testset in librispeech_pc seedtts_en seedtts_zh; do
    # Map the test set name to its test list.
    case "${testset}" in
      librispeech_pc)
        test_tsv="${download_dir}/librispeech_pc_testset/test.tsv"
        ;;
      seedtts_en)
        test_tsv="${download_dir}/seedtts_testset/en/test.tsv"
        ;;
      seedtts_zh)
        test_tsv="${download_dir}/seedtts_testset/zh/test.tsv"
        ;;
      *)
        # Guards against a name being added to the loop without a mapping.
        echo "Error: unknown testset ${testset}" >&2
        exit 1
        ;;
    esac

    echo "Inference on testset ${testset}..."
    python3 -m zipvoice.bin.infer_zipvoice \
      --model-name zipvoice \
      --test-list "${test_tsv}" \
      --res-dir "results/${testset}"
  done
fi



# Stage 4: score the wavs in results/librispeech_pc by running three modules
# in sequence: zipvoice.eval.speaker_similarity.sim, zipvoice.eval.wer.hubert
# and zipvoice.eval.mos.utmos. All of them load their models from
# ${download_dir}/tts_eval_models.
if [[ "${stage}" -le 4 && "${stop_stage}" -ge 4 ]]; then
  echo "Stage 4: Evaluation on LibriSpeech-PC"
  model_path="${download_dir}/tts_eval_models"
  wav_path=results/librispeech_pc
  test_tsv="${download_dir}/librispeech_pc_testset/test.tsv"
  # Use LibriSpeech style transcripts for WER evaluation
  transcript_tsv="${download_dir}/librispeech_pc_testset/transcript.tsv"

  python3 -m zipvoice.eval.speaker_similarity.sim \
    --wav-path "${wav_path}" \
    --test-list "${test_tsv}" \
    --model-dir "${model_path}"

  # The WER module gets transcript.tsv instead of test.tsv (see above).
  python3 -m zipvoice.eval.wer.hubert \
    --wav-path "${wav_path}" \
    --test-list "${transcript_tsv}" \
    --model-dir "${model_path}"

  python3 -m zipvoice.eval.mos.utmos \
    --wav-path "${wav_path}" \
    --model-dir "${model_path}"
fi


# Stage 5: score the wavs in results/seedtts_en by running three modules in
# sequence: zipvoice.eval.speaker_similarity.sim, zipvoice.eval.wer.seedtts
# (with --lang en) and zipvoice.eval.mos.utmos. All of them load their models
# from ${download_dir}/tts_eval_models.
if [[ "${stage}" -le 5 && "${stop_stage}" -ge 5 ]]; then
  echo "Stage 5: Evaluation on Seed-TTS test en"
  model_path="${download_dir}/tts_eval_models"
  wav_path=results/seedtts_en
  test_tsv="${download_dir}/seedtts_testset/en/test.tsv"

  python3 -m zipvoice.eval.speaker_similarity.sim \
    --wav-path "${wav_path}" \
    --test-list "${test_tsv}" \
    --model-dir "${model_path}"

  python3 -m zipvoice.eval.wer.seedtts \
    --wav-path "${wav_path}" \
    --test-list "${test_tsv}" \
    --model-dir "${model_path}" \
    --lang en

  python3 -m zipvoice.eval.mos.utmos \
    --wav-path "${wav_path}" \
    --model-dir "${model_path}"
fi

# Stage 6: score the wavs in results/seedtts_zh by running three modules in
# sequence: zipvoice.eval.speaker_similarity.sim, zipvoice.eval.wer.seedtts
# (with --lang zh) and zipvoice.eval.mos.utmos. All of them load their models
# from ${download_dir}/tts_eval_models.
if [[ "${stage}" -le 6 && "${stop_stage}" -ge 6 ]]; then
  # The banner names the zh set, matching the paths and --lang used below.
  echo "Stage 6: Evaluation on Seed-TTS test zh"
  model_path="${download_dir}/tts_eval_models"
  wav_path=results/seedtts_zh
  test_tsv="${download_dir}/seedtts_testset/zh/test.tsv"

  python3 -m zipvoice.eval.speaker_similarity.sim \
    --wav-path "${wav_path}" \
    --test-list "${test_tsv}" \
    --model-dir "${model_path}"

  python3 -m zipvoice.eval.wer.seedtts \
    --wav-path "${wav_path}" \
    --test-list "${test_tsv}" \
    --model-dir "${model_path}" \
    --lang zh

  python3 -m zipvoice.eval.mos.utmos \
    --wav-path "${wav_path}" \
    --model-dir "${model_path}"
fi