# Copyright (C) 2025. Huawei Technologies Co., Ltd. All Rights Reserved. (authors: Daxin Tan,
# Xiao Chen)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from simple_tokenizer_infer import SpeechTokenizer
import argparse
import librosa
import logging
from pathlib import Path

# Sampling rate passed to librosa.load for every audio file.
_SAMPLE_RATE = 16000

# Supported values of --input-type.
_INPUT_TYPES = ("tts", "reconstruct")


def _read_input_list(list_path, input_type):
    """Parse the '|'-separated evaluation list.

    Fields used per line (0-based): 0 = generated wav id, 2 = reference wav
    path, 3 = text, and, in "reconstruct" mode only, 4 = reconstructed wav
    path. Blank lines are skipped.

    Args:
        list_path: Path of the list file.
        input_type: "tts" or "reconstruct".

    Returns:
        A tuple ``(ref_paths, recon_paths, line_infos)``. ``recon_paths`` is
        empty in "tts" mode; each ``line_infos`` entry is
        ``[ref wav path, gen wav id, text]``.

    Raises:
        ValueError: If a non-blank line has too few fields.
    """
    needs_recon = input_type != "tts"
    min_fields = 5 if needs_recon else 4

    ref_paths = []
    recon_paths = []
    line_infos = []
    with open(list_path, "r", encoding="utf-8") as input_file:
        for line_no, line in enumerate(input_file, start=1):
            stripped = line.strip()
            if not stripped:
                # A trailing empty line is common and carries no entry.
                continue
            fields = stripped.split("|")
            if len(fields) < min_fields:
                raise ValueError(
                    f"{list_path}:{line_no}: expected at least {min_fields} "
                    f"'|'-separated fields, got {len(fields)}"
                )
            if needs_recon:
                recon_paths.append(fields[4])
            ref_paths.append(fields[2])
            line_infos.append([fields[2], fields[0], fields[3]])  # ref wav path, gen wav id, text
    return ref_paths, recon_paths, line_infos


def _load_wavs(base_path, rel_paths):
    """Load each audio file (relative to ``base_path``) with librosa at 16 kHz."""
    # sr=_SAMPLE_RATE makes librosa resample to 16 kHz (sr=None would keep the
    # file's native rate); the returned sampling rate is therefore not needed.
    return [
        librosa.load(base_path / rel_path, sr=_SAMPLE_RATE)[0]
        for rel_path in rel_paths
    ]


def _check_count(what, actual, expected):
    """Raise if the tokenizer returned a different number of items than listed."""
    if actual != expected:
        raise RuntimeError(
            f"{what}: got {actual} token entries for {expected} input lines"
        )


def main(args: argparse.Namespace) -> None:
    """Extract speech tokens for the audio listed in ``args.input_list``.

    Audio paths in the list are resolved relative to the directory containing
    the list file. One output line is written per input entry:

    * "tts":         ``ref_path|ref_units|gen_id|text``
    * "reconstruct": ``ref_path|ref_units|gen_id|recon_units|text``

    where the ``*_units`` values are the ``"reduced_unit_sequence"`` entries
    returned by ``SpeechTokenizer.extract``.

    Args:
        args: Parsed CLI arguments providing ``input_list``, ``output_file``,
            ``input_type``, ``ckpt``, ``cfg_path`` and ``cfg_name``.

    Raises:
        ValueError: If ``args.input_type`` is unsupported or the list file
            contains a malformed line.
        RuntimeError: If the tokenizer returns a different number of results
            than there are input entries.
    """
    if args.input_type not in _INPUT_TYPES:
        raise ValueError("Input type must be tts or reconstruct")
    is_reconstruct = args.input_type == "reconstruct"

    logging.info("loading eval file list")
    base_path = Path(args.input_list).parent
    ref_wav_file_list, reconstruct_wav_file_list, line_info_list = _read_input_list(
        args.input_list, args.input_type
    )

    logging.info("loading ref audio")
    raw_ref_wavs_list = _load_wavs(base_path, ref_wav_file_list)

    logging.info("extracting token for ref audio")
    if args.ckpt is not None:
        tokenizer = SpeechTokenizer(
            ckpt_path=args.ckpt, cfg_path=args.cfg_path, cfg_name=args.cfg_name
        )
    else:
        # No checkpoint given: rely on the tokenizer's own defaults.
        tokenizer = SpeechTokenizer()
    _, ref_token_info_list = tokenizer.extract(raw_ref_wavs_list)
    # Checked in both modes: zip() below would otherwise silently truncate.
    _check_count("ref audio", len(ref_token_info_list), len(line_info_list))

    recon_token_info_list = []
    if is_reconstruct:
        logging.info("loading reconstruct audio")
        raw_reconstruct_wav_list = _load_wavs(base_path, reconstruct_wav_file_list)

        logging.info("extracting token for reconstruct audio")
        _, recon_token_info_list = tokenizer.extract(raw_reconstruct_wav_list)
        _check_count(
            "reconstruct audio", len(recon_token_info_list), len(line_info_list)
        )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        logging.info("writing output file")
        if is_reconstruct:
            for ref, recon, line_info in zip(
                ref_token_info_list, recon_token_info_list, line_info_list
            ):
                ref_units = ref["reduced_unit_sequence"]
                recon_units = recon["reduced_unit_sequence"]
                ref_path = str(base_path / line_info[0])
                output_file.write(
                    f"{ref_path}|{ref_units}|{line_info[1]}|{recon_units}|{line_info[2]}\n"
                )
        else:
            for ref, line_info in zip(ref_token_info_list, line_info_list):
                ref_units = ref["reduced_unit_sequence"]
                ref_path = str(base_path / line_info[0])
                output_file.write(
                    f"{ref_path}|{ref_units}|{line_info[1]}|{line_info[2]}\n"
                )

    logging.info("Finished")


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt",
        dest="ckpt",
        required=False,
        help="path to ckpt",
    )
    parser.add_argument(
        "--cfg-path",
        dest="cfg_path",
        required=False,
        default="config",
        help="path to config",
    )
    parser.add_argument(
        "--cfg-name",
        dest="cfg_name",
        required=False,
        default="hubert_config",
        help="name of config",
    )
    parser.add_argument(
        "--input-list",
        dest="input_list",
        required=True,
        help="list of input wavform",
    )
    parser.add_argument(
        "--output-file",
        dest="output_file",
        required=True,
        help="file to output speech tokens",
    )
    # Optional so that the declared default actually applies; `choices` makes
    # argparse reject anything else with a usage error and non-zero exit code.
    parser.add_argument(
        "--input-type",
        default="tts",
        type=str,
        choices=_INPUT_TYPES,
        required=False,
        help="test fil list type: tts or reconstruct, seedtts format",
    )
    return parser


if __name__ == "__main__":
    # Without this the root logger stays at WARNING and every progress message
    # is dropped. It is a no-op if logging handlers are already installed.
    logging.basicConfig(level=logging.INFO)
    main(_build_arg_parser().parse_args())