# Copyright (C) 2025. Huawei Technologies Co., Ltd. All Rights Reserved. (authors: Xiao Chen)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import scipy
import argparse
from whisper_normalizer.english import EnglishTextNormalizer
import os
import string
import lingvo.tasks.asr.tools.simple_wer_v2 as WER
from tqdm import tqdm
import logging
import torch
# Key phrases forwarded to SimpleWER below; None means no phrase list is supplied.
keyphrases = None
# Shared normalizer instance; the text helpers in this file call it on
# lower-cased transcripts when args.norm_text is set.
english_normalizer = EnglishTextNormalizer()
# NOTE(review): hard-codes a CUDA device; presumably consumed by inference code
# outside this section — confirm, and confirm a GPU is always available.
device = torch.device("cuda")
# Relative path string for the ASR checkpoint (the name suggests an English
# Whisper large-v3 model) — TODO confirm where it is loaded.
en_asr_model_path = "./whisper-large-v3"
# Module-level WER scorer: calc_wer feeds it hypothesis/reference pairs through
# AddHypRef and then reads GetSummaries() and aligned_htmls from it.
wer_obj = WER.SimpleWER(
key_phrases=keyphrases,
html_handler=WER.HighlightAlignedHtmlHandler(WER.HighlightAlignedHtml),
preprocess_handler=WER.RemoveCommentTxtPreprocess,
)
def dummy_split_text(text):
    """Identity function: hand back ``text`` exactly as received.

    It has the same one-argument shape as the ``text_spliter`` callables that
    the text-collection helpers in this file invoke on each transcript.
    """
    return text
def remove_punct(text):
    """Strip ASCII punctuation from ``text`` and tidy the resulting spacing.

    Args:
        text: Input string.

    Returns:
        ``text`` with every character found in ``string.punctuation`` removed,
        followed by one pass that turns each pair of adjacent spaces into a
        single space (e.g. ``"a , b"`` becomes ``"a b"``).
    """
    # A deletion table lets str.translate drop all punctuation in one C-level
    # pass instead of concatenating the result character by character.
    stripped = text.translate(str.maketrans("", "", string.punctuation))
    # Removing free-standing punctuation leaves two spaces side by side;
    # the double space is spelled `" " * 2` so it survives whitespace-mangling.
    return stripped.replace(" " * 2, " ")
def get_gt_ref_texts_and_wav_files(
    args, gt_test_lst, gt_folder, punct_remover, text_spliter
):
    """Read a ``|``-separated list file and pair transcripts with existing wavs.

    For every line, the first field names the utterance, resolved to
    ``{gt_folder}/{name}.wav``, and the last field is its transcript. Lines
    whose wav file does not exist on disk are dropped.

    Args:
        args: Object exposing ``norm_text`` and ``remove_punct`` flags.
            ``norm_text`` takes precedence and applies ``english_normalizer``;
            otherwise ``remove_punct`` applies ``punct_remover``.
        gt_test_lst: Path of the list file to read.
        gt_folder: Directory containing the ground-truth wav files.
        punct_remover: Callable applied to the lower-cased transcript when
            ``args.remove_punct`` is set and ``args.norm_text`` is not.
        text_spliter: Callable applied last to the processed transcript.

    Returns:
        ``(reference, wav_file_list)``, where ``reference`` is a list of
        ``[processed_text, raw_text]`` pairs aligned with ``wav_file_list``.
    """

    def _prepare(raw_text):
        # Lower-case first, then apply at most one cleaning step, then split.
        lowered = raw_text.lower()
        if args.norm_text:
            cleaned = english_normalizer(lowered)
        elif args.remove_punct:
            cleaned = punct_remover(lowered)
        else:
            cleaned = lowered
        return text_spliter(cleaned)

    reference, wav_file_list = [], []
    with open(gt_test_lst, "r") as list_file:
        for entry in list_file:
            parts = entry.strip().split("|")
            wav_path = f"{gt_folder}/{parts[0]}.wav"
            if os.path.isfile(wav_path):
                wav_file_list.append(wav_path)
                reference.append([_prepare(parts[-1]), parts[-1]])
    # Both lists are appended in lock-step, so they must stay the same length.
    assert len(reference) == len(wav_file_list)
    return reference, wav_file_list
def get_ref_texts_and_gen_files(
    args, test_lst, test_folder, punct_remover, text_spliter
):
    """Read a ``|``-separated list file and derive generated-wav paths.

    For every non-blank line, the third field is a path whose basename (up to
    its first ``.``) names the generated file
    ``{test_folder}/{stem}_gen.wav``, and the last field is the transcript.
    The generated files are not checked for existence.

    Args:
        args: Object exposing ``norm_text`` and ``remove_punct`` flags.
            ``norm_text`` takes precedence and applies ``english_normalizer``;
            otherwise ``remove_punct`` applies ``punct_remover``.
        test_lst: Path of the list file to read.
        test_folder: Directory expected to hold the ``*_gen.wav`` files.
        punct_remover: Callable applied to the lower-cased transcript when
            ``args.remove_punct`` is set and ``args.norm_text`` is not.
        text_spliter: Callable applied last to the processed transcript.

    Returns:
        ``(reference, gen_file_list)``, where ``reference`` is a list of
        ``[processed_text, raw_text]`` pairs aligned with ``gen_file_list``.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    reference = []
    gen_file_list = []
    with open(test_lst, "r") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line has no third field; skip it
                # rather than failing with IndexError on fields[2].
                continue
            fields = line.split("|")
            # Basename of the source path, cut at the first dot.
            stem = fields[2].split("/")[-1].split(".")[0]
            gen_file_list.append(f"{test_folder}/{stem}_gen.wav")
            text = fields[-1].lower()
            if args.norm_text:
                truth_text = english_normalizer(text)
            elif args.remove_punct:
                truth_text = punct_remover(text)
            else:
                truth_text = text
            truth_text = text_spliter(truth_text)
            # Keep the processed text and the untouched transcript together.
            reference.append([truth_text, fields[-1]])
    # Both lists are appended in lock-step, so they must stay the same length.
    assert len(reference) == len(gen_file_list)
    return reference, gen_file_list
def get_hypo_texts(args, results_list, punct_remover, text_spliter):
    """Turn recognition results into ``[processed_text, raw_text]`` pairs.

    Args:
        args: Object exposing ``norm_text`` and ``remove_punct`` flags.
            ``norm_text`` takes precedence and applies ``english_normalizer``;
            otherwise ``remove_punct`` applies ``punct_remover``.
        results_list: Iterable of mappings, each with a ``"text"`` entry.
        punct_remover: Callable applied to the lower-cased text when
            ``args.remove_punct`` is set and ``args.norm_text`` is not.
        text_spliter: Callable applied last to the processed text.

    Returns:
        A list with one ``[processed_text, raw_text]`` pair per result, in
        input order.
    """

    def _clean(raw_text):
        # Lower-case first, then apply at most one cleaning step.
        lowered = raw_text.lower()
        if args.norm_text:
            return english_normalizer(lowered)
        if args.remove_punct:
            return punct_remover(lowered)
        return lowered

    return [
        [text_spliter(_clean(item["text"])), item["text"]]
        for item in results_list
    ]
def calc_wer(reference, hypothesis, test_lst):
logging.info(f"calc WER:")
for idx in tqdm(range(len(hypothesis))):
hypo = hypothesis[idx][0].strip()
ref = reference[idx][0].strip()
wer_obj.AddHypRef(hypo, ref)
str_summary, str_details, str_keyphrases_info = wer_obj.GetSummaries()
logging.info(f"WER summary:")
logging.info(str_summary)
logging.info(str_details)
logging.info(str_keyphrases_info)
try:
fn_output = test_lst + "_diagnosis.html"
aligned_html = "
".join(wer_obj.aligned_htmls)
with open(fn_output, "wt") as fp:
fp.write("