"""Helpers for preparing ASR test data.

Numbers the lines of a transcript file, pairs each numbered line with the
duration of its ``<index>.wav`` recording, and exports the result as CSV.
"""
import os
import csv
import wave
import re
import json
from pathlib import Path
import subprocess
from subprocess import CompletedProcess

# Default locations, relative to the directory the script is run from.
RAW_TEXT_FILE = '../test_data/recordings/text/test_asr_zh.txt'
INDEXED_TEXT_FILE = '../test_data/recordings/text/test_asr_zh_with_index.txt'
AUDIO_FOLDER = '../test_data/recordings'
OUTPUT_CSV = 'csv/text_audio_length.csv'

# Matches "<digits>. <text>"; group 1 is the index, group 2 the remaining text.
_INDEXED_LINE = re.compile(r'^(\d+)\.\s*(.*)')


def add_text_index(text_file=RAW_TEXT_FILE):
    """Print every content line of *text_file* prefixed with a running index.

    Blank lines and lines starting with ``#`` are skipped and do not consume
    an index. Output goes to stdout in the form ``"<n>. <line>"``, numbered
    from 1.

    Args:
        text_file: Path of the UTF-8 text file to number.
    """
    index = 1
    with open(text_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            print(f"{index}. {line}")
            index += 1


def get_lines_with_index(filepath):
    """Yield ``(index, text)`` string pairs for each numbered line in a file.

    A line counts as numbered when, after stripping surrounding whitespace,
    it starts with digits followed by a dot. Every other line is ignored.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Yields:
        Tuples ``(index, text)``; the index is kept as a string.
    """
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            m = _INDEXED_LINE.match(line.strip())
            if m:
                yield m.group(1), m.group(2)


def get_wav_length(wav_path):
    """Return the duration of a WAV file in seconds.

    Args:
        wav_path: Path of the WAV file.

    Returns:
        Frame count divided by frame rate, or ``0`` when the file cannot be
        read (the error is printed, not raised).
    """
    try:
        with wave.open(wav_path, 'rb') as wf:
            return wf.getnframes() / float(wf.getframerate())
    except Exception as e:
        # Deliberately best-effort: a missing or malformed recording must not
        # abort processing of the remaining files.
        print(f"Error reading {wav_path}: {e}")
        return 0


def write_csv(rows, output_csv):
    """Write *rows* to *output_csv* below a fixed three-column header.

    The parent directory of *output_csv* is created when it does not exist.

    Args:
        rows: Iterable of row sequences (index, text, audio length).
        output_csv: Destination path of the CSV file (UTF-8).
    """
    # Without this, open() raises FileNotFoundError on a fresh checkout where
    # the output directory has not been created yet.
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['序号', '文本', '音频长度(秒)'])
        writer.writerows(rows)


def print_text_and_audio_length(text_file=INDEXED_TEXT_FILE,
                                audio_folder=AUDIO_FOLDER,
                                output_csv=OUTPUT_CSV):
    """Export index, text and audio duration of every numbered line as CSV.

    For each ``(index, text)`` pair in *text_file* the duration of
    ``<audio_folder>/<index>.wav`` is looked up and rounded to two decimals.
    Recordings that cannot be read are reported with a length of ``0``.

    Args:
        text_file: Indexed transcript file (``"<n>. <text>"`` lines).
        audio_folder: Directory containing the ``<n>.wav`` recordings.
        output_csv: Destination path of the CSV file.
    """
    rows = []
    for idx, text in get_lines_with_index(text_file):
        audio_path = os.path.join(audio_folder, f"{idx}.wav")
        # get_wav_length always returns a number, so one round() is enough.
        rows.append([idx, text, round(get_wav_length(audio_path), 2)])
    write_csv(rows, output_csv)


def get_text_distance(text1, text2):
    """Compare two texts after cleaning them with the project's zh cleaner.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A tuple ``(d, nd, diff)``: the two values returned by
        ``run_textdistance`` (presumably a distance and its normalised form;
        confirm against ``lib.utils``) and the result of ``highlight_diff``
        on the cleaned texts.
    """
    # Imported locally so the rest of this module stays usable without the
    # project's lib package on the import path.
    from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff

    text1_clean = clean_text_for_comparison_zh(text1)
    text2_clean = clean_text_for_comparison_zh(text2)
    d, nd = run_textdistance(text1_clean, text2_clean)
    diff = highlight_diff(text1_clean, text2_clean, spliter="")
    return d, nd, diff


def get_origin_text_dict(text_file=INDEXED_TEXT_FILE):
    """Return a dict mapping each line index (as a string) to its text.

    When an index occurs more than once, the last occurrence wins.

    Args:
        text_file: Indexed transcript file (``"<n>. <text>"`` lines).
    """
    return dict(get_lines_with_index(text_file))


if __name__ == '__main__':
    print_text_and_audio_length()