|
|
import json |
|
|
from pathlib import Path |
|
|
from lib.utils import cmd |
|
|
from environment import TEST_DATA |
|
|
|
|
|
def read_json(
    src_json='/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json',
    dst_json='/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json',
):
    """Extract the "S" subset of a WenetSpeech manifest into its own JSON file.

    The source manifest is expected to be a JSON object with an ``"audios"``
    list; every audio entry carries a ``"segments"`` list, and every segment
    a ``"subsets"`` collection of subset tags.

    An audio entry is copied to the output when at least one of its segments
    is tagged ``"S"``.  Each matching audio is written exactly once, no matter
    how many of its segments carry the tag (appending once per matching
    segment would duplicate the whole entry in the output).

    Args:
        src_json: Path of the full manifest to read.
        dst_json: Path of the subset manifest to write (overwritten).

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If an entry lacks the ``"audios"``, ``"segments"`` or
            ``"subsets"`` key.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(src_json, 'r', encoding="utf-8") as f:
        data = json.load(f)

    small = {
        "audios": [
            audio
            for audio in data["audios"]
            if any("S" in seg["subsets"] for seg in audio["segments"])
        ]
    }

    with open(dst_json, 'w', encoding="utf-8") as f:
        json.dump(small, f, indent=4)
|
|
|
|
|
def move_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
    """Print summary statistics for a wenet JSON manifest.

    Loads ``folder / json_file``, prints the number of audio entries, prints
    the ``"path"`` of every entry, and finally prints the total number of
    segments summed over the entries that have fewer than 100 segments.

    Args:
        folder: Directory containing the manifest.
        json_file: File name of the manifest inside ``folder``.
        count_limit: Currently unused by this function.

    Returns:
        None. All results are written to standard output.
    """
    manifest_path = folder / json_file
    with open(manifest_path, encoding="utf-8") as handle:
        manifest = json.load(handle)

    entries = manifest["audios"]
    print(f"Total {len(entries)} samples in {json_file}")

    short_segment_total = 0
    for entry in entries:
        print(entry["path"])
        n_segments = len(entry["segments"])
        # Only entries with fewer than 100 segments contribute to the total.
        if not n_segments < 100:
            continue
        short_segment_total += n_segments
    print("segments number:", short_segment_total)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run move_wenet() with its default arguments.
    move_wenet()
|
|
|