# TestTranslator / scripts / wenet_utils.py
# yujuanqin: add asr test (db0d138)
import json
from pathlib import Path
from lib.utils import cmd
from environment import TEST_DATA
def read_json(
    source_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json',
    output_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json',
    subset="S",
):
    """Extract the audios belonging to one subset into their own JSON file.

    Loads the metadata file at ``source_path``, keeps every entry of its
    ``"audios"`` list that has at least one segment whose ``"subsets"``
    value contains ``subset``, and writes ``{"audios": [...]}`` to
    ``output_path`` (indented by 4 spaces).

    Each matching audio is written exactly once, in source order, no matter
    how many of its segments carry the tag.  Other subsets (for example
    ``"TEST_NET"``, ``"TEST_MEETING"`` or ``"DEV"``) can be extracted by
    passing a different ``subset`` together with a matching ``output_path``.

    Args:
        source_path: Path of the full metadata JSON to read.
        output_path: Path of the filtered JSON to write (overwritten).
        subset: Subset tag to look for in each segment's ``"subsets"``.

    Returns:
        The filtered ``{"audios": [...]}`` dictionary that was written.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If an expected key (``"audios"``, ``"segments"``,
            ``"subsets"``) is missing from the metadata.
    """
    with open(source_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # any() stops at the first tagged segment, so an audio with several
    # matching segments is not duplicated in the output.
    selected = {
        "audios": [
            audio
            for audio in data["audios"]
            if any(subset in seg["subsets"] for seg in audio["segments"])
        ]
    }

    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(selected, f, indent=4)
    return selected
def move_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
    """Print the audio paths of a subset file and total their segments.

    Loads ``folder / json_file``, prints how many audios it lists, prints
    each audio's ``"path"``, and adds up the number of segments of every
    audio that has fewer than 100 segments.  The total is printed and
    returned.

    The step that copied each audio from the remote host is disabled (see
    the commented-out code in the loop), so this function currently only
    reports; it does not move any files.

    Args:
        folder: Directory that contains ``json_file``.
        json_file: Name of the subset metadata JSON inside ``folder``.
        count_limit: Accepted for interface compatibility; currently ignored.

    Returns:
        The summed segment count of all audios with fewer than 100 segments.
    """
    with open(folder/json_file, encoding="utf-8") as f:
        audios = json.load(f)["audios"]
    print(f"Total {len(audios)} samples in {json_file}")

    segment_sum = 0
    for a in audios:
        print(a["path"])
        segs = len(a["segments"])
        # Audios with 100 or more segments are left out of the total.
        if segs < 100:
            segment_sum += segs
        # Disabled per-audio download step, kept for reference:
        # remote_path = f"/home/ubuntu/data_1/yujuan/dataset_untar/{a['path']}"
        # local_dir = Path(f"/Users/jeqin/work/code/TestTranslator/test_data/wenet/{a['path']}").parent
        # if not local_dir.exists():
        #     local_dir.mkdir(parents=True, exist_ok=True)
        # command = f"scp ubuntu@192.168.110.49:{remote_path} {local_dir}/"
        # cmd(command)
    # NOTE(review): the summary is printed once, after all audios were visited.
    print("segments number:", segment_sum)
    return segment_sum
# Script entry point: when executed directly, run move_wenet() with its
# default folder and subset file.
if __name__ == "__main__":
    move_wenet()