import json
from pathlib import Path

from lib.utils import cmd
from environment import TEST_DATA


def read_json(
    source="/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json",
    output="/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json",
    subset="S",
):
    """Extract the audios of one subset from a manifest into a new JSON file.

    The manifest is read as a JSON object with an ``"audios"`` list; every
    audio has a ``"segments"`` list and every segment a ``"subsets"``
    collection. An audio is selected when at least one of its segments lists
    ``subset``. The result is written as ``{"audios": [...]}`` with a
    4-space indent.

    Each selected audio is written exactly once, in manifest order, no matter
    how many of its segments match.

    Args:
        source: Path of the manifest to read.
        output: Path of the JSON file to write (overwritten if it exists).
        subset: Subset tag to look for in ``seg["subsets"]``. Besides the
            default ``"S"``, this script was previously used with
            ``"TEST_NET"``, ``"TEST_MEETING"`` and ``"DEV"``, writing to
            ``WenetSpeech_TEST_NET.json``,
            ``wenetWenetSpeech_TEST_MEETING.json`` and
            ``wenetWenetSpeech_DEV.json`` in the same directory.

    Raises:
        OSError: If ``source`` cannot be read or ``output`` cannot be written.
        KeyError: If an expected key is missing from the manifest.
    """
    with open(source, "r", encoding="utf-8") as f:
        data = json.load(f)

    # any() stops at the first matching segment, so an audio with several
    # matching segments is still listed only once.
    selected = {
        "audios": [
            audio
            for audio in data["audios"]
            if any(subset in seg["subsets"] for seg in audio["segments"])
        ]
    }

    # NOTE(review): the default output name starts with "wenetWenetSpeech_",
    # unlike the "WenetSpeech_TEST_NET.json" that move_wenet() reads by
    # default; kept unchanged in case existing files rely on it.
    with open(output, "w", encoding="utf-8") as f:
        json.dump(selected, f, indent=4)


def move_wenet(
    folder: Path = TEST_DATA / "wenet",
    json_file="WenetSpeech_TEST_NET.json",
    count_limit=None,
):
    """Print a summary of the audios listed in a subset manifest.

    Loads ``folder / json_file``, prints the number of audio entries, the
    path of every audio, and finally the total number of segments that
    belong to audios with fewer than 100 segments.

    The step that copied each audio from the remote host with ``scp`` is
    disabled; it is kept below as a comment for reference.

    Args:
        folder: Directory that contains the manifest.
        json_file: File name of the manifest inside ``folder``.
        count_limit: Currently unused; kept so existing callers keep working.

    Raises:
        OSError: If the manifest cannot be read.
        KeyError: If an expected key is missing from the manifest.
    """
    with open(folder / json_file, encoding="utf-8") as f:
        data = json.load(f)
    audios = data["audios"]
    print(f"Total {len(audios)} samples in {json_file}")

    segment_sum = 0
    for a in audios:
        print(a["path"])
        segs = len(a["segments"])
        # Only audios with fewer than 100 segments count towards the total.
        if segs < 100:
            segment_sum += segs
        # Disabled copy step:
        # remote_path = f"/home/ubuntu/data_1/yujuan/dataset_untar/{a['path']}"
        # local_dir = Path(f"/Users/jeqin/work/code/TestTranslator/test_data/wenet/{a['path']}").parent
        # if not local_dir.exists():
        #     local_dir.mkdir(parents=True, exist_ok=True)
        # command = f"scp ubuntu@192.168.110.49:{remote_path} {local_dir}/"
        # cmd(command)

    # NOTE(review): reported once, after the loop, as the overall total.
    print("segments number:", segment_sum)


if __name__ == "__main__":
    move_wenet()