File size: 2,574 Bytes
db0d138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
from pathlib import Path
from lib.utils import cmd
from environment import TEST_DATA

def read_json(
    source_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/WenetSpeech.json',
    output_path='/Users/jeqin/work/code/TestTranslator/test_data/wenet/wenetWenetSpeech_SMALL.json',
    subset="S",
):
    """Extract the audios of one subset from a WenetSpeech metadata file.

    Loads ``source_path``, keeps every entry of its ``"audios"`` list that has
    at least one segment whose ``"subsets"`` value contains ``subset``, and
    writes ``{"audios": [...]}`` to ``output_path`` as indented JSON.

    Each audio is written at most once, in its original order, no matter how
    many of its segments match.  (The previous implementation appended the
    audio once per matching segment, producing duplicates.)

    Args:
        source_path: Path of the full metadata JSON file to read.
        output_path: Path of the filtered JSON file to write.
        subset: Tag looked up in each segment's ``"subsets"`` with ``in``.
            Other tags seen in this file's history are ``"TEST_NET"``,
            ``"TEST_MEETING"`` and ``"DEV"``.
            NOTE(review): assumes ``"subsets"`` is a list of tags; for a
            plain string ``in`` would be a substring match -- confirm.

    Raises:
        OSError: If either file cannot be opened.
        KeyError: If an expected key (``"audios"``, ``"segments"``,
            ``"subsets"``) is missing from the input.
    """
    with open(source_path, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # any() stops at the first matching segment, so an audio is selected once.
    selected = {
        "audios": [
            audio
            for audio in data["audios"]
            if any(subset in seg["subsets"] for seg in audio["segments"])
        ]
    }

    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(selected, f, indent=4)

def move_wenet(folder: Path=TEST_DATA/"wenet", json_file="WenetSpeech_TEST_NET.json", count_limit=None):
    """Print each audio path and a running segment total for a wenet JSON file.

    Loads ``folder / json_file``, prints how many audios it lists, then for
    every audio prints its ``"path"`` followed by the running total of
    segments.  Only audios with fewer than 100 segments contribute to the
    total.

    The step that would copy each audio from a remote host is currently
    disabled (see the commented block inside the loop), so no files are moved.

    Args:
        folder: Directory that contains ``json_file``.
        json_file: Name of the metadata JSON file inside ``folder``.
        count_limit: If not ``None``, only the first ``count_limit`` audios
            are processed.  ``None`` (the default) processes all of them.

    Returns:
        The final segment total over the processed audios.
    """
    with open(folder/json_file, encoding="utf-8") as f:
        audios = json.load(f)["audios"]
    print(f"Total {len(audios)} samples in {json_file}")

    # Apply the limit after reporting so the printed total is the file's total.
    if count_limit is not None:
        audios = audios[:count_limit]

    segment_sum = 0
    for a in audios:
        print(a["path"])
        segs = len(a["segments"])
        # Audios with 100 or more segments are left out of the total.
        if segs < 100:
            segment_sum += segs
        print("segments number:", segment_sum)
        # NOTE: remote copy step, intentionally disabled.
        # remote_path = f"/home/ubuntu/data_1/yujuan/dataset_untar/{a['path']}"
        # local_dir = Path(f"/Users/jeqin/work/code/TestTranslator/test_data/wenet/{a['path']}").parent
        # if not local_dir.exists():
        #     local_dir.mkdir(parents=True, exist_ok=True)
        # command = f"scp ubuntu@192.168.110.49:{remote_path} {local_dir}/"
        # cmd(command)
    return segment_sum

# Script entry point: run move_wenet() with its default folder and JSON file.
if __name__ == "__main__":
    move_wenet()