File size: 5,120 Bytes
2f6b10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import collections
import os
import re
import shutil
import subprocess
import argparse
from pathlib import Path
from praatio.textgrid import openTextgrid, Textgrid, IntervalTier

# Hard-coded root directory of the Korean speech corpora on the author's machine.
korean_root = '/mnt/d/Data/speech/korean_corpora'

# Default Seoul Corpus locations derived from ``korean_root``.
# NOTE(review): none of the functions visible in this file read these four
# paths (they all take their directories as arguments) -- confirm whether
# they are still needed.
seoul_corpus = os.path.join(korean_root, 'seoul_corpus')
original_textgrids = os.path.join(seoul_corpus, 'original')
training_textgrids = os.path.join(seoul_corpus, 'seoul_corpus_benchmark')
reference_dir = os.path.join(seoul_corpus, 'seoul_reference_alignments')


# Filled in by ``fix_textgrids``: maps a speaker id to a dict with the keys
# 'gender' and 'age', both sliced out of the corpus file names.
speaker_info = {}

def fix_sample_rate(benchmark_directory: Path):
    """Resample every ``.flac`` file of every speaker to 16000 Hz, in place.

    ``benchmark_directory`` is scanned for sub-directories (one per speaker);
    entries that are not directories are ignored.  Inside each speaker
    directory, any file whose name contains ``resampled`` is treated as a
    leftover temporary file and deleted.  Every remaining ``.flac`` file is
    converted with ``sox`` into a temporary ``*_resampled.flac`` sibling that
    then replaces the original file.

    Args:
        benchmark_directory: Directory containing one sub-directory per speaker.

    Raises:
        subprocess.CalledProcessError: If ``sox`` exits with a non-zero status.
        OSError: If ``sox`` cannot be started or a file operation fails.
    """
    for speaker in os.listdir(benchmark_directory):
        speaker_dir = os.path.join(benchmark_directory, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for file in os.listdir(speaker_dir):
            # ``os.listdir`` yields bare names; always operate on the full path
            # so the right file is touched regardless of the working directory.
            path = os.path.join(speaker_dir, file)
            if 'resampled' in file:
                os.remove(path)
                continue
            if not file.endswith('.flac'):
                continue
            # Derive the temporary name from the file name only, so a '.flac'
            # occurring elsewhere in the path is left alone.
            stem, extension = os.path.splitext(path)
            resampled_file = stem + '_resampled' + extension
            subprocess.check_call(['sox', path, '-r', '16000', resampled_file])
            # Swap the resampled audio in as a single step so the original is
            # never missing if the process is interrupted.
            os.replace(resampled_file, path)


# Matches a label that consists solely of one angle-bracketed tag, e.g. ``<SIL>``.
_TAG_ONLY_LABEL = re.compile(r'^<[^>]+>$')
# Phone labels that are dropped entirely from the reference phone tier.
_SKIPPED_PHONES = frozenset({'<IVER>', '<SIL>', '<VOCNOISE>', '<NOISE>'})
# Seconds added before and after each utterance in the benchmark TextGrid.
_UTTERANCE_PADDING = 0.2


def _collect_utterances(entries, max_timestamp):
    """Return ``(padded, unpadded)`` utterance intervals built from ``entries``.

    ``padded`` is a list of ``[begin, end, text]`` lists in which every interval
    is widened by ``_UTTERANCE_PADDING`` (clamped to ``[0, max_timestamp]``) and
    intervals that touch or overlap after widening are merged.  ``unpadded`` is
    a list of ``(start, end, text)`` tuples with the original boundaries.
    """
    padded = []
    unpadded = []
    for interval in entries:
        if _TAG_ONLY_LABEL.match(interval.label):
            continue
        # Removing the noise tag can leave leading, trailing or doubled
        # whitespace (or nothing but whitespace); collapse it so such labels
        # are cleaned up or skipped instead of emitted as blank utterances.
        text = ' '.join(interval.label.replace('<VOCNOISE>', '').split())
        if not text:
            continue
        begin = max(interval.start - _UTTERANCE_PADDING, 0)
        end = min(interval.end + _UTTERANCE_PADDING, max_timestamp)
        if padded and begin <= padded[-1][1]:
            # The widened interval reaches into the previous one: extend it.
            padded[-1][1] = end
            padded[-1][2] += ' ' + text
        else:
            padded.append([begin, end, text])
        unpadded.append((interval.start, interval.end, text))
    return padded, unpadded


def _collect_phones(entries):
    """Return ``(start, end, label)`` tuples for the reference phone tier.

    Labels listed in ``_SKIPPED_PHONES`` are dropped; any other label starting
    with ``<`` is replaced by ``spn``.
    """
    phones = []
    for interval in entries:
        if interval.label in _SKIPPED_PHONES:
            continue
        label = 'spn' if interval.label.startswith('<') else interval.label
        phones.append((interval.start, interval.end, label))
    return phones


def fix_textgrids(original_directory: Path, benchmark_directory: Path, reference_directory: Path):
    """Split the original corpus into benchmark input and reference alignments.

    Every regular file in ``original_directory`` is handled according to its
    name: characters 0-2 are taken as the speaker id, character 3 as the gender
    and characters 4-5 as the age.  These values are printed and recorded in the
    module-level ``speaker_info`` dict, and a per-speaker sub-directory is
    created in both output directories.

    * Files that do not end in ``.TextGrid`` are moved into the speaker's
      benchmark directory.
    * For ``.TextGrid`` files, the ``utt.prono.`` tier is turned into a single
      padded and merged utterance tier (named after the speaker) saved in the
      benchmark directory, and a reference TextGrid with a ``words`` tier (the
      unpadded ``utt.prono.`` intervals) and a ``phones`` tier (from the
      ``phoneme`` tier) is saved in the reference directory.

    Sub-directories of ``original_directory`` are skipped.

    Args:
        original_directory: Directory holding the original corpus files.
        benchmark_directory: Output root for the benchmark (input) corpus.
        reference_directory: Output root for the reference alignments.

    Raises:
        IndexError: If a file name is too short to contain the gender character.
        KeyError: If a TextGrid lacks the ``utt.prono.`` or ``phoneme`` tier
            (as raised by praatio's tier lookup).
    """
    for file in os.listdir(original_directory):
        source_path = os.path.join(original_directory, file)
        if os.path.isdir(source_path):
            continue
        speaker = file[:3]
        speaker_gender = file[3]
        speaker_age = file[4:6]
        print(speaker, speaker_gender, speaker_age)
        speaker_info[speaker] = {'gender': speaker_gender, 'age': speaker_age}
        speaker_directory = os.path.join(benchmark_directory, speaker)
        reference_speaker_directory = os.path.join(reference_directory, speaker)
        os.makedirs(speaker_directory, exist_ok=True)
        os.makedirs(reference_speaker_directory, exist_ok=True)
        if not file.endswith('.TextGrid'):
            # Anything that is not a TextGrid is moved next to the new TextGrids.
            shutil.move(source_path, os.path.join(speaker_directory, file))
            continue

        tg = openTextgrid(source_path, includeEmptyIntervals=False, duplicateNamesMode='rename')
        orth = tg.getTier('utt.prono.')
        new_intervals, word_intervals = _collect_utterances(orth.entries, orth.maxTimestamp)

        # Benchmark TextGrid: one utterance tier named after the speaker.
        new_tg = Textgrid(minTimestamp=tg.minTimestamp, maxTimestamp=tg.maxTimestamp)
        new_tg.addTier(
            IntervalTier(speaker, new_intervals, minT=tg.minTimestamp, maxT=tg.maxTimestamp)
        )
        new_tg.save(
            os.path.join(speaker_directory, file),
            includeBlankSpaces=True,
            format='short_textgrid',
        )

        # Reference TextGrid: unpadded utterance intervals plus cleaned phones.
        phone_intervals = _collect_phones(tg.getTier('phoneme').entries)
        reference_tg = Textgrid(minTimestamp=tg.minTimestamp, maxTimestamp=tg.maxTimestamp)
        reference_tg.addTier(
            IntervalTier('words', word_intervals, minT=tg.minTimestamp, maxT=tg.maxTimestamp)
        )
        reference_tg.addTier(
            IntervalTier('phones', phone_intervals, minT=tg.minTimestamp, maxT=tg.maxTimestamp)
        )
        reference_tg.save(
            os.path.join(reference_speaker_directory, file),
            format='short_textgrid',
            includeBlankSpaces=True,
        )


def parse_directory(original_directory: Path, benchmark_directory: Path, reference_directory: Path):
    """Build the benchmark and reference corpora from the original corpus.

    This is a thin wrapper that hands all three directories to
    ``fix_textgrids``.

    Args:
        original_directory: Directory holding the original corpus files.
        benchmark_directory: Output root for the benchmark (input) corpus.
        reference_directory: Output root for the reference alignments.
    """
    # NOTE: the resampling step, fix_sample_rate(benchmark_directory), is
    # currently disabled.
    fix_textgrids(
        original_directory=original_directory,
        benchmark_directory=benchmark_directory,
        reference_directory=reference_directory,
    )


if __name__ == '__main__':
    # Command-line entry point: three required positional directory arguments,
    # passed on to parse_directory as Path objects.
    parser = argparse.ArgumentParser(
        prog='create_seoul_benchmark',
        description=(
            'Creates two directories of TextGrid files for use with MFA, '
            'one as input with utterances (benchmark) and one for use in reference alignments (reference)'
        ),
    )
    for positional in ('original_directory', 'benchmark_directory', 'reference_directory'):
        parser.add_argument(positional)

    args = parser.parse_args()
    original_path = Path(args.original_directory)
    benchmark_path = Path(args.benchmark_directory)
    reference_path = Path(args.reference_directory)
    parse_directory(original_path, benchmark_path, reference_path)