File size: 2,864 Bytes
79cf5f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pathlib

import click
import matplotlib.pyplot as plt
import tqdm

import distribution


def _load_dictionary(dict_path):
    """Parse a tab-separated dictionary file.

    Each non-blank line is expected to hold a syllable, a tab, and a
    whitespace-separated phoneme sequence. Blank lines are ignored.

    Args:
        dict_path: Path of the dictionary file to read (UTF-8).

    Returns:
        A ``(entries, phoneme_set)`` tuple: ``entries`` maps each syllable to
        its list of phonemes (a later duplicate syllable overrides an earlier
        one), and ``phoneme_set`` holds every phoneme seen on any line.

    Raises:
        click.ClickException: If a non-blank line has no tab-separated
            phoneme column.
    """
    entries = {}
    phoneme_set = set()
    with open(dict_path, 'r', encoding='utf8') as f:
        for line_no, raw in enumerate(f, start=1):
            fields = raw.strip().split('\t')
            if fields == ['']:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            if len(fields) < 2:
                # Report the offending line instead of crashing with IndexError.
                raise click.ClickException(
                    f'Malformed entry at line {line_no} of dictionary '
                    f'\'{dict_path}\': expected \'<syllable><TAB><phonemes>\''
                )
            phonemes = fields[1].split()
            entries[fields[0]] = phonemes
            phoneme_set.update(phonemes)
    return entries, phoneme_set


# noinspection PyShadowingBuiltins
@click.command(help='Validate transcription labels')
@click.option('--dir', required=True, help='Path to the segments directory')
@click.option('--dictionary', required=True, help='Path to the dictionary file')
def validate_labels(dir, dictionary):
    """Check every ``*.wav`` segment's ``.lab`` annotation against a dictionary.

    For each ``*.wav`` file directly inside ``dir`` the sibling ``.lab`` file
    must exist, be non-empty, and contain only syllables listed in the
    dictionary. Problems are printed as they are found. Afterwards, phonemes
    from the dictionary that never occur in any annotation are reported, and
    a phoneme occurrence chart is saved as ``phoneme_distribution.jpg`` inside
    ``dir``.

    Args:
        dir: Path to the directory holding the ``.wav`` / ``.lab`` pairs.
        dictionary: Path to the tab-separated dictionary file.

    Raises:
        click.ClickException: If the dictionary file contains a malformed line.
    """
    lexicon, phoneme_set = _load_dictionary(pathlib.Path(dictionary))

    # Run checks
    check_failed = False
    # Occurrence counter for every known phoneme, pre-seeded with zero so that
    # phonemes never seen in any annotation still show up in the summary.
    phoneme_map = {ph: 0 for ph in sorted(phoneme_set)}

    segments_dir = pathlib.Path(dir)
    filelist = list(segments_dir.glob('*.wav'))

    for file in tqdm.tqdm(filelist):
        annotation = file.with_suffix('.lab')
        if not annotation.exists():
            print(f'No annotation found for \'{file.stem}\'!')
            check_failed = True
            continue

        with open(annotation, 'r', encoding='utf8') as f:
            syllables = f.read().split()
        if not syllables:
            print(f'Annotation file \'{annotation}\' is empty!')
            check_failed = True
            continue

        oov = []
        for s in syllables:
            if s not in lexicon:
                oov.append(s)
                continue
            for ph in lexicon[s]:
                phoneme_map[ph] += 1
        if oov:
            print(f'Syllable(s) {oov} not allowed in annotation file \'{annotation}\'')
            check_failed = True

    # Phoneme coverage: a phoneme is uncovered when its count is still zero.
    uncovered = [ph for ph, count in phoneme_map.items() if count == 0]
    if uncovered:
        print('The following phonemes are not covered!')
        print(sorted(uncovered))
        print('Please add more recordings to cover these phonemes.')
        check_failed = True

    if not check_failed:
        print('All annotations are well prepared.')

    phoneme_list = sorted(phoneme_set)
    phoneme_counts = [phoneme_map[ph] for ph in phoneme_list]
    # NOTE(review): draw_distribution presumably draws onto the current
    # matplotlib figure, which savefig below then writes out - confirm in
    # the distribution module.
    distribution.draw_distribution(
        title='Phoneme Distribution Summary',
        x_label='Phoneme',
        y_label='Number of occurrences',
        items=phoneme_list,
        values=phoneme_counts
    )
    phoneme_summary = segments_dir / 'phoneme_distribution.jpg'
    plt.savefig(fname=phoneme_summary,
                bbox_inches='tight',
                pad_inches=0.25)
    print(f'Phoneme distribution summary saved to {phoneme_summary}')


# Script entry point: validate_labels is a click command, so calling it with
# no arguments lets click take --dir and --dictionary from the command line.
if __name__ == '__main__':
    validate_labels()