File size: 2,925 Bytes
79cf5f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import csv
import pathlib
import click
def _load_phoneme_sets_from_dictionary(dictionary_path):
    """Build the vowel and consonant sets from a two-phase dictionary file.

    Every non-blank line must hold a syllable and its whitespace-separated
    phonemes, with exactly one TAB between the two fields. A single phoneme
    is recorded as a vowel; with two phonemes the first is recorded as a
    consonant and the second as a vowel. 'SP' and 'AP' are always vowels.

    Returns:
        A ``(vowels, consonants)`` tuple of sets of phoneme strings.

    Raises:
        click.ClickException: if a line is malformed, or an entry has no
            phonemes or more than two phonemes.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(dictionary_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. trailing empty lines) instead of
                # failing on the tuple unpacking below.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise click.ClickException(
                    f'Malformed dictionary entry at line {line_no}: expected '
                    'a syllable and its phonemes separated by a single tab.'
                )
            phonemes = fields[1].split()
            if len(phonemes) > 2:
                raise click.ClickException(
                    'We only support two-phase dictionaries for automatically adding ph_num.'
                )
            if not phonemes:
                raise click.ClickException(
                    f'Dictionary entry at line {line_no} has no phonemes.'
                )
            if len(phonemes) == 1:
                vowels.add(phonemes[0])
            else:
                consonants.add(phonemes[0])
                vowels.add(phonemes[1])
    return vowels, consonants


def _count_ph_num(ph_seq, consonants):
    """Return the size of each phoneme group in ``ph_seq`` as a list of ints.

    A group starts at the first phoneme and at every later phoneme that is
    not a consonant; consonants following a group's first phoneme are
    absorbed into that group.
    """
    counts = []
    for index, ph in enumerate(ph_seq):
        if index == 0 or ph not in consonants:
            counts.append(1)
        else:
            counts[-1] += 1
    return counts


@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
        transcription: str,
        dictionary: str = None,
        vowels: str = None,
        consonants: str = None
):
    """Compute a ``ph_num`` column for every row and rewrite the CSV in place.

    Args:
        transcription: Path to the transcriptions CSV. Each row must provide
            a ``ph_seq`` column of whitespace-separated phonemes and a
            ``name`` column (used in error messages).
        dictionary: Optional path to a two-phase dictionary file from which
            the vowel and consonant sets are derived.
        vowels: Optional path to a whitespace-separated list of vowels.
        consonants: Optional path to a whitespace-separated list of
            consonants.

    Either ``dictionary`` or both ``vowels`` and ``consonants`` must be
    given; ``dictionary`` takes precedence when all are supplied.

    Raises:
        click.UsageError: if the required options are missing.
        click.ClickException: if the phoneme sets overlap, the transcription
            file has no rows, or a row contains an unknown phoneme.
    """
    # Explicit raises instead of ``assert`` so that the checks still run
    # when Python is started with -O.
    if dictionary is None and (vowels is None or consonants is None):
        raise click.UsageError(
            'Either dictionary file or vowels and consonants file should be specified.'
        )

    if dictionary is not None:
        vowel_set, consonant_set = _load_phoneme_sets_from_dictionary(
            pathlib.Path(dictionary).resolve()
        )
    else:
        vowels_path = pathlib.Path(vowels).resolve()
        consonants_path = pathlib.Path(consonants).resolve()
        vowel_set = {'SP', 'AP'}
        consonant_set = set()
        with open(vowels_path, 'r', encoding='utf8') as f:
            vowel_set.update(f.read().split())
        with open(consonants_path, 'r', encoding='utf8') as f:
            consonant_set.update(f.read().split())
        # The overlap check applies to the explicitly supplied lists only.
        overlapped = vowel_set & consonant_set
        if overlapped:
            raise click.ClickException(
                'Vowel set and consonant set overlapped. The following phonemes '
                'appear both as vowels and as consonants:\n'
                f'{sorted(overlapped)}'
            )

    transcription_path = pathlib.Path(transcription).resolve()
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields with embedded newlines survive the read/write round trip.
    with open(transcription_path, 'r', encoding='utf8', newline='') as f:
        items = list(csv.DictReader(f))

    if not items:
        # Bail out before the file is reopened for writing; otherwise it
        # would be truncated and its header lost.
        raise click.ClickException(
            f'No items found in \'{transcription_path}\'; nothing to update.'
        )

    # Validate and annotate every row before touching the file on disk.
    for item in items:
        ph_seq = item['ph_seq'].split()
        for ph in ph_seq:
            if ph not in vowel_set and ph not in consonant_set:
                raise click.ClickException(
                    f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
                )
        item['ph_num'] = ' '.join(
            str(count) for count in _count_ph_num(ph_seq, consonant_set)
        )

    with open(transcription_path, 'w', encoding='utf8', newline='') as f:
        # The first row now carries 'ph_num', so its keys give the header.
        writer = csv.DictWriter(f, fieldnames=list(items[0]))
        writer.writeheader()
        writer.writerows(items)
# Script entry point: invoke the click command only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    add_ph_num()
|