|
|
import csv |
|
|
import pathlib |
|
|
|
|
|
import click |
|
|
|
|
|
|
|
|
def _phonemes_from_dictionary(path: pathlib.Path) -> tuple:
    """Build the vowel and consonant sets from a syllable dictionary file.

    Each non-blank line must look like ``<syllable>\\t<phonemes>`` where the
    phonemes are whitespace-separated. A single phoneme is recorded as a
    vowel; for two phonemes the first is recorded as a consonant and the
    second as a vowel.

    Returns:
        A ``(vowels, consonants)`` pair of sets of phoneme symbols. The vowel
        set always contains the special symbols ``'SP'`` and ``'AP'``.

    Raises:
        AssertionError: a syllable maps to more than two phonemes.
        ValueError: a non-blank line does not contain exactly one tab.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(path, 'r', encoding='utf8') as f:
        for rule in f:
            if not rule.strip():
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no rule; unpacking them would fail below.
                continue
            _syllable, phonemes = rule.split('\t')
            phonemes = phonemes.split()
            assert len(phonemes) <= 2, 'We only support two-phase dictionaries for automatically adding ph_num.'
            if len(phonemes) == 1:
                vowels.add(phonemes[0])
            else:
                consonants.add(phonemes[0])
                vowels.add(phonemes[1])
    return vowels, consonants


def _phonemes_from_lists(vowels_path: pathlib.Path, consonants_path: pathlib.Path) -> tuple:
    """Build the vowel and consonant sets from two whitespace-separated files.

    Returns:
        A ``(vowels, consonants)`` pair of sets of phoneme symbols. The vowel
        set always contains the special symbols ``'SP'`` and ``'AP'``.

    Raises:
        AssertionError: some phoneme is listed both as a vowel and as a
            consonant.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(vowels_path, 'r', encoding='utf8') as f:
        vowels.update(f.read().split())
    with open(consonants_path, 'r', encoding='utf8') as f:
        consonants.update(f.read().split())
    overlapped = vowels.intersection(consonants)
    assert not overlapped, \
        'Vowel set and consonant set overlapped. The following phonemes ' \
        'appear both as vowels and as consonants:\n' \
        f'{sorted(overlapped)}'
    return vowels, consonants


def _group_sizes(ph_seq: list, consonants: set) -> list:
    """Return the sizes of consecutive phoneme groups in ``ph_seq``.

    The first phoneme always opens a group. Every later phoneme that is a
    consonant is attached to the current group, while any non-consonant
    opens a new one. The sizes therefore sum to ``len(ph_seq)``.
    """
    sizes = []
    for index, ph in enumerate(ph_seq):
        if index == 0 or ph not in consonants:
            sizes.append(1)
        else:
            sizes[-1] += 1
    return sizes


@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
        transcription: str,
        dictionary: str = None,
        vowels: str = None,
        consonants: str = None
):
    """Add (or overwrite) a ``ph_num`` column in a transcriptions CSV, in place.

    For every row, the ``ph_seq`` column is split on whitespace and divided
    into groups (see ``_group_sizes``); the space-joined group sizes are
    stored in the row's ``ph_num`` column and the file is rewritten.

    Args:
        transcription: Path to the CSV file; it is read and then overwritten.
        dictionary: Path to a tab-separated syllable dictionary. When given,
            it takes precedence over ``vowels`` / ``consonants``.
        vowels: Path to a whitespace-separated list of vowel phonemes.
        consonants: Path to a whitespace-separated list of consonant phonemes.

    Raises:
        AssertionError: neither a dictionary nor both phoneme lists were
            given, the phoneme lists overlap, the dictionary has a syllable
            with more than two phonemes, or a row contains a phoneme that is
            in neither set.
    """
    # NOTE: validation intentionally stays as ``assert`` so that the exception
    # type seen by existing callers (AssertionError) does not change.
    assert dictionary is not None or (vowels is not None and consonants is not None), \
        'Either dictionary file or vowels and consonants file should be specified.'
    if dictionary is not None:
        vowel_set, consonant_set = _phonemes_from_dictionary(
            pathlib.Path(dictionary).resolve()
        )
    else:
        vowel_set, consonant_set = _phonemes_from_lists(
            pathlib.Path(vowels).resolve(),
            pathlib.Path(consonants).resolve(),
        )

    transcription = pathlib.Path(transcription).resolve()
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks survive the round trip.
    with open(transcription, 'r', encoding='utf8', newline='') as f:
        items = list(csv.DictReader(f))

    if not items:
        # No data rows: there is nothing to annotate. Return before the file
        # is opened for writing, otherwise it would be truncated and the
        # header lookup on the first row would then fail.
        return

    # All rows are validated and annotated before the file is touched, so a
    # bad phoneme never leaves a half-written transcription behind.
    for item in items:
        ph_seq = item['ph_seq'].split()
        for ph in ph_seq:
            assert ph in vowel_set or ph in consonant_set, \
                f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
        item['ph_num'] = ' '.join(str(size) for size in _group_sizes(ph_seq, consonant_set))

    # The first row now carries every column, including 'ph_num'.
    fieldnames = list(items[0].keys())
    with open(transcription, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(items)
|
|
|
|
|
|
|
|
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    add_ph_num()
|
|
|