# ameerazam08's picture
# Upload folder using huggingface_hub
# 79cf5f5 verified
import csv
import pathlib
import click
def _phoneme_sets_from_dictionary(dictionary_path):
    """Derive vowel and consonant sets from a two-phase syllable dictionary.

    Every non-blank line must look like ``<syllable><TAB><phonemes>`` where
    the phonemes are separated by whitespace. A single phoneme is recorded as
    a vowel; for two phonemes the first is recorded as a consonant and the
    second as a vowel. ``SP`` and ``AP`` are always part of the vowel set.

    Returns:
        A ``(vowels, consonants)`` tuple of sets of phoneme strings.

    Raises:
        click.ClickException: If an entry is malformed, has no phonemes, or
            has more than two phonemes.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(dictionary_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing while unpacking the split result.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise click.ClickException(
                    f'Malformed dictionary entry at line {line_no}: '
                    'expected a syllable and its phonemes separated by a single tab.'
                )
            phonemes = fields[1].split()
            if len(phonemes) > 2:
                raise click.ClickException(
                    'We only support two-phase dictionaries for automatically adding ph_num.'
                )
            if not phonemes:
                raise click.ClickException(
                    f'Dictionary entry at line {line_no} does not contain any phonemes.'
                )
            if len(phonemes) == 1:
                vowels.add(phonemes[0])
            else:
                consonants.add(phonemes[0])
                vowels.add(phonemes[1])
    return vowels, consonants


def _phoneme_sets_from_files(vowels_file, consonants_file):
    """Load vowel and consonant sets from two whitespace-separated list files.

    ``SP`` and ``AP`` are always part of the vowel set.

    Returns:
        A ``(vowels, consonants)`` tuple of sets of phoneme strings.

    Raises:
        click.ClickException: If a phoneme is listed both as a vowel and as a
            consonant.
    """
    vowels = {'SP', 'AP'}
    consonants = set()
    with open(pathlib.Path(vowels_file).resolve(), 'r', encoding='utf8') as f:
        vowels.update(f.read().split())
    with open(pathlib.Path(consonants_file).resolve(), 'r', encoding='utf8') as f:
        consonants.update(f.read().split())
    overlapped = vowels & consonants
    if overlapped:
        raise click.ClickException(
            'Vowel set and consonant set overlapped. The following phonemes '
            'appear both as vowels and as consonants:\n'
            f'{sorted(overlapped)}'
        )
    return vowels, consonants


def _group_sizes(ph_seq, consonants):
    """Split a phoneme sequence into groups and return the size of each group.

    A group starts at the current phoneme (whatever its class) and absorbs
    every directly following phoneme that is a consonant.
    """
    sizes = []
    start = 0
    while start < len(ph_seq):
        end = start + 1
        while end < len(ph_seq) and ph_seq[end] in consonants:
            end += 1
        sizes.append(end - start)
        start = end
    return sizes


@click.command(help='Add ph_num attribute into transcriptions.csv')
@click.argument('transcription', metavar='TRANSCRIPTIONS')
@click.option('--dictionary', metavar='DICTIONARY')
@click.option('--vowels', metavar='FILE')
@click.option('--consonants', metavar='FILE')
def add_ph_num(
        transcription: str,
        dictionary: str = None,
        vowels: str = None,
        consonants: str = None
):
    """Add a ``ph_num`` column to a transcriptions CSV, rewriting it in place.

    The phoneme classes come either from a two-phase dictionary
    (``--dictionary``) or from a pair of phoneme list files (``--vowels`` and
    ``--consonants``); the dictionary wins when both are given. Each row's
    ``ph_seq`` is split on whitespace, validated against the known phonemes,
    and ``ph_num`` is set to the space-joined group sizes.

    Validation raises explicitly rather than using ``assert`` so that the
    checks are still enforced when Python runs with ``-O``.

    Args:
        transcription: Path of the CSV file; must have ``ph_seq`` and
            ``name`` columns.
        dictionary: Optional path of the syllable dictionary.
        vowels: Optional path of the vowel list file.
        consonants: Optional path of the consonant list file.

    Raises:
        click.UsageError: If neither a dictionary nor both list files are
            given.
        click.ClickException: If the phoneme sources are invalid or a row
            contains an unknown phoneme.
    """
    if dictionary is None and (vowels is None or consonants is None):
        raise click.UsageError(
            'Either dictionary file or vowels and consonants file should be specified.'
        )

    if dictionary is not None:
        vowel_set, consonant_set = _phoneme_sets_from_dictionary(
            pathlib.Path(dictionary).resolve()
        )
    else:
        vowel_set, consonant_set = _phoneme_sets_from_files(vowels, consonants)

    transcription = pathlib.Path(transcription).resolve()
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(transcription, 'r', encoding='utf8', newline='') as f:
        items = list(csv.DictReader(f))

    if not items:
        # No data rows: nothing to annotate, so leave the file untouched
        # instead of failing on items[0] below.
        return

    # Every row is validated and annotated before the file is reopened for
    # writing, so a validation error never leaves a half-written CSV behind.
    for item in items:
        ph_seq = item['ph_seq'].split()
        for ph in ph_seq:
            if ph not in vowel_set and ph not in consonant_set:
                raise click.ClickException(
                    f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.'
                )
        item['ph_num'] = ' '.join(str(n) for n in _group_sizes(ph_seq, consonant_set))

    with open(transcription, 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(items[0].keys()))
        writer.writeheader()
        writer.writerows(items)
# Script entry point: invoke the click command, which parses the command-line
# arguments itself, only when this file is executed directly.
if __name__ == '__main__':
    add_ph_num()