MFA

File size: 9,755 Bytes

2f6b10b

import collections
import re
import argparse

from bs4 import BeautifulSoup as bs
from praatio import textgrid as tgio
from praatio.utilities.constants import Interval
import soundfile
from pathlib import Path


def parse_directory(
        original_directory: Path,
        benchmark_directory: Path,
        reference_directory: Path,
        training_directory: Path
):
    benchmark_directory.mkdir(parents=True, exist_ok=True)
    reference_directory.mkdir(parents=True, exist_ok=True)
    training_directory.mkdir(parents=True, exist_ok=True)
    talk_to_speaker = collections.defaultdict(set)

    with open(original_directory.joinpath('speaker_data.dat'), 'r', encoding='shift_jis') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line = line.split()
            try:
                speaker_id = int(line[0])
            except:
                continue
            talk_ids = line[-1]
            for t_id in talk_ids.split(':'):
                talk_to_speaker[t_id].add(speaker_id)

    for k,v in talk_to_speaker.items():
        if len(v) > 1:
            print(k,v)

    for transcription_path in original_directory.iterdir():
        if transcription_path.suffix != '.xml':
            continue
        talk_name = transcription_path.stem
        duration = soundfile.info(transcription_path.with_suffix('.wav')).duration
        benchmark_path = benchmark_directory.joinpath(talk_name + '.TextGrid')
        reference_path = reference_directory.joinpath(talk_name + '.TextGrid')
        training_path = training_directory.joinpath(talk_name + '.TextGrid')
        if False and benchmark_path.exists():
            continue
        print(transcription_path.name)
        benchmark_tg = tgio.Textgrid(minTimestamp=0.0, maxTimestamp=duration)
        reference_tg = tgio.Textgrid(minTimestamp=0.0, maxTimestamp=duration)
        with (open(transcription_path, 'r', encoding='utf8') as f):
            content = bs(f.read(), 'xml')
            talks = content.findAll('Talk')
            for talk in talks:
                speaker_id = 'CSJ_' + talk.attrs['SpeakerID']
                benchmark_tier = tgio.IntervalTier(speaker_id,[], minT=0.0, maxT=duration)
                reference_word_tier = tgio.IntervalTier(f"{speaker_id} - words",[], minT=0.0, maxT=duration)
                reference_phone_tier = tgio.IntervalTier(f"{speaker_id} - phones",[], minT=0.0, maxT=duration)
                alternate_speaker = None
                if len(talk_to_speaker[talk_name]) > 1:
                    alternate_speaker = [x for x in talk_to_speaker[talk_name] if x != int(talk.attrs['SpeakerID'])][0]
                    alternate_speaker = f"CSJ_{alternate_speaker}"
                    alternate_benchmark_tier = tgio.IntervalTier(alternate_speaker,[], minT=0.0, maxT=duration)
                    alternate_reference_word_tier = tgio.IntervalTier(f"{alternate_speaker} - words",[], minT=0.0, maxT=duration)
                    alternate_reference_phone_tier = tgio.IntervalTier(f"{alternate_speaker} - phones",[], minT=0.0, maxT=duration)
                utterance_intervals = talk.findAll('IPU')
                has_reference = False
                for utterance in utterance_intervals:
                    transcription = ''
                    utt_begin = float(utterance.attrs['IPUStartTime'])
                    utt_end = float(utterance.attrs['IPUEndTime'])
                    channel = 0 if utterance.attrs['Channel'] == 'L' else 1
                    long_words = utterance.findAll('LUW')
                    skip = False
                    for long_word in long_words:
                        word = ''
                        short_words = long_word.findAll('SUW')
                        word_begin = None
                        word_end = None
                        if skip:
                            break
                        for short_word in short_words:
                            if '(R' in short_word.attrs['OrthographicTranscription'] or \
                                    '(?' in short_word.attrs['OrthographicTranscription'] or \
                                    short_word.attrs['OrthographicTranscription'] in {'<FV>'}:
                                skip = True
                                break
                            word += short_word.attrs['OrthographicTranscription']
                            moras = short_word.findAll('Mora')
                            for mora in moras:
                                phonemes = mora.findAll('Phoneme')
                                for phoneme in phonemes:
                                    phones = phoneme.findAll('Phone')
                                    phone_label = phoneme.attrs["PhonemeEntity"]
                                    begin = None
                                    end = None
                                    for p in phones:
                                        if p.attrs['PhoneEndTime'] is None:
                                            continue
                                        if begin is None:
                                            begin = float(p.attrs['PhoneStartTime'])
                                        if p.attrs['PhoneEndTime'] is not None:
                                            end = float(p.attrs['PhoneEndTime'])
                                        has_reference = True
                                        if word_begin is None:
                                            word_begin = begin
                                        word_end = end
                                    if not word.startswith('(D') and begin != end and phones:
                                        if channel == 0 and alternate_speaker is not None:
                                            alternate_reference_phone_tier.insertEntry(Interval(begin, end, phone_label))
                                        else:
                                            reference_phone_tier.insertEntry(Interval(begin, end, phone_label))
                        if skip:
                            continue
                        word = word.replace(')', '')
                        if word.startswith('(D'):
                            word = "<cutoff>"
                        elif word.startswith('(?'):
                            word = "<unk>"
                        elif word.startswith('(A'):
                            word = word.split(maxsplit=1)[-1].split(';')[0].replace('．', '点')
                        while any(word.startswith(x) for x in ['(F', '(M', '(O']):
                            word = word.split(maxsplit=1)[-1]
                        word = re.sub(r'\(D (\(\?.*?\))?.*?\)', '<unk>', word)
                        word = re.sub(r'\(A ([^;]+?)?;.*?\)', r'\1', word).replace('．', '点')
                        word = re.sub(r'\([FM] (.*?)\)', r'\1', word)
                        transcription += word
                        if word_begin is None:
                            continue
                        if word == "<cutoff>":
                            if channel == 0 and alternate_speaker is not None:
                                alternate_reference_phone_tier.insertEntry(Interval(word_begin, word_end, "spn"))
                            else:
                                reference_phone_tier.insertEntry(Interval(word_begin, word_end, "spn"))
                        if channel == 0 and alternate_speaker is not None:
                            alternate_reference_word_tier.insertEntry(Interval(word_begin, word_end, word))
                        else:
                            reference_word_tier.insertEntry(Interval(word_begin, word_end, word))
                    if not skip and transcription:
                        if channel == 1 and alternate_speaker is not None:
                            alternate_benchmark_tier.insertEntry(Interval(utt_begin, utt_end, transcription))
                        else:
                            benchmark_tier.insertEntry(Interval(utt_begin, utt_end, transcription))
                benchmark_tg.addTier(benchmark_tier)
                reference_tg.addTier(reference_word_tier)
                reference_tg.addTier(reference_phone_tier)
                if alternate_speaker is not None:
                    benchmark_tg.addTier(alternate_benchmark_tier)
                    reference_tg.addTier(alternate_reference_word_tier)
                    reference_tg.addTier(alternate_reference_phone_tier)
            if has_reference:
                reference_tg.save(
                    str(reference_path),
                    "long_textgrid",
                    includeBlankSpaces=True)
                benchmark_tg.save(
                    str(benchmark_path),
                    "long_textgrid",
                    includeBlankSpaces=True)
            benchmark_tg.save(
                str(training_path),
                "long_textgrid",
                includeBlankSpaces=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='create_seoul_benchmark',
        description='Creates two directories of TextGrid files for use with MFA, '
                    'one as input with utterances (benchmark) and one for use in reference alignments (reference)')
    parser.add_argument('original_directory')
    parser.add_argument('benchmark_directory')
    parser.add_argument('reference_directory')
    parser.add_argument('training_directory')

    args = parser.parse_args()
    parse_directory(
        Path(args.original_directory),
        Path(args.benchmark_directory),
        Path(args.reference_directory),
        Path(args.training_directory),
    )