MFA / scripts /alignment_benchmarks /data_prep /create_csj_benchmark.py

MFA

2f6b10b verified 3 months ago

9.76 kB

	import collections
	import re
	import argparse

	from bs4 import BeautifulSoup as bs
	from praatio import textgrid as tgio
	from praatio.utilities.constants import Interval
	import soundfile
	from pathlib import Path


	def _load_talk_speakers(speaker_data_path):
	    """Map each talk ID to the set of integer speaker IDs listed for it.

	    The file is read as Shift-JIS, whitespace-separated. Rows whose first
	    field is not an integer are ignored; the last field of a data row is a
	    ``:``-separated list of talk IDs.
	    """
	    talk_to_speaker = collections.defaultdict(set)
	    with open(speaker_data_path, 'r', encoding='shift_jis') as f:
	        for line in f:
	            fields = line.split()
	            if not fields:
	                continue
	            try:
	                speaker_id = int(fields[0])
	            except ValueError:
	                # Non-numeric first column: not a speaker row.
	                continue
	            for t_id in fields[-1].split(':'):
	                talk_to_speaker[t_id].add(speaker_id)
	    return talk_to_speaker


	def _normalize_word(word):
	    """Turn the concatenated orthography of one long word into its transcript form.

	    Closing parentheses are dropped first, then the tag prefix decides the
	    result: ``(D`` becomes ``<cutoff>``, ``(?`` becomes ``<unk>``, ``(A`` keeps
	    the text between the tag and the first ``;``, and leading ``(F``/``(M``/``(O``
	    tags are peeled off. Full-width ``．`` is finally replaced by ``点``.
	    """
	    word = word.replace(')', '')
	    if word.startswith('(D'):
	        word = "<cutoff>"
	    elif word.startswith('(?'):
	        word = "<unk>"
	    elif word.startswith('(A'):
	        word = word.split(maxsplit=1)[-1].split(';')[0]
	    while word.startswith(('(F', '(M', '(O')):
	        pieces = word.split(maxsplit=1)
	        if len(pieces) < 2:
	            # Tag without a whitespace-separated payload: splitting would
	            # return the same string forever, so stop here.
	            break
	        word = pieces[-1]
	    # The regex substitutions that used to follow all required a literal ')'
	    # and therefore could never match once the parentheses were stripped
	    # above; only the full-width period replacement had any effect.
	    return word.replace('．', '点')


	def parse_directory(
	    original_directory: Path,
	    benchmark_directory: Path,
	    reference_directory: Path,
	    training_directory: Path,
	    *,
	    overwrite: bool = True,
	) -> None:
	    """Convert CSJ XML transcripts into benchmark/reference/training TextGrids.

	    ``original_directory`` must contain ``speaker_data.dat`` plus, for every
	    ``<talk>.xml`` transcript, a ``<talk>.wav`` file whose duration bounds the
	    TextGrid. For each transcript:

	    * an utterance-level TextGrid is written to both ``benchmark_directory``
	      and ``training_directory``;
	    * a word/phone-level TextGrid is written to ``reference_directory`` only
	      if at least one phone with timing was found.

	    Utterances containing a short word tagged ``(R`` or ``(?`` or equal to
	    ``<FV>`` are left out of the utterance tier; reference intervals inserted
	    before the tag was encountered are kept.

	    Args:
	        original_directory: Directory holding the source XML, WAV and
	            ``speaker_data.dat`` files.
	        benchmark_directory: Output directory for utterance-level TextGrids.
	        reference_directory: Output directory for word/phone TextGrids.
	        training_directory: Second output directory for the utterance-level
	            TextGrids.
	        overwrite: When False, talks whose benchmark TextGrid already exists
	            are skipped. The default (True) regenerates everything.
	    """
	    benchmark_directory.mkdir(parents=True, exist_ok=True)
	    reference_directory.mkdir(parents=True, exist_ok=True)
	    training_directory.mkdir(parents=True, exist_ok=True)

	    talk_to_speaker = _load_talk_speakers(original_directory.joinpath('speaker_data.dat'))

	    # Report talks that list more than one speaker.
	    for k, v in talk_to_speaker.items():
	        if len(v) > 1:
	            print(k, v)

	    for transcription_path in original_directory.iterdir():
	        if transcription_path.suffix != '.xml':
	            continue
	        talk_name = transcription_path.stem
	        benchmark_path = benchmark_directory.joinpath(talk_name + '.TextGrid')
	        reference_path = reference_directory.joinpath(talk_name + '.TextGrid')
	        training_path = training_directory.joinpath(talk_name + '.TextGrid')
	        if not overwrite and benchmark_path.exists():
	            continue
	        duration = soundfile.info(transcription_path.with_suffix('.wav')).duration
	        print(transcription_path.name)
	        benchmark_tg = tgio.Textgrid(minTimestamp=0.0, maxTimestamp=duration)
	        reference_tg = tgio.Textgrid(minTimestamp=0.0, maxTimestamp=duration)
	        with open(transcription_path, 'r', encoding='utf8') as f:
	            content = bs(f.read(), 'xml')

	        # Becomes True as soon as any phone with timing is seen in this file.
	        has_reference = False
	        for talk in content.find_all('Talk'):
	            speaker_id = 'CSJ_' + talk.attrs['SpeakerID']
	            benchmark_tier = tgio.IntervalTier(speaker_id, [], minT=0.0, maxT=duration)
	            reference_word_tier = tgio.IntervalTier(f"{speaker_id} - words", [], minT=0.0, maxT=duration)
	            reference_phone_tier = tgio.IntervalTier(f"{speaker_id} - phones", [], minT=0.0, maxT=duration)

	            # Talks listed for more than one speaker get a second set of tiers
	            # named after the speaker that is not the talk's own SpeakerID.
	            alternate_speaker = None
	            if len(talk_to_speaker[talk_name]) > 1:
	                alternate_speaker = [
	                    x for x in talk_to_speaker[talk_name] if x != int(talk.attrs['SpeakerID'])
	                ][0]
	                alternate_speaker = f"CSJ_{alternate_speaker}"
	                alternate_benchmark_tier = tgio.IntervalTier(alternate_speaker, [], minT=0.0, maxT=duration)
	                alternate_reference_word_tier = tgio.IntervalTier(
	                    f"{alternate_speaker} - words", [], minT=0.0, maxT=duration)
	                alternate_reference_phone_tier = tgio.IntervalTier(
	                    f"{alternate_speaker} - phones", [], minT=0.0, maxT=duration)

	            for utterance in talk.find_all('IPU'):
	                transcription = ''
	                utt_begin = float(utterance.attrs['IPUStartTime'])
	                utt_end = float(utterance.attrs['IPUEndTime'])
	                channel = 0 if utterance.attrs['Channel'] == 'L' else 1

	                # NOTE(review): reference tiers send channel 0 to the alternate
	                # speaker while the benchmark tier (below) sends channel 1 there.
	                # The asymmetry is preserved as found; confirm which is intended.
	                if channel == 0 and alternate_speaker is not None:
	                    phone_tier = alternate_reference_phone_tier
	                    word_tier = alternate_reference_word_tier
	                else:
	                    phone_tier = reference_phone_tier
	                    word_tier = reference_word_tier

	                skip = False
	                for long_word in utterance.find_all('LUW'):
	                    word = ''
	                    word_begin = None
	                    word_end = None
	                    for short_word in long_word.find_all('SUW'):
	                        orthography = short_word.attrs['OrthographicTranscription']
	                        if '(R' in orthography or '(?' in orthography or orthography in {'<FV>'}:
	                            skip = True
	                            break
	                        word += orthography
	                        for mora in short_word.find_all('Mora'):
	                            for phoneme in mora.find_all('Phoneme'):
	                                phones = phoneme.find_all('Phone')
	                                phone_label = phoneme.attrs["PhonemeEntity"]
	                                # A phoneme spans from the start of its first
	                                # timed phone to the end of its last one.
	                                begin = None
	                                end = None
	                                for p in phones:
	                                    if p.attrs['PhoneEndTime'] is None:
	                                        continue
	                                    if begin is None:
	                                        begin = float(p.attrs['PhoneStartTime'])
	                                    end = float(p.attrs['PhoneEndTime'])
	                                    has_reference = True
	                                if word_begin is None:
	                                    word_begin = begin
	                                word_end = end
	                                # Cutoff words get a single "spn" interval later
	                                # instead of individual phones.
	                                if not word.startswith('(D') and begin != end and phones:
	                                    phone_tier.insertEntry(Interval(begin, end, phone_label))
	                    if skip:
	                        # The rest of the utterance is abandoned.
	                        break
	                    word = _normalize_word(word)
	                    transcription += word
	                    if word_begin is None:
	                        continue
	                    if word == "<cutoff>":
	                        phone_tier.insertEntry(Interval(word_begin, word_end, "spn"))
	                    word_tier.insertEntry(Interval(word_begin, word_end, word))

	                if not skip and transcription:
	                    if channel == 1 and alternate_speaker is not None:
	                        alternate_benchmark_tier.insertEntry(Interval(utt_begin, utt_end, transcription))
	                    else:
	                        benchmark_tier.insertEntry(Interval(utt_begin, utt_end, transcription))

	            benchmark_tg.addTier(benchmark_tier)
	            reference_tg.addTier(reference_word_tier)
	            reference_tg.addTier(reference_phone_tier)
	            if alternate_speaker is not None:
	                benchmark_tg.addTier(alternate_benchmark_tier)
	                reference_tg.addTier(alternate_reference_word_tier)
	                reference_tg.addTier(alternate_reference_phone_tier)

	        if has_reference:
	            reference_tg.save(
	                str(reference_path),
	                "long_textgrid",
	                includeBlankSpaces=True)
	        benchmark_tg.save(
	            str(benchmark_path),
	            "long_textgrid",
	            includeBlankSpaces=True)
	        # The training copy is the same utterance-level TextGrid.
	        benchmark_tg.save(
	            str(training_path),
	            "long_textgrid",
	            includeBlankSpaces=True)


	if __name__ == '__main__':
	    # Command-line entry point: four positional directories, forwarded to
	    # parse_directory as Path objects.
	    parser = argparse.ArgumentParser(
	        prog='create_csj_benchmark',
	        description='Creates directories of TextGrid files for use with MFA: '
	                    'one as input with utterances (benchmark), one for use in reference '
	                    'alignments (reference), and a copy of the utterance-level files '
	                    'for training (training)')
	    parser.add_argument(
	        'original_directory',
	        help='directory containing speaker_data.dat and the .xml/.wav files')
	    parser.add_argument(
	        'benchmark_directory',
	        help='output directory for utterance-level TextGrids')
	    parser.add_argument(
	        'reference_directory',
	        help='output directory for word/phone reference TextGrids')
	    parser.add_argument(
	        'training_directory',
	        help='output directory for the training copy of the utterance-level TextGrids')

	    args = parser.parse_args()
	    parse_directory(
	        Path(args.original_directory),
	        Path(args.benchmark_directory),
	        Path(args.reference_directory),
	        Path(args.training_directory),
	    )