| | import sys |
| |
|
| | sys.path.append('..') |
| |
|
| | import options |
| | import os.path |
| | import pronouncing |
| | import options as opt |
| |
|
| | from Loader import GridLoader |
| | from tqdm.auto import tqdm |
| | from dataset import GridDataset |
| | from typing import List |
| |
|
# File extensions accepted as annotation (alignment) files.
VALID_FILE_EXT = ('.txt', '.align')
# Tokens that, when found in a word's pronunciation, disqualify the sentence.
EXCLUDED_PHONEMES = ('foreign', 'french')
# A usable video must have strictly fewer frames than this.
MAX_VID_LEN = 100
# A usable video must have more than CTC_SCALE frames per entry of the
# flattened phoneme sequence (word separators included).
CTC_SCALE = 2

# Every dataset directory is resolved against the parent of the current
# working directory.
base = os.path.abspath('..')
anno_dir, phonemes_dir, images_dir = (
    os.path.join(base, subdir)
    for subdir in (
        options.alignments_dir,
        options.phonemes_dir,
        options.images_dir,
    )
)

# Entries of the annotation directory, in sorted order; each one is later
# listed as a per-speaker directory.
speaker_dirnames = os.listdir(anno_dir)
speaker_dirnames.sort()
| |
|
# Pairs that pass every validity check (filled while processing the pairs).
valid_sentence_pairs = []
# Every (speaker directory name, annotation filename) pair found on disk.
sentence_pairs = []

# Walk each speaker directory and keep the files whose extension marks them
# as annotation files.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)
    # A stray regular file in the annotation root would make os.listdir
    # raise NotADirectoryError; only real directories are scanned.
    if not os.path.isdir(speaker_dir):
        continue

    filenames = os.listdir(speaker_dir)
    for filename in filenames:
        _, ext = os.path.splitext(filename)
        if ext in VALID_FILE_EXT:
            sentence_pairs.append((speaker_dirname, filename))
| |
|
# Process the pairs in sorted (speaker, filename) order, with a progress bar.
sentence_pairs.sort()
pbar = tqdm(sentence_pairs)

# Counters: pairs rejected for their phonemes, and the largest frame count /
# flattened phoneme count seen among valid pairs.
pairs_without_phonemes = 0
max_valid_vid_len = 0
max_valid_phonemes_len = 0
# Initialised here but not updated by the code visible in this script.
max_length = 0

# Vocabularies: the "valid_" variants only receive entries from pairs that
# pass the video validity check.
unique_phonemes, valid_unique_phonemes = set(), set()
unique_text_chars = set()
unique_words, valid_unique_words = set(), set()
| |
|
# For every (speaker, annotation file) pair: count the extracted video frames,
# convert the sentence to phonemes word by word, update the running
# statistics, and write the phonemes to a per-speaker file.
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)

    # The video length is the number of .jpg frames stored for this pair.
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)
    image_filenames = [
        image_filename for image_filename in os.listdir(vid_images_dir)
        if image_filename.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    # makedirs also creates the phonemes root when it does not exist yet,
    # and exist_ok avoids the check-then-create race of exists() + mkdir().
    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    os.makedirs(phonemes_speaker_dir, exist_ok=True)
    phonemes_file = os.path.join(phonemes_speaker_dir, filename)

    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )

    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')
    sentence_phonemes = []  # one list of phonemes per word
    flat_sentence_phonemes = []  # all phonemes, words separated by ' '
    has_valid_phonemes = True

    unique_text_chars.update(sentence_str)

    for word in sentence_words:
        # Look the word up once; an empty result means the word has no
        # known pronunciation, which disqualifies the whole sentence.
        pronunciations = pronouncing.phones_for_word(word)
        if not pronunciations:
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break

        # Only the first listed pronunciation is used.
        phonemes = pronunciations[0].split(' ')

        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break

            unique_phonemes.add(phoneme)

        if not has_valid_phonemes:
            break

        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')
        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # Drop the word separator that was appended after the last word.
    if flat_sentence_phonemes and flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    num_flat_phonemes = len(flat_sentence_phonemes)

    # has_valid_phonemes is necessarily True here (see the `continue` above),
    # so only the frame-count conditions remain to be checked.
    is_valid_video = (
        0 < vid_len < MAX_VID_LEN and
        vid_len > CTC_SCALE * num_flat_phonemes
    )

    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        max_valid_vid_len = max(max_valid_vid_len, vid_len)
        max_valid_phonemes_len = max(
            max_valid_phonemes_len, num_flat_phonemes
        )

        valid_unique_words.update(sentence_words)
        for phonemes in sentence_phonemes:
            valid_unique_phonemes.update(phonemes)

    # NOTE(review): the original indentation of this section was lost; it is
    # assumed that the phoneme file is written for every pair with usable
    # phonemes, not only for valid videos -- confirm.
    # One line per word, phonemes separated by single spaces.
    raw_phonemes = '\n'.join(
        ' '.join(word_phonemes) for word_phonemes in sentence_phonemes
    )

    # Existing phoneme files are never overwritten.
    if not os.path.exists(phonemes_file):
        with open(phonemes_file, 'w') as phonemes_fh:
            phonemes_fh.write(raw_phonemes)
| |
|
| | |
| |
|
# Render each valid pair as "<speaker directory>/<file name without
# extension>" and write them, one per line, to the valid-pairs list file.
valid_pair_dirs = [
    f'{speaker_dirname}/{os.path.splitext(filename)[0]}'
    for speaker_dirname, filename in valid_sentence_pairs
]

valid_pairs_path = f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt'
# The context manager guarantees the file is flushed and closed, which the
# bare open(...).write(...) form left to the garbage collector.
with open(valid_pairs_path, 'w') as valid_pairs_file:
    valid_pairs_file.write('\n'.join(valid_pair_dirs))
| |
|
# Final report: one "LABEL value" line per statistic, in a fixed order.
summary = (
    ('VALID PAIRS', len(valid_pair_dirs)),
    ('VALID UNIQUE WORDS', valid_unique_words),
    ('PAIRS W/O PHONEMES', pairs_without_phonemes),
    ('UNIQUE PHONEMES', sorted(unique_phonemes)),
    ('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes)),
    ('UNIQUE CHARS', sorted(unique_text_chars)),
    ('MAX VALID PHONEMES LEN', max_valid_phonemes_len),
    ('MAX VALID VID LEN', max_valid_vid_len),
)
for label, value in summary:
    print(label, value)

print('>>>')
| | |