| | import os |
| | import options |
| | import pronouncing |
| |
|
| | from tqdm.auto import tqdm |
| | from typing import List |
| | from dataset import GridDataset |
| |
|
# Build filtered "phonemes_*" dataset list files: for each video listed in the
# overlap train/val files, keep only entries whose alignment file exists, and
# track the maximum video / text / phoneme sequence lengths seen overall.
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)
datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

max_vid_len = 0
max_text_len = 0
max_phonemes_len = 0

for datasets_filename in datasets_filenames:
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # FIX: use a context manager; the original `open(...).readlines()`
    # leaked the file handle.
    with open(datasets_filepath, 'r') as list_file:
        video_filepaths = list_file.readlines()
    valid_filepaths = []

    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        basename = os.path.basename(video_filepath)
        # Paths in the list file are '/'-separated; the first component is
        # the speaker directory (e.g. 's1/bbaf2n' -> 's1').
        speaker_dirname = video_filepath.split('/')[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Video length = number of extracted .jpg frames for this clip.
        image_filenames = [
            filename for filename in os.listdir(vid_images_dir)
            if filename.endswith('.jpg')
        ]
        vid_len = len(image_filenames)

        # Skip entries whose alignment (transcript) file is missing.
        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )
        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)
        # Sanity check: the longest video must exceed twice the longest
        # label sequence (presumably a CTC-length constraint — confirm).
        # FIX: explicit raise instead of `assert`, which is silently
        # stripped when Python runs with -O.
        if not (max_vid_len > 2 * max_text_len
                and max_vid_len > 2 * max_phonemes_len):
            raise AssertionError(
                f'length check failed: vid={max_vid_len}, '
                f'text={max_text_len}, phonemes={max_phonemes_len}'
            )

        valid_filepaths.append(new_video_filepath)

    # FIX: context manager guarantees the output is flushed and closed;
    # the original `open(...).write(...)` leaked the handle.
    with open(new_datasets_filepath, 'w') as out_file:
        out_file.write('\n'.join(valid_filepaths))
    print('new valid filepaths written to:', new_datasets_filepath)

print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)