| import json | |
| import re | |
| import csv | |
| import shutil | |
| import os | |
| import argparse | |
# Root directory for generated output: the working directory at import time.
# save_files() builds the 'dataset/persian_data/...' tree underneath it.
main_path = os.getcwd()
def get_duration(row):
    """Return the summed duration encoded in a phoneme annotation string.

    ``row`` is split on whitespace. Tokens equal to ``'|'`` are ignored;
    every other token must look like ``label[number]``: the text after the
    first ``'['`` with its last character dropped is parsed as a float and
    divided by 1000 (presumably milliseconds to seconds -- confirm against
    the data set).

    Returns 0 when there are no duration tokens. A token without ``'['``
    raises ``IndexError``; a non-numeric value raises ``ValueError``.
    """
    total = 0
    for token in row.split():
        if token != '|':
            # 'a[120]' -> ['a', '120]'] -> '120'
            number_text = token.split('[')[1][:-1]
            total += float(number_text) / 1000
    return total
def prepare_data_for_model(path, duration_lim):
    """Load an info CSV and return cleaned phoneme records within a duration limit.

    Every row's ``phenome`` column (the key is spelled that way in this code)
    is measured with ``get_duration``; rows whose duration is strictly greater
    than ``duration_lim`` are skipped. For the remaining rows the bracketed
    numeric annotations (``[123]``) are removed and each ``|`` surrounded by
    whitespace is collapsed into a single space.

    Args:
        path: Path of the CSV file. It must provide the columns ``phenome``,
            ``seg_id`` and ``speaker_id``.
        duration_lim: Largest accepted duration, in the unit returned by
            ``get_duration``.

    Returns:
        A list of ``[phoneme, utterance_name, speaker_id]`` lists, in file
        order.
    """
    # Raw strings: "\[" and "\s" are invalid escape sequences in ordinary
    # string literals and trigger warnings on current Python versions.
    duration_tag = re.compile(r"\[([0-9]+)\]")
    separator = re.compile(r"\s+\|\s+")

    data_lines = []
    # The context manager closes the file even if a row fails to parse;
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            raw_phoneme = row['phenome']
            if get_duration(raw_phoneme) > duration_lim:
                continue
            phoneme = separator.sub(' ', duration_tag.sub('', raw_phoneme))
            data_lines.append([phoneme, row['seg_id'], row['speaker_id']])
    return data_lines
def _save_split(samples, data_path, split):
    """Copy the audio and write the transcript for every sample of one split.

    Args:
        samples: Iterable of ``[phoneme, utterance_name, speaker_id]`` records.
        data_path: Directory holding the ``<split>_wav`` source folder.
        split: Split name, ``'train'`` or ``'test'``; selects both the source
            folder and the output sub-tree.

    Returns:
        True when every sample was stored, False as soon as one audio copy
        fails (the error is printed and the remaining samples are skipped).
    """
    for sample in samples:
        try:
            source_wav = os.path.join(
                data_path, '{0}_wav/{1}.wav'.format(split, sample[1]))
            # Shared prefix of the .wav copy and its .txt transcript.
            target_stem = os.path.join(
                main_path,
                'dataset/persian_data/{0}_data/speaker-{1}/book-1/utterance-{2}'
                .format(split, sample[2], sample[1]))
            os.makedirs(os.path.dirname(target_stem), exist_ok=True)
            shutil.copyfile(source_wav, target_stem + '.wav')
        except Exception as e:
            print(e)
            return False
        # The transcript is written only after the audio copy succeeded.
        with open(target_stem + '.txt', 'w') as fp:
            fp.write(sample[0])
    return True


def save_files(train_data, test_data, data_path):
    """Materialise the train and test splits under ``main_path``.

    For each record the wav file ``<data_path>/<split>_wav/<utterance>.wav``
    is copied to
    ``<main_path>/dataset/persian_data/<split>_data/speaker-<id>/book-1/utterance-<utterance>.wav``
    and the phoneme string is written next to it with a ``.txt`` extension.

    Args:
        train_data: Records for the train split, each
            ``[phoneme, utterance_name, speaker_id]``.
        test_data: Records for the test split, same layout.
        data_path: Directory containing ``train_wav`` and ``test_wav``.

    Returns:
        True if both splits were written completely; False after the first
        failed copy (the test split is not attempted when train fails).
    """
    return (_save_split(train_data, data_path, 'train')
            and _save_split(test_data, data_path, 'test'))
def main():
    """Command-line entry point: build the dataset tree from ``--data_path``.

    ``--data_path`` must contain ``train_info.csv`` and ``test_info.csv``.
    Both files are filtered with ``prepare_data_for_model`` and the result is
    handed to ``save_files``. The duration limits default to the previously
    hard-coded values (12 for train, 15 for test) and can be overridden with
    ``--train_duration_lim`` / ``--test_duration_lim``.

    Returns:
        -1 when an info CSV is missing or when saving fails; None on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', required=True)
    parser.add_argument('--train_duration_lim', type=float, default=12,
                        help='maximum duration of a train utterance')
    parser.add_argument('--test_duration_lim', type=float, default=15,
                        help='maximum duration of a test utterance')
    args = parser.parse_args()
    data_path = args.data_path

    train_data_path = os.path.join(data_path, 'train_info.csv')
    test_data_path = os.path.join(data_path, 'test_info.csv')
    for info_path in (train_data_path, test_data_path):
        if not os.path.isfile(info_path):
            print('data_path is not correct!')
            return -1

    train_data = prepare_data_for_model(train_data_path, args.train_duration_lim)
    test_data = prepare_data_for_model(test_data_path, args.test_duration_lim)
    print('number of train data: ' + str(len(train_data)))
    print('number of test data: ' + str(len(test_data)))

    # A failed save used to fall through silently and look like success.
    if not save_files(train_data, test_data, data_path):
        print('Data could not be created.')
        return -1
    print('Data is created.')
if __name__ == "__main__":
    # Hand main()'s result to the interpreter as the exit status: None (the
    # success path) exits with 0, while the -1 error return becomes a
    # non-zero status instead of being discarded.
    raise SystemExit(main())