# MahaTTSv2/T2S/dataset.py
import os
import sys
from typing import Any
sys.path.append("../")
import linecache
import mmap
import pickle as pkl
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import transformers
from accelerate import Accelerator, DistributedDataParallelKwargs
from autoregressive import TS_model
from cleaners import english_cleaners
from librosa.filters import mel as librosa_mel_fn
from mel_spec import get_mel_spectrogram
from meta_stats import process_file, process_file_for_heads
from stft import STFT
from torch.utils.data import (DataLoader, Dataset, WeightedRandomSampler,
get_worker_info)
from tqdm import tqdm
from utilities import get_mask_from_lengths
import wandb
from config import config
from Text import code_labels, labels, text_labels
torch.manual_seed(config.seed_value)
np.random.seed(config.seed_value)
random.seed(config.seed_value)
print(text_labels)
# add semantic tokens:
# tok_enc = {j:i for i,j in enumerate(labels)}
# tok_dec = {j:i for i,j in enumerate(labels)}
# text encdec
text_enc = {j: i for i, j in enumerate(text_labels)}
text_dec = {i: j for i, j in enumerate(text_labels)}
# code encdec
code_enc = {j: i for i, j in enumerate(code_labels)}
code_dec = {i: j for i, j in enumerate(code_labels)}
def read_specific_line(filename, line_number):
line = linecache.getline(filename, line_number)
return line.strip() # Remove any leading or trailing whitespace
CLIP_LENGTH = config.CLIP_LENGTH
class semantic_dataset_batch(Dataset):
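    """Text-to-semantic-token dataset for the T2S stage.

    A reading of the code below (not an authoritative spec):
    - scale=False: the transcript, semantic-token file, and pickled reference-mel
      dict are loaded fully into memory and indexed directly.
    - scale=True: the transcript is memory-mapped, a byte offset is recorded for
      every line, and samples are streamed through a weighted round-robin
      generator over the length buckets ("heads") returned by
      process_file_for_heads (see get_head / get_worker_heads).
    """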
def __init__(
self,
transcript_path,
semantic_path=None,
ref_mels_path=None,
ref_k=3,
scale=False,
process_id=None,
total_processes=None,
):
super().__init__()
self.scale = scale
if not scale:
with open(transcript_path, "r") as file:
data = file.read().strip("\n").split("\n")[:]
with open(semantic_path, "r") as file:
semb = file.read().strip("\n").split("\n")
with open(ref_mels_path, "rb") as file:
self.ref_mels = pkl.load(file)
semb = {
i.split("\t")[0]: [j for j in i.split("\t")[1].split()] for i in semb
}
data = {i.split("|")[0]: i.split("|")[1].strip().lower() for i in data}
self.data = [[i, semb[i], data[i]] for i in data.keys()]
else:
# with open(transcript_path,'r') as file:
# get meta for dataset
# for count, line in enumerate(file):
# pass
# count = 80
print(transcript_path)
# self.weights,self.count = process_file(transcript_path)
self.heads, self.weights, self.count = process_file_for_heads(
transcript_path, total_processes, process_id
)
print("length :", self.count)
self.data_len = self.count
self.transcript_path = transcript_path
line_index = {}
with open(transcript_path, "rb") as file:
mmapped_file = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ)
line_number = 0
offset = 0
                while offset < len(mmapped_file):
                    line_index[line_number] = offset
                    newline_pos = mmapped_file.find(b"\n", offset)
                    if newline_pos == -1:
                        # last line has no trailing newline; stop indexing
                        break
                    offset = newline_pos + 1
                    # print(line_number,offset)
                    line_number += 1
self.mmapped_file = mmapped_file
self.line_index = line_index
self.process_id = process_id
self.total_processes = total_processes
self.iterator = None
self.ref_k = ref_k
self.max_wav_value = config.MAX_WAV_VALUE
self.stft_fn = STFT(config.filter_length, config.hop_length, config.win_length)
mel_basis = librosa_mel_fn(
sr=config.sampling_rate,
n_fft=config.filter_length,
n_mels=config.n_mel_channels,
fmin=config.mel_fmin,
fmax=config.mel_fmax,
)
self.mel_basis = torch.from_numpy(mel_basis).float()
def get_mel(self, filepath):
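        """Load an audio file and return (mel_spectrogram, energy).

        The mel is computed by get_mel_spectrogram; energy is currently returned
        as an empty list (the commented-out code below suggests it was derived
        from STFT magnitudes in an earlier version).
        """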
# audio, sampling_rate = load_wav_to_torch(filepath)
# audio_norm = audio / self.max_wav_value
audio_norm, sampling_rate = torchaudio.load(filepath)
# dur = audio_norm.shape[-1]/sampling_rate
# if dur<0.5:
# return None,None,None
# if self.clip and dur>10 and align:
# # print('big file',dur)
# max_audio_start = int(dur - 10)
# audio_start = random.randint(0, max_audio_start)
# audio_norm = audio_norm[:,audio_start*sampling_rate:(audio_start+10)*sampling_rate]
# semb_ids = semb_ids[audio_start*50:(audio_start+10)*50 -1]
# 86 mel -> 1s for 22050 setting
        # 93 mel -> 1s for 24000 setting
# add 64ms of values to start and end
# audio_norm += torch.randn(audio_norm.shape[0])*1e-8
# audio_norm = torch.concat([torch.randn(1412)*1e-8,audio_norm,torch.randn(1412)*1e-8])
# audio_norm = audio_norm.unsqueeze(0)
# y = torch.autograd.Variable(audio_norm, requires_grad=False)
# assert(torch.min(y.data) >= -1)
# assert(torch.max(y.data) <= 1)
# magnitudes, phases = self.stft_fn.transform(y)
# magnitudes = magnitudes.data
# mel_output = torch.matmul(self.mel_basis, magnitudes)
# mel_output = dynamic_range_compression(mel_output)
# melspec = torch.squeeze(mel_output, 0)
# energy = torch.norm(magnitudes, dim=1).squeeze(0)
# melspec,energy = mel_spectrogram(audio_norm)
melspec = get_mel_spectrogram(audio_norm, sampling_rate).squeeze(0)
energy = []
# if align:
# return melspec,list(energy),semb_ids
return melspec, list(energy)
def __len__(self):
if self.scale:
return self.data_len
return len(self.data)
# def get_process_heads(self,):
# '''
# divide data and heads based on the batch_size and weights
# '''
# new_heads ={}
# new_weights =[]
# process_batch_size = config.ts_batch_size*config.ts_gradient_accumulation_steps
# sm=0
# for i,j in zip(self.heads,self.weights):
# if sm + j > process_batch_size:
# if sm+j == process_batch_size:
# new_heads[i] = self.heads[i]
# new_weights.append(j)
# else:
# new_heads[i] = self.heads[i][:len(self.heads[i])*(process_batch_size-sm)//process_batch_size]
# new_weights.append(process_batch_size-sm)
# else:
# new_heads[i] = self.heads[i]
# new_weights.append(j)
# self.get_worker_heads()
# old heads and weights
# new_heads = {}
# for i in self.heads:
# segment_size = (len(self.heads[i]) + self.total_processes - 1) // self.total_processes
# start_idx = self.process_id * segment_size
# end_idx = start_idx + segment_size
# if end_idx > len(self.heads[i]):
# # Create a list that wraps around to the beginning
# segment = self.heads[i][start_idx:] + self.heads[i][:end_idx - len(self.heads[i])]
# else:
# segment = self.heads[i][start_idx:end_idx]
# new_heads[i]=segment
# self.heads = new_heads
# print(self.process_id,[len(self.heads[i]) for i in self.heads])
# self.get_worker_heads()
def get_worker_heads(
self,
):
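        """Shard every head across the DataLoader workers of this process.

        Each worker keeps a contiguous segment of each head; when the segment
        would run past the end of the list it wraps around to the beginning, so
        every worker ends up with roughly segment_size entries per head.
        """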
self.worker_id = get_worker_info().id
self.num_worker = get_worker_info().num_workers
new_heads = {}
for i in self.heads:
segment_size = (len(self.heads[i]) + self.num_worker - 1) // self.num_worker
start_idx = self.worker_id * segment_size
end_idx = start_idx + segment_size
if end_idx > len(self.heads[i]):
# Create a list that wraps around to the beginning
segment = (
self.heads[i][start_idx:]
+ self.heads[i][: end_idx - len(self.heads[i])]
)
else:
segment = self.heads[i][start_idx:end_idx]
new_heads[i] = segment
self.heads = new_heads
# print("worker:",self.worker_id,self.process_id,[len(self.heads[i]) for i in self.heads],self.weights)
def get_head(self):
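        """Infinite generator over transcript line indices, interleaved by head.

        For every (head, weight) pair it yields `weight` consecutive indices
        from that head before moving on, so each macro-batch keeps a fixed mix
        of the length buckets. When a head is exhausted its cursor resets and
        the head is reshuffled.
        """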
# self.get_process_heads()
self.get_worker_heads()
# print("weights:",self.weights,[h for h in self.heads])
self.indices = [0] * len(self.heads)
# self.process_heads = [{i:self.heads[i][self.process_id:]}for i in self.heads]
while True:
for (
n,
(head, weight),
) in enumerate(zip(self.heads, self.weights)):
# if process_id == 0:
# print(weight,head)
for i in range(weight):
if self.indices[n] < len(self.heads[head]):
# print(self.heads[head][self.indices[n]],worker_id,self.indices)
yield self.heads[head][self.indices[n]]
self.indices[n] += 1
else:
self.indices[n] = 0
random.shuffle(self.heads[head])
# shuffle the indices
def __getitem__(self, index) -> Any:
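        """Build one training sample.

        In scale mode the incoming index is ignored: the next index is drawn
        from the weighted head iterator and the corresponding transcript line
        is read from the mmapped file via the precomputed byte offsets. Samples
        with fewer than 25 semantic tokens, or whose reference clips cannot be
        loaded, are skipped by recursing into the next index.
        """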
if self.iterator is None:
self.iterator = self.get_head()
if not self.scale:
lang, path, semb, text = self.data[index]
ref_mels = self.ref_mels[path][: self.ref_k]
else:
# line = read_specific_line(self.transcript_path,index+1)
index = next(self.iterator)
# print(self.worker_id,self.process_id,index)
self.mmapped_file.seek(self.line_index[index])
line = self.mmapped_file.readline().decode("utf-8")
lang, path, text, semb_ids, ref_mels = line.split("|")
# a=5/0
# semb_ids = [int(i)+1 for i in semb_ids.split()]
semb = semb_ids.split()
ref_mels = [i.split(",") for i in ref_mels.split("\t")][: self.ref_k]
if len(semb) < 25:
if index + 1 < self.data_len:
return self.__getitem__(index + 1)
return self.__getitem__(0)
if len(ref_mels) == 0:
ref_mels.append((path, 1))
ref_mels.append((path, 1))
ref_mels.append((path, 1))
while len(ref_mels) < self.ref_k:
ref_mels.append(ref_mels[-1])
text = text.lower().strip()
# try:
text_ids = [text_enc["<S>"]] + [text_enc[i] for i in text] + [text_enc["<E>"]]
semb_ids = (
[code_enc["<SST>"]] + [code_enc[i] for i in semb] + [code_enc["<EST>"]]
)
# except Exception as e:
# print(e)
# print(lang,path,text,index)
# exit
# input_ids = text_ids+semb_ids
# pad_length = config.t2s_position-(len(text_ids)+len(semb_ids))
# token_type_ids = [0]*len(text_ids)+[1]*len(semb_ids)+[0]*pad_length
# positional_ids = [i for i in range(len(text_ids))]+[i for i in range(len(semb_ids))]+[0]*pad_length
# labels = [-100]*len(text_ids)+semb_ids+[-100]*pad_length
# attention_mask = [1]*len(input_ids)+[0]*pad_length
# input_ids += [tok_enc['<PAD>']]*pad_length
def get_random_portion(mel, mask_lengths):
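            # Crop each reference mel to CLIP_LENGTH frames: clips that are
            # already <= CLIP_LENGTH keep their (padded) prefix, longer clips
            # get a random CLIP_LENGTH-frame window.
            # mel: (ref_k, n_mel_channels, T); mask_lengths: true frame counts.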
clip = mask_lengths <= CLIP_LENGTH
ref_mel = mel[:, :, :CLIP_LENGTH].clone()
for n, z in enumerate(clip):
if not z:
start = np.random.randint(0, mask_lengths[n].item() - CLIP_LENGTH)
ref_mel[n, :, :] = mel[n, :, start : start + CLIP_LENGTH].clone()
return ref_mel
try:
ref_mels = [self.get_mel(path)[0] for path, score in ref_mels]
except Exception as e:
print(index, e)
if index + 1 < self.data_len:
return self.__getitem__(index + 1)
return self.__getitem__(0)
ref_c = []
for i in range(self.ref_k):
if ref_mels[i] is None:
continue
ref_c.append(ref_mels[i])
if len(ref_c) == 0:
# print('no refs worthy')
if index + 1 < self.data_len:
return self.__getitem__(index + 1)
return self.__getitem__(0)
if len(ref_c) != self.ref_k:
# print('less refs found',len(ref_c))
while len(ref_c) < self.ref_k:
ref_c.append(ref_c[-1])
ref_mels = ref_c
max_target_len = max([x.size(1) for x in ref_mels])
ref_mels_padded = (
torch.randn((self.ref_k, config.n_mel_channels, max_target_len))
) * 1e-9
mel_length = []
for i, mel in enumerate(ref_mels):
ref_mels_padded[i, :, : mel.size(1)] = mel
mel_length.append(mel.shape[-1])
ref_mels = get_random_portion(ref_mels_padded, torch.tensor(mel_length))
return {
"text_ids": text_ids,
"semb_ids": semb_ids,
"ref_mels": ref_mels,
"lang": torch.tensor(config.lang_index[lang]),
}
# def get_padded_seq(sequences):
# max_len=max([len(s) for s in sequences])
# for i in range(len(sequences)):
# sequences[i]=sequences[i]+tok_enc['<PAD>']*(max_len-len(sequences[i]))
# return sequences
def get_padded_seq(sequences, pad_random, before=False, pad__=0):
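    """Pad a list of variable-length sequences to the batch maximum.

    Returns (padded_sequences, original_lengths). Padding is either the constant
    token `pad__` or, with pad_random=True, tiny random floats (presumably for
    float-valued sequences); before=True left-pads instead of right-padding.
    """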
max_len = max([len(s) for s in sequences])
seq_len = []
for i in range(len(sequences)):
seq_len.append(len(sequences[i]))
if pad_random:
            pad_ = list((np.random.rand(max_len - len(sequences[i]))) * 1e-9)
else:
pad_ = [pad__] * (max_len - len(sequences[i]))
if not before:
sequences[i] = sequences[i] + pad_
else:
sequences[i] = pad_ + sequences[i]
return sequences, seq_len
def collate(batch):
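    """Collate dataset samples into padded batch tensors.

    Text ids are padded with <E>, semantic codes with <EST>, and the stacked
    reference mels with near-zero noise up to the longest clip in the batch.
    Returns (text_ids, code, text_len, code_len, ref_mels_padded, langs).
    """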
text_ids = []
semb_ids = []
# paths=[]
ref_mels = []
langs = []
# ref_mels_length=[]
for b in batch:
text_ids.append(b["text_ids"])
semb_ids.append(b["semb_ids"])
# paths.append(b['path'])
ref_mels.append(b["ref_mels"])
langs.append(b["lang"])
# ref_mels_length.append(b['ref_mel_length'])
text_ids, text_len = get_padded_seq(
text_ids, pad_random=False, before=False, pad__=text_enc["<E>"]
)
code, code_len = get_padded_seq(semb_ids, pad_random=False, pad__=code_enc["<EST>"])
ref_max_target_len = max([x.size(-1) for x in ref_mels])
ref_mels_padded = (
torch.randn(
(
len(batch),
ref_mels[0].shape[0],
config.n_mel_channels,
ref_max_target_len,
)
)
) * 1e-9
for i, mel in enumerate(ref_mels):
ref_mels_padded[i, :, :, : mel.size(-1)] = mel
# print(mel_padded.shape,torch.tensor(code).shape,torch.tensor(mel_length),get_mask_from_lengths(torch.tensor(mel_length)))
return (
torch.tensor(text_ids),
torch.tensor(code),
torch.tensor(text_len),
torch.tensor(code_len),
ref_mels_padded,
torch.tensor(langs),
)
def get_dataset(transcript_path, get_process_id, total_processes):
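    """Build the streaming (scale=True) dataset for one process of a
    multi-process run (process_id / total_processes as provided by the
    launcher, e.g. accelerate)."""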
return semantic_dataset_batch(
transcript_path,
scale=True,
process_id=get_process_id,
total_processes=total_processes,
)
if __name__ == "__main__":
accelerator = Accelerator(
gradient_accumulation_steps=config.ts_gradient_accumulation_steps
) # ,kwargs_handlers=[ddp_kwargs]) mixed_precision="fp16",
get_process_id = accelerator.process_index
total_processes = accelerator.num_processes
# train_dataset_ = semantic_dataset_batch(config.data_path+'/transcript_train_20s_final_normalized_filtered.txt','../'+config.data_path+'/semt.txt','../'+config.data_path+'/ref_clips.pkl',
# scale=True,process_id=get_process_id,total_processes = total_processes)
# train_dataset_ = semantic_dataset_batch(config.data_path+'/transcript_train_20s_final_normalized_filtered.txt','../'+config.data_path+'/semt.txt','../'+config.data_path+'/ref_clips.pkl',
# scale=True,process_id=get_process_id,total_processes = total_processes)
# train_dataset_ = semantic_dataset_batch(config.data_path+'/transcript_train_20s_final_normalized_filtered.txt','../'+config.data_path+'/semt.txt','../'+config.data_path+'/ref_clips.pkl',
# scale=True,process_id=get_process_id,total_processes = total_processes)
train_dataset_ = semantic_dataset_batch(
config.data_path + "/transcript_train_20s_final_normalized_filtered.txt",
"../" + config.data_path + "/semt.txt",
"../" + config.data_path + "/ref_clips.pkl",
scale=True,
process_id=get_process_id,
total_processes=total_processes,
)
# sampler = WeightedRandomSampler(
# train_dataset_.weights,
# train_dataset_.count,
# replacement=False)
train_dataset = DataLoader(
train_dataset_,
pin_memory=True,
persistent_workers=True,
num_workers=config.ts_num_workers,
batch_size=config.ts_batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate,
sampler=None,
)
print("batch", config.ts_batch_size)
# val_dataset = DataLoader(semantic_dataset_batch(config.data_path+'/transcript_test_20_final_normalized.txt','../'+config.data_path+'/semt.txt','../'+config.data_path+'/ref_clips.pkl',scale=True,process_id=get_process_id,total_processes = total_processes),pin_memory=True,
# persistent_workers=True,num_workers=2,batch_size=config.ts_batch_size,shuffle=True,drop_last=False,collate_fn=collate)
train_dataloader = accelerator.prepare(train_dataset)
# if accelerator.is_local_main_process:
# from IPython import embed
# embed()
    # checking that the sampler is working as expected
import math
from collections import defaultdict
def calculate_duration(code_len):
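        # Approximate clip duration in seconds from the semantic-code length,
        # rounded up to the nearest 0.5 s. The /50 factor assumes 50 semantic
        # tokens per second of audio (an inference from the code, not a spec).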
return math.ceil(((code_len + 1) / 50) * 2) / 2
sampling = defaultdict(int)
dataset = []
batch_data = {}
batch = 0
batch_data[batch] = defaultdict(int)
for n, data in enumerate(tqdm(train_dataloader)):
# break
text_ids, code, text_len, code_len, ref_clips, langs = data
# print(text_ids)
# print('=====')
# # break
for i, j in zip(code_len, text_ids):
dur = calculate_duration(i - 2)
# print(dur,i,code.shape)
# sampling[calculate_duration(i)]+=1
dataset.append(list(j.detach().cpu().numpy()))
if dur > 19.5:
batch_data[batch]["20_sentence"] += 1
continue
if dur <= 5:
batch_data[batch]["5s"] += 1
continue
elif dur <= 10:
batch_data[batch]["10s"] += 1
continue
elif dur <= 15:
batch_data[batch]["15s"] += 1
continue
elif dur <= 20:
batch_data[batch]["20s"] += 1
continue
# print(batch)
if (n + 1) % config.ts_gradient_accumulation_steps == 0:
batch += 1
batch_data[batch] = defaultdict(int)
# break
# if n==20:
# break
# # print(sampling)
with open(
f"Sampling_data_meta/sampling_{accelerator.process_index}.pkl", "wb"
) as file:
pkl.dump(batch_data, file)
with open(
f"Sampling_data_meta/sampling_dataset_{accelerator.process_index}.pkl", "wb"
) as file:
pkl.dump(dataset, file)
print(batch_data[0])
# # # return 0