Spaces:

CS5647Team3
/

Mandarin_Tone_Evaluation

Sleeping

App Files Community

sssssy committed on Nov 24, 2023

Commit

e96d01f

1 Parent(s): d2902aa

Update dataset.py

Browse files

Files changed (1) hide show

dataset.py +10 -82

dataset.py CHANGED Viewed

@@ -41,7 +41,6 @@ def read_content(filepath):
             for i in range(1, len(tmp)):
                 if len(tmp[i]) == 0:
                     continue
-                # need blank space or not?
                 if i % 2 == 0:
                     pinyin += tmp[i] + ' '
                     tones += tmp[i][-1] + ' '
@@ -73,7 +72,7 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
             if len(tmp) != 0:
                 durations[tmp[0]] = float(tmp[1])
-    audio_path = os.path.join(filepath, 'wav')#这里要删掉
     indexes = []
     for root, dirs, files in os.walk(audio_path):
         for f in files:
@@ -82,7 +81,6 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
                 index = f[0:len(f)-4]
                 filepath = os.path.join(audio_path, index[0:len(index)-4], f)
                 word, py, tone = features[index]
-                # du = librosa.get_duration(filename=filepath)
                 du = durations[index]
                 indexes.append((index, filepath, word, py, tone, du))
@@ -90,62 +88,8 @@ def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
     print('#wav file read:', count)
     print('read dataset index time: ', end_time - start_time)
-    '''indexes = sorted(indexes, key=lambda x: x[0])
-    with open('./durations.txt', 'w') as f:
-        for i in indexes:
-            f.write(i[0]+ ' ' + str(i[5]) + '\n')'''
-    return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])
-def read_dataset_index(filepath='/kaggle/input/paddle-speech/AISHELL-3/train'):
-    '''
-    get all audio files' index and file paths
-    read content.txt to get corresponding words, pinyin, tones, duration
-    return dataframe:
-    ['index', 'filepath', 'word', 'pinyin', 'tone', 'duration']
-    5 tones in total, 5 represents neutral tone
-    '''
-    features = read_content(os.path.join(filepath, 'content.txt'))
-    start_time = time.time()
-    count = 0
-    durations = {}
-    with open('/kaggle/input/durations/durations.txt', 'r') as f:
-        lines = f.readlines()
-        for l in lines:
-            tmp = (l.replace('\n', '')).split(' ')
-            if len(tmp) != 0:
-                durations[tmp[0]] = float(tmp[1])
-    audio_path = os.path.join(filepath, 'wav')#这里要删掉
-    indexes = []
-    for root, dirs, files in os.walk(audio_path):
-        for f in files:
-            if f.endswith('.wav'):
-                count += 1
-                index = f[0:len(f)-4]
-                filepath = os.path.join(audio_path, index[0:len(index)-4], f)
-                word, py, tone = features[index]
-                # du = librosa.get_duration(filename=filepath)
-                du = durations[index]
-                indexes.append((index, filepath, word, py, tone, du))
-    end_time = time.time()
-    print('#wav file read:', count)
-    print('read dataset index time: ', end_time - start_time)
-    '''indexes = sorted(indexes, key=lambda x: x[0])
-    with open('./durations.txt', 'w') as f:
-        for i in indexes:
-            f.write(i[0]+ ' ' + str(i[5]) + '\n')'''
     return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])
 def collate_fn(batch):
     inp = []
     f0 = []
@@ -194,7 +138,7 @@ def get_data_loader(split, args):
 class MyDataset(Dataset):
     def __init__(self, dataset_root, split, sampling_rate, sample_length, frame_size):
         self.dataset_root = dataset_root
-        self.split = split # train or test
         self.sampling_rate = sampling_rate
         self.sample_length = sample_length
         self.frame_size = frame_size
@@ -202,7 +146,7 @@ class MyDataset(Dataset):
         # self.annotations = get_annotations(get_all_file_names(os.path.join(self.dataset_root, 'AISHELL-3', split)), level='word')
-        self.dataset_index = read_dataset_index(os.path.join(self.dataset_root, 'AISHELL-3', split)) # maybe can be removed
         self.duration = {}
         self.index = self.index_data()
@@ -223,15 +167,11 @@ class MyDataset(Dataset):
         go through self.dataset_index to get duration and then calculate
         '''
-        # duration already in dataset_index
-        # TODO
-        # pass
         index = []
         for indexs, row in self.dataset_index.iterrows():
             duration = row['duration']
             num_seg = math.ceil(duration / self.sample_length)
             for i in range(num_seg):
-                # index.append([row['index'], i * self.sample_length])
                 index.append([indexs, i * self.sample_length])
             self.duration[row['index']] = row['duration']
@@ -249,13 +189,9 @@ class MyDataset(Dataset):
         '''
         audio_fn, start_sec = self.index[idx]
         end_sec = start_sec + self.sample_length
-        # print(start_sec, end_sec)
-        #???
         audio_fp = self.dataset_index.loc[audio_fn,'filepath']
-        # audio_fp = jpath('./dataset/AISHELL-3/train/wav/SSB0005/SSB0005',audio_fp,'.wav')
-        #/kaggle/input/paddle-speech/AISHELL-3/train/wav/SSB0005/SSB00050001.wav
-        # TODO: calculate mel spectrogram
         mel = None
         #load data from file
         waveform, sample_rate = torchaudio.load(audio_fp)
@@ -264,18 +200,16 @@ class MyDataset(Dataset):
         mel_spec = torch.mean(mel_spec,0)
         # print(mel_spec.shape)
-        # TODO: calculate fundamental frequency
         f0 = None
         waveform, sr = librosa.load(audio_fp, sr=self.sampling_rate)
         f0 = torch.from_numpy(librosa.yin(waveform, fmin=50, fmax=550, hop_length=100))
-        # get labels???
         # word_roll, tone_roll = self.get_labels(self.annotations[self.dataset_index.loc[audio_fn, 'index']], self.dataset_index.loc[audio_fn,'duration'])
         words = self.dataset_index.loc[audio_fn, 'pinyin']
         w = words.split(' ')
         word_roll = []
-        for i in range(0, len(w)):
             if len(w[i]) != 0:
                 if self.pinyin.get(w[i][0:-1]) == None:
                     self.pinyin[w[i][0:-1]] = len(self.pinyin)
@@ -289,14 +223,12 @@ class MyDataset(Dataset):
         spectrogram_clip = None
         f0_clip = None
-        onset_clip = None
-        offset_clip = None
         word_clip = None
         tone_clip = None
-        # TODO: create clips
         start_frame = int(start_sec * self.frame_per_sec)
-        end_frame = start_frame + 1600 #int(end_sec * self.frame_per_sec)
         # print(start_frame, end_frame)
         spectrogram_clip = mel_spec[:, start_frame:end_frame].T
         f0_clip = f0[start_sec:end_sec]
@@ -304,7 +236,6 @@ class MyDataset(Dataset):
         #tone_clip = tone_roll[start_frame:end_frame]
         # print(tone_roll)
-        #return spectrogram_clip, f0_clip, onset_clip, offset_clip, pinyin_clip, tone_clip
         return spectrogram_clip, f0_clip, torch.Tensor(word_roll), torch.Tensor(tone_roll) #word_clip, tone_clip
     def get_labels(self, annotation_data, duration):
@@ -312,14 +243,11 @@ class MyDataset(Dataset):
         This function read annotation from file, and then convert annotation from note-level to frame-level
         Because we will be using frame-level labels in training.
         '''
-        # TODO
-        # pass
         frame_num = math.ceil(duration * self.frame_per_sec)
         word_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
         tone_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
-        # f0_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
-        # mel_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
         for note in annotation_data:
             start_time, end_time, mark = note  # Assuming annotation format: (start_time, end_time, pitch)
@@ -333,7 +261,7 @@ class MyDataset(Dataset):
             #print(start_frame, end_frame)
             # WORD LEVEL Mark the frames corresponding to the note
-            word_roll[start_frame:end_frame+1] = self.pinyin[mark[:-1]] #mark[:-1]
             tone_roll[start_frame:end_frame+1] = int(mark[-1])
         # print(tone_roll)
         return word_roll, tone_roll

             for i in range(1, len(tmp)):
                 if len(tmp[i]) == 0:
                     continue
                 if i % 2 == 0:
                     pinyin += tmp[i] + ' '
                     tones += tmp[i][-1] + ' '
             if len(tmp) != 0:
                 durations[tmp[0]] = float(tmp[1])
+    audio_path = os.path.join(filepath, 'wav')
     indexes = []
     for root, dirs, files in os.walk(audio_path):
         for f in files:
                 index = f[0:len(f)-4]
                 filepath = os.path.join(audio_path, index[0:len(index)-4], f)
                 word, py, tone = features[index]
                 du = durations[index]
                 indexes.append((index, filepath, word, py, tone, du))
     print('#wav file read:', count)
     print('read dataset index time: ', end_time - start_time)
     return pd.DataFrame.from_records(indexes, columns=['index', 'filepath', 'word', 'pinyin', 'tone', 'duration'])
 def collate_fn(batch):
     inp = []
     f0 = []
 class MyDataset(Dataset):
     def __init__(self, dataset_root, split, sampling_rate, sample_length, frame_size):
         self.dataset_root = dataset_root
+        self.split = split
         self.sampling_rate = sampling_rate
         self.sample_length = sample_length
         self.frame_size = frame_size
         # self.annotations = get_annotations(get_all_file_names(os.path.join(self.dataset_root, 'AISHELL-3', split)), level='word')
+        self.dataset_index = read_dataset_index(os.path.join(self.dataset_root, 'AISHELL-3', split))
         self.duration = {}
         self.index = self.index_data()
         go through self.dataset_index to get duration and then calculate
         '''
         index = []
         for indexs, row in self.dataset_index.iterrows():
             duration = row['duration']
             num_seg = math.ceil(duration / self.sample_length)
             for i in range(num_seg):
                 index.append([indexs, i * self.sample_length])
             self.duration[row['index']] = row['duration']
         '''
         audio_fn, start_sec = self.index[idx]
         end_sec = start_sec + self.sample_length
         audio_fp = self.dataset_index.loc[audio_fn,'filepath']
         mel = None
         #load data from file
         waveform, sample_rate = torchaudio.load(audio_fp)
         mel_spec = torch.mean(mel_spec,0)
         # print(mel_spec.shape)
+        # calculate fundamental frequency
         f0 = None
         waveform, sr = librosa.load(audio_fp, sr=self.sampling_rate)
         f0 = torch.from_numpy(librosa.yin(waveform, fmin=50, fmax=550, hop_length=100))
         # word_roll, tone_roll = self.get_labels(self.annotations[self.dataset_index.loc[audio_fn, 'index']], self.dataset_index.loc[audio_fn,'duration'])
         words = self.dataset_index.loc[audio_fn, 'pinyin']
         w = words.split(' ')
         word_roll = []
+        for i in range(0, len(w)):
             if len(w[i]) != 0:
                 if self.pinyin.get(w[i][0:-1]) == None:
                     self.pinyin[w[i][0:-1]] = len(self.pinyin)
         spectrogram_clip = None
         f0_clip = None
         word_clip = None
         tone_clip = None
+        # create clips
         start_frame = int(start_sec * self.frame_per_sec)
+        end_frame = start_frame + 1600
         # print(start_frame, end_frame)
         spectrogram_clip = mel_spec[:, start_frame:end_frame].T
         f0_clip = f0[start_sec:end_sec]
         #tone_clip = tone_roll[start_frame:end_frame]
         # print(tone_roll)
         return spectrogram_clip, f0_clip, torch.Tensor(word_roll), torch.Tensor(tone_roll) #word_clip, tone_clip
     def get_labels(self, annotation_data, duration):
         This function read annotation from file, and then convert annotation from note-level to frame-level
         Because we will be using frame-level labels in training.
         '''
         frame_num = math.ceil(duration * self.frame_per_sec)
         word_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
         tone_roll = torch.zeros(size=(frame_num + 1,), dtype=torch.long)
         for note in annotation_data:
             start_time, end_time, mark = note  # Assuming annotation format: (start_time, end_time, pitch)
             #print(start_frame, end_frame)
             # WORD LEVEL Mark the frames corresponding to the note
+            word_roll[start_frame:end_frame+1] = self.pinyin[mark[:-1]]
             tone_roll[start_frame:end_frame+1] = int(mark[-1])
         # print(tone_roll)
         return word_roll, tone_roll