Spaces:
Sleeping
Sleeping
| import torchaudio | |
class TextTransform:
    """Two-way lookup between characters and integer labels.

    The table covers the apostrophe, the space (written ``<SPACE>`` in the
    table) and the lowercase letters ``a`` to ``z``, numbered 0 through 27.
    """

    def __init__(self):
        """Parse the embedded table into ``char_map`` and ``index_map``."""
        char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
        self.char_map = {}
        self.index_map = {}
        for entry in char_map_str.strip().split('\n'):
            symbol, number = entry.split()
            label = int(number)
            self.char_map[symbol] = label
            self.index_map[label] = symbol
        # Decode label 1 straight to a real space instead of the placeholder.
        self.index_map[1] = ' '

    def text_to_int(self, text):
        """Return the integer label of every character in ``text`` as a list.

        A literal space is looked up under the ``<SPACE>`` key; any character
        missing from the table raises ``KeyError``.
        """
        return [
            self.char_map['<SPACE>'] if c == ' ' else self.char_map[c]
            for c in text
        ]

    def int_to_text(self, labels):
        """Return the string spelled by the integer ``labels``.

        Labels missing from the table raise ``KeyError``.
        """
        pieces = [self.index_map[i] for i in labels]
        return ''.join(pieces).replace('<SPACE>', ' ')
| from torch import nn | |
# Audio pipeline stored as ``trainaudio_transforms``: a mel spectrogram
# followed by frequency masking and then time masking, applied in that order.
trainaudio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

# Module-level character <-> integer label converter.
text_transform = TextTransform()
| import torch.nn.functional as F | |
class CNNLayerNorm(nn.Module):
    """Layer normalization applied with dimensions 2 and 3 swapped.

    ``nn.LayerNorm`` normalizes the trailing dimension, so the input's
    dimensions 2 and 3 are exchanged before normalizing and exchanged back
    afterwards.
    """

    def __init__(self, n_feats):
        """Create the wrapped ``nn.LayerNorm`` with normalized size ``n_feats``."""
        super().__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        """Normalize ``x`` and return a contiguous tensor of the same shape.

        Assumes ``x`` is 4-D with size ``n_feats`` along dimension 2
        (presumably batch, channel, feature, time) -- TODO confirm with callers.
        """
        swapped = x.transpose(2, 3).contiguous()
        normalized = self.layer_norm(swapped)
        return normalized.transpose(2, 3).contiguous()
class ResidualCNN(nn.Module):
    """Residual block built from two normalized, GELU-activated convolutions.

    Each convolution is preceded by ``CNNLayerNorm`` -> dropout -> GELU, and
    the block's input is added in place to the second convolution's output.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        """Build the block's layers.

        Args:
            in_channels: Input channels of the first convolution.
            out_channels: Output channels of both convolutions.
            kernel: Integer kernel size; both convolutions pad by ``kernel // 2``.
            stride: Stride given to both convolutions.
            dropout: Probability used by both dropout layers.
            n_feats: Size passed to both ``CNNLayerNorm`` layers.
        """
        super().__init__()
        pad = kernel // 2
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=pad)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=pad)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layernorm1 = CNNLayerNorm(n_feats)
        self.layernorm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        """Run both norm -> dropout -> GELU -> conv stages, then add the input.

        The in-place addition needs the input to be shape-compatible with the
        second convolution's output.
        """
        skip = x
        out = self.cnn1(F.gelu(self.dropout1(self.layernorm1(x))))
        out = self.cnn2(F.gelu(self.dropout2(self.layernorm2(out))))
        out += skip
        return out
class BiDirectionalGRU(nn.Module):
    """Layer norm and GELU feeding a one-layer bidirectional GRU, then dropout."""

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        """Build the layers.

        Args:
            rnn_dim: Input feature size of the GRU and of the layer norm.
            hidden_size: Hidden size of each GRU direction.
            dropout: Probability for the dropout applied to the GRU output.
            batch_first: Forwarded unchanged to ``nn.GRU``.
        """
        super().__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=batch_first,
            bidirectional=True,
        )
        self.layernorm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the dropout-regularized GRU outputs for ``x``.

        The GRU's final hidden state is discarded.
        """
        activated = F.gelu(self.layernorm(x))
        outputs, _ = self.BiGRU(activated)
        return self.dropout(outputs)
class SpeechRecognitionModel(nn.Module):
    """Convolutional front end, bidirectional GRU stack and per-step classifier.

    Pipeline: one strided ``Conv2d`` -> ``n_cnn_layers`` ``ResidualCNN`` blocks
    -> linear projection to ``rnn_dim`` -> ``n_rnn_layers``
    ``BiDirectionalGRU`` layers -> two-layer classifier producing ``n_class``
    scores per step.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        """Build the model.

        Args:
            n_cnn_layers: Number of ``ResidualCNN`` blocks.
            n_rnn_layers: Number of ``BiDirectionalGRU`` layers.
            rnn_dim: Hidden size of each GRU direction and of the projection.
            n_class: Number of output classes.
            n_feats: Size of the input along dimension 2 (the axis the layer
                norms and the projection are sized from).
            stride: Stride of the first convolution (int, or a pair whose
                first entry applies to dimension 2).
            dropout: Dropout probability used throughout.
        """
        super().__init__()
        # Size of dimension 2 after the first convolution (kernel 3, padding 1):
        # floor((n_feats + 2 - 3) / stride) + 1. For even n_feats with stride 2
        # this is n_feats // 2; unlike a hard-coded halving it also holds for
        # odd n_feats and for other strides.
        feat_stride = stride if isinstance(stride, int) else stride[0]
        n_feats = (n_feats - 1) // feat_stride + 1
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # NOTE(review): only the first GRU layer is created with
        # batch_first=True, while every layer receives the tensor in the same
        # layout, so later layers read dimension 0 as the sequence axis.
        # Left unchanged because altering it would change the outputs of
        # already-trained weights -- confirm whether this is intended.
        self.birnn_layers = nn.Sequential(*[
            BiDirectionalGRU(
                rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                hidden_size=rnn_dim,
                dropout=dropout,
                batch_first=i == 0,
            )
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class),
        )

    def forward(self, x):
        """Return per-step class scores for a 4-D input ``x``.

        ``x`` must have one channel in dimension 1 (the first convolution has
        ``in_channels=1``); presumably it is (batch, 1, feature, time) --
        TODO confirm with callers.
        """
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        # Merge dimensions 1 and 2 (channels x features) into one axis.
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        # Move the merged axis last so the linear layer projects it.
        x = x.transpose(1, 2)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
| import torch | |
| import os | |
| from pathlib import Path | |
# Optimisation settings, also copied into ``hparams`` below.
learning_rate = 5e-4
batch_size = 16
epochs = 5

# Dataset split identifiers (presumably LibriSpeech subset names -- verify
# against the data-loading code).
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

# Model and training hyperparameters gathered in one dict.
hparams = {
    # Model architecture.
    "n_cnn_layers": 3,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": 29,  # one more than the 28-entry character table; presumably a blank label -- TODO confirm
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.1,
    # Training schedule.
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs,
}