Zolisa committed
Commit 09ca3d4 · verified · 1 Parent(s): 52b1c1a

Upload folder using huggingface_hub

utils/__pycache__/data_loader.cpython-312.pyc ADDED
Binary file (4.49 kB)
 
utils/data_loader.py ADDED
@@ -0,0 +1,77 @@
+ import torch
+ from torchvision import datasets, transforms
+ from torch.utils.data import DataLoader
+
+ def get_mnist_loaders(batch_size=64):
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize((0.1307,), (0.3081,))
+     ])
+
+     train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
+     test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
+
+     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+     return train_loader, test_loader
+
+ def get_fashion_mnist_loaders(batch_size=64):
+     transform = transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize((0.5,), (0.5,))
+     ])
+
+     train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
+     test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
+
+     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+     return train_loader, test_loader
+
+ def get_imdb_loaders(batch_size=64, max_len=256, vocab_size=10000):
+     from torchtext.datasets import IMDB
+     from torchtext.data.utils import get_tokenizer
+     from torchtext.vocab import build_vocab_from_iterator
+     from torch.utils.data import DataLoader, Dataset
+     import torch.nn.utils.rnn as rnn_utils
+
+     tokenizer = get_tokenizer("basic_english")
+     train_iter = IMDB(split='train')
+
+     def yield_tokens(data_iter):
+         for _, text in data_iter:
+             yield tokenizer(text)
+
+     vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"], max_tokens=vocab_size)
+     vocab.set_default_index(vocab["<unk>"])
+
+     def text_pipeline(text):
+         return vocab(tokenizer(text))
+
+     class IMDBDataset(Dataset):
+         def __init__(self, split):
+             self.data = list(IMDB(split=split))
+             self.max_len = max_len
+
+         def __len__(self):
+             return len(self.data)
+
+         def __getitem__(self, idx):
+             label, text = self.data[idx]
+             # Convert label: 1 (neg), 2 (pos) -> 0, 1
+             label = 0 if label == 1 else 1
+             tokens = text_pipeline(text)[:self.max_len]
+             # Pad shorter sequences up to max_len
+             if len(tokens) < self.max_len:
+                 tokens += [vocab["<pad>"]] * (self.max_len - len(tokens))
+             return torch.tensor(tokens), torch.tensor(label)
+
+     train_dataset = IMDBDataset('train')
+     test_dataset = IMDBDataset('test')
+
+     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+     return train_loader, test_loader, len(vocab)
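
For context, a minimal usage sketch of the uploaded loaders (not part of the commit): it assumes the repository root is on PYTHONPATH so the file imports as utils.data_loader, and that torch, torchvision, and torchtext are installed. The shapes in the comments follow from the transforms and max_len defaults above.

    from utils.data_loader import get_mnist_loaders, get_imdb_loaders

    # Vision loaders yield (images, labels) batches of normalized tensors
    train_loader, test_loader = get_mnist_loaders(batch_size=128)
    images, labels = next(iter(train_loader))
    print(images.shape, labels.shape)  # torch.Size([128, 1, 28, 28]) torch.Size([128])

    # The IMDB loader also returns the vocabulary size, e.g. to size an nn.Embedding
    train_loader, test_loader, n_vocab = get_imdb_loaders(batch_size=32, max_len=256)
    tokens, labels = next(iter(train_loader))
    print(tokens.shape, n_vocab)  # torch.Size([32, 256]) and the vocab size

Note the design choice in IMDBDataset: every sample is truncated or padded to a fixed max_len, so the default collate stacks batches without a custom collate_fn; the rnn_utils import would only be needed for an alternative dynamic-length scheme built on pad_sequence.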