ivanleomk committed on
Commit
98d3156
·
verified ·
1 Parent(s): 67b7a1b

Upload train_mnist.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_mnist.py +166 -0
train_mnist.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Links:
2
+ # W&B Run: https://wandb.ai/ivanleo97-freelance/mnist-modal/runs/tu4yqtvi
3
+ # Hugging Face Model: https://huggingface.co/ivanleomk/mnist-modal
4
+
5
+ import modal
6
+
7
app = modal.App("mnist-training")

# Container image for the remote training function: Debian slim plus the
# third-party packages the function imports (torch, torchvision, wandb,
# huggingface_hub). Those packages are imported inside the function body,
# so they do not need to be installed on the machine that launches the app.
_TRAINING_PACKAGES = ("torch", "torchvision", "wandb", "huggingface_hub")
image = modal.Image.debian_slim().pip_install(*_TRAINING_PACKAGES)
15
+
16
@app.function(
    image=image,
    gpu="A100",
    timeout=3600,
    # NOTE(review): the values below are placeholders. Prefer secrets stored
    # in Modal (e.g. ``modal.Secret.from_name(...)``) so that real credentials
    # never have to be pasted into this file.
    secrets=[
        modal.Secret.from_dict({"WANDB_API_KEY": "YOUR_WANDB_API_KEY"}),
        modal.Secret.from_dict({"HF_TOKEN": "YOUR_HF_TOKEN"}),
    ],
)
def train() -> dict:
    """Train a small CNN on MNIST, log metrics to W&B, and upload the weights.

    The function:
      1. opens a W&B run in project ``mnist-modal``,
      2. trains ``Net`` with Adadelta for the configured number of epochs,
         evaluating on the MNIST test split after every epoch,
      3. saves the ``state_dict`` to ``/tmp/mnist_model.pth``,
      4. uploads that file (best effort) to the Hugging Face model repo
         ``<username>/mnist-modal``, where ``<username>`` is resolved from
         the ``HF_TOKEN`` environment variable.

    Returns:
        ``{"accuracy": <float>}`` -- test accuracy in percent after the last
        epoch.

    Raises:
        Any exception raised during training propagates to the caller; the
        W&B run is still closed in that case. Upload failures are caught and
        only printed.
    """
    # Heavy dependencies are imported here so that they only have to exist
    # inside the remote image, not on the launching machine.
    import os

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    import wandb
    from huggingface_hub import HfApi
    from torchvision import datasets, transforms

    class Net(nn.Module):
        """Two conv layers followed by two fully connected layers."""

        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(1, 32, 3, 1)
            self.conv2 = nn.Conv2d(32, 64, 3, 1)
            self.dropout1 = nn.Dropout(0.25)
            self.dropout2 = nn.Dropout(0.5)
            # 28x28 input -> 26x26 -> 24x24 after the two 3x3 convs,
            # -> 12x12 after 2x2 max-pooling; 64 * 12 * 12 = 9216.
            self.fc1 = nn.Linear(9216, 128)
            self.fc2 = nn.Linear(128, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2)
            x = self.dropout1(x)
            x = torch.flatten(x, 1)
            x = F.relu(self.fc1(x))
            x = self.dropout2(x)
            x = self.fc2(x)
            # Log-probabilities, to pair with F.nll_loss below.
            return F.log_softmax(x, dim=1)

    model_path = "/tmp/mnist_model.pth"

    # Using the run as a context manager guarantees the W&B run is closed
    # even when training raises, instead of only on the success path.
    with wandb.init(
        project="mnist-modal",
        config={
            "learning_rate": 1.0,
            "epochs": 5,
            "batch_size": 64,
        },
    ) as run:
        config = run.config

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

        model = Net().to(device)
        # Using Adadelta as in the standard PyTorch MNIST example.
        optimizer = optim.Adadelta(model.parameters(), lr=config.learning_rate)

        transform = transforms.Compose([
            transforms.ToTensor(),
            # Normalisation constants as used in the original script
            # (presumably the MNIST training-set mean/std).
            transforms.Normalize((0.1307,), (0.3081,)),
        ])

        print("Downloading dataset...")
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('/tmp/data', train=True, download=True, transform=transform),
            batch_size=config.batch_size,
            shuffle=True,
        )

        # The test split lives under the same root that was just downloaded.
        # Evaluation sums over the whole split, so shuffling adds nothing.
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('/tmp/data', train=False, transform=transform),
            batch_size=1000,
            shuffle=False,
        )

        print("Starting training...")
        for epoch in range(1, config.epochs + 1):
            model.train()
            train_loss = 0.0
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

                if batch_idx % 100 == 0:
                    print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} "
                          f"({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

            # Mean of the per-batch losses for this epoch.
            train_loss /= len(train_loader)

            # Test
            model.eval()
            test_loss = 0.0
            correct = 0
            with torch.no_grad():
                for data, target in test_loader:
                    data, target = data.to(device), target.to(device)
                    output = model(data)
                    # Summed here, divided by the dataset size below.
                    test_loss += F.nll_loss(output, target, reduction='sum').item()
                    pred = output.argmax(dim=1, keepdim=True)
                    correct += pred.eq(target.view_as(pred)).sum().item()

            test_loss /= len(test_loader.dataset)
            accuracy = 100. * correct / len(test_loader.dataset)

            print(f"\nEpoch {epoch} summary: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)\n")

            wandb.log({
                "epoch": epoch,
                "train_loss": train_loss,
                "test_loss": test_loss,
                "accuracy": accuracy,
            })

        print(f"Final test accuracy: {accuracy:.2f}%")

        # Save model
        torch.save(model.state_dict(), model_path)
        print(f"Model saved to {model_path}")

        # Upload to HF. This is deliberately best effort: a failed upload is
        # reported but does not fail the training run.
        try:
            hf_token = os.environ["HF_TOKEN"]
            api = HfApi()
            username = api.whoami(token=hf_token)["name"]
            repo_id = f"{username}/mnist-modal"

            print(f"Uploading model to Hugging Face repo: {repo_id}")

            # exist_ok=True already makes this a no-op for an existing repo,
            # so any error raised here is a real one and is left to the
            # handler below rather than being swallowed separately.
            api.create_repo(repo_id=repo_id, repo_type="model", token=hf_token, exist_ok=True)

            api.upload_file(
                path_or_fileobj=model_path,
                path_in_repo="mnist_model.pth",
                repo_id=repo_id,
                repo_type="model",
                token=hf_token,
            )
            print("Model uploaded successfully!")
        except Exception as e:
            print(f"Failed to upload to HF: {e}")

    return {"accuracy": accuracy}
162
+
163
@app.local_entrypoint()
def main():
    """Local entrypoint of the Modal app: invoke ``train`` remotely."""
    # The value returned by the remote call is not used here.
    train.remote()
166
+