DatamadeA1 committed on
Commit
7abebba
·
verified ·
1 Parent(s): 6c2ca48

Upload docker-compose.yml with huggingface_hub

Browse files
Files changed (1) hide show
  1. docker-compose.yml +150 -27
docker-compose.yml CHANGED
@@ -4,43 +4,166 @@ services:
4
  azimuth-training:
5
  image: nvcr.io/nvidia/pytorch:24.01-py3
6
  container_name: azimuth-training
7
- runtime: nvidia
8
  environment:
9
  - NVIDIA_VISIBLE_DEVICES=all
10
  volumes:
11
  - /workspace:/workspace
12
  working_dir: /workspace
13
  command: |
14
- bash -c "
15
- echo '============================================================'
16
- echo ' AZIMUTH CONVERSATIONAL TRAINING'
17
- echo '============================================================'
 
18
 
19
- apt-get update && apt-get install -y git
 
20
 
21
- git clone https://github.com/datamade-autobot/azimuth.git /workspace/azimuth || (cd /workspace/azimuth && git pull)
22
- cd /workspace/azimuth
23
-
24
- pip install datasets transformers einops tqdm
25
-
26
- mkdir -p /workspace/data/conversational_corpus /workspace/checkpoints
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- python scripts/download_conversational_hf_data.py --output-dir /workspace/data/conversational_corpus --all
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- python scripts/train_azimuth_conversational.py \
31
- --steps 100000 \
32
- --batch-size 64 \
33
- --seq-len 1024 \
34
- --d-model 1024 \
35
- --n-layers 24 \
36
- --n-heads 16 \
37
- --lr 2e-4 \
38
- --device cuda \
39
- --eval-every 2000 \
40
- --checkpoint-every 5000 \
41
- --data-path /workspace/data/conversational_corpus/azimuth_conversational_binary.pt \
42
- 2>&1 | tee /workspace/checkpoints/training.log
43
- "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  deploy:
45
  resources:
46
  reservations:
 
4
  azimuth-training:
5
  image: nvcr.io/nvidia/pytorch:24.01-py3
6
  container_name: azimuth-training
 
7
  environment:
8
  - NVIDIA_VISIBLE_DEVICES=all
9
  volumes:
10
  - /workspace:/workspace
11
  working_dir: /workspace
12
  command: |
13
+ bash -ec '
+ # -e aborts the whole script as soon as any step fails, so a failed install or data download never reaches the training run
14
+ echo "============================================================"
15
+ echo " AZIMUTH CONVERSATIONAL TRAINING"
16
+ # The dollar sign is doubled so that Compose passes a literal one through to bash instead of treating it as variable interpolation
+ echo " GPU: $$(nvidia-smi --query-gpu=name --format=csv,noheader)"
17
+ echo "============================================================"
18
 
19
+ # torch is intentionally not listed: the NVIDIA PyTorch base image already provides it
+ pip install datasets transformers einops tqdm
20
+ mkdir -p /workspace/data /workspace/checkpoints
21
 
22
+ # Download and convert conversational data from HuggingFace
23
+ python -c "
24
+ import torch
25
+ from datasets import load_dataset
26
+ from pathlib import Path
27
+
28
+ print(\"Downloading conversational datasets from HuggingFace...\")
29
+ samples = []
30
+
31
+ # OpenAssistant
32
+ print(\" Loading OpenAssistant/oasst1...\")
33
+ ds = load_dataset(\"OpenAssistant/oasst1\", split=\"train\")
34
+ for s in list(ds)[:20000]:
35
+ text = s.get(\"text\", \"\")
36
+ if len(text) > 50:
37
+ b = list(text.encode(\"utf-8\"))
38
+ if len(b) > 10:
39
+ samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})
40
+
41
+ # Alpaca
42
+ print(\" Loading tatsu-lab/alpaca...\")
43
+ ds = load_dataset(\"tatsu-lab/alpaca\", split=\"train\")
44
+ for s in list(ds)[:30000]:
45
+ # str.format rather than an f-string: reusing the double quote inside f-string fields is a SyntaxError before Python 3.12, and single quotes would end the surrounding bash string
+ text = \"User: {}\\nAssistant: {}\".format(s.get(\"instruction\", \"\"), s.get(\"output\", \"\"))
46
+ if len(text) > 50:
47
+ b = list(text.encode(\"utf-8\"))
48
+ samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})
49
+
50
+ print(f\"Total samples: {len(samples)}\")
51
+ torch.save(samples, \"/workspace/data/train.pt\")
52
+ print(\"Saved to /workspace/data/train.pt\")
53
+ "
54
 
55
+ # Training script
56
+ python -c "
57
+ import torch
58
+ import torch.nn as nn
59
+ import torch.nn.functional as F
60
+ import random
61
+ import time
62
+
63
+ print(\"============================================================\")
64
+ print(\" TRAINING AZIMUTH (Binary-Native Transformer)\")
65
+ print(\"============================================================\")
66
+
67
+ class AzimuthModel(nn.Module):
68
+ def __init__(self, d_model=1024, n_layers=24, n_heads=16, max_seq=1024):
69
+ super().__init__()
70
+ self.emb = nn.Embedding(256, d_model)
71
+ self.pos = nn.Embedding(max_seq, d_model)
72
+ layer = nn.TransformerEncoderLayer(d_model, n_heads, d_model*4, dropout=0.1, batch_first=True, norm_first=True)
73
+ self.transformer = nn.TransformerEncoder(layer, n_layers)
74
+ self.head = nn.Linear(d_model, 256)
75
+ self.d_model = d_model
76
 
77
+ def forward(self, x):
78
+ B, T = x.shape
79
+ pos = torch.arange(T, device=x.device)
80
+ h = self.emb(x) + self.pos(pos)
81
+ mask = nn.Transformer.generate_square_subsequent_mask(T, device=x.device)
82
+ h = self.transformer(h, mask=mask, is_causal=True)
83
+ return self.head(h)
84
+
85
+ # Load data
86
+ data = torch.load(\"/workspace/data/train.pt\")
87
+ print(f\"Loaded {len(data)} samples\")
88
+
89
+ # Create model
90
+ model = AzimuthModel(d_model=1024, n_layers=24, n_heads=16).cuda()
91
+ params = sum(p.numel() for p in model.parameters())
92
+ print(f\"Model: {params:,} parameters\")
93
+
94
+ opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
95
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100000)
96
+
97
+ # Training loop
98
+ STEPS = 100000
99
+ BATCH = 8
100
+ SEQ_LEN = 512
101
+
102
+ print(f\"Training for {STEPS} steps...\")
103
+ print(\"-\" * 60)
104
+
105
+ start = time.time()
106
+ for step in range(STEPS):
107
+ # Get batch
108
+ batch_x, batch_y = [], []
109
+ for _ in range(BATCH):
110
+ s = random.choice(data)
111
+ x = s[\"input_ids\"][:SEQ_LEN]
112
+ y = s[\"labels\"][:SEQ_LEN]
113
+ if len(x) < SEQ_LEN:
114
+ x = F.pad(x, (0, SEQ_LEN - len(x)))
115
+ y = F.pad(y, (0, SEQ_LEN - len(y)))
116
+ batch_x.append(x)
117
+ batch_y.append(y)
118
+
119
+ x = torch.stack(batch_x).cuda()
120
+ y = torch.stack(batch_y).cuda()
121
+
122
+ # Forward
123
+ logits = model(x)
124
+ loss = F.cross_entropy(logits.view(-1, 256), y.view(-1), ignore_index=0)
125
+
126
+ # Backward
127
+ opt.zero_grad()
128
+ loss.backward()
129
+ torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
130
+ opt.step()
131
+ scheduler.step()
132
+
133
+ # Log
134
+ if step % 100 == 0:
135
+ elapsed = time.time() - start
136
+ eta = elapsed / (step + 1) * (STEPS - step) / 60
137
+ print(f\"Step {step:6d}/{STEPS} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.2e} | ETA: {eta:.0f}m\")
138
+
139
+ # Checkpoint
140
+ if step > 0 and step % 5000 == 0:
141
+ torch.save({\"step\": step, \"model\": model.state_dict()}, f\"/workspace/checkpoints/step_{step}.pt\")
142
+ print(f\" Saved checkpoint: step_{step}.pt\")
143
+
144
+ # Generation sample
145
+ if step > 0 and step % 2000 == 0:
146
+ model.eval()
147
+ prompt = \"User: Hello!\\nAssistant:\"
148
+ x = torch.tensor([list(prompt.encode())], device=\"cuda\")
149
+ with torch.no_grad():
150
+ for _ in range(50):
151
+ logits = model(x[:, -512:])
152
+ probs = F.softmax(logits[0, -1] / 0.8, dim=-1)
153
+ next_byte = torch.multinomial(probs, 1)
154
+ x = torch.cat([x, next_byte.unsqueeze(0)], dim=1)
155
+ if next_byte.item() == ord(\"\\n\"): break
156
+ response = bytes(x[0].tolist()).decode(\"utf-8\", errors=\"replace\")
157
+ print(f\" Sample: {response[len(prompt):80]}...\")
158
+ model.train()
159
+
160
+ # Save final
161
+ torch.save({\"step\": STEPS, \"model\": model.state_dict()}, \"/workspace/checkpoints/final.pt\")
162
+ print(\"\\n\" + \"=\" * 60)
163
+ print(\"TRAINING COMPLETE!\")
164
+ print(f\"Final checkpoint: /workspace/checkpoints/final.pt\")
165
+ "
166
+ '
167
  deploy:
168
  resources:
169
  reservations: