DatamadeA1
/

azimuth-training

Model card Files Files and versions

xet

Community

DatamadeA1 committed on Jan 30

Commit

7abebba

verified ·

1 Parent(s): 6c2ca48

Upload docker-compose.yml with huggingface_hub

Browse files

Files changed (1) hide show

docker-compose.yml +150 -27

docker-compose.yml CHANGED Viewed

@@ -4,43 +4,166 @@ services:
   azimuth-training:
     image: nvcr.io/nvidia/pytorch:24.01-py3
     container_name: azimuth-training
-    runtime: nvidia
     environment:
       - NVIDIA_VISIBLE_DEVICES=all
     volumes:
       - /workspace:/workspace
     working_dir: /workspace
     command: |
-      bash -c "
-        echo '============================================================'
-        echo '  AZIMUTH CONVERSATIONAL TRAINING'
-        echo '============================================================'
-        apt-get update && apt-get install -y git
-        git clone https://github.com/datamade-autobot/azimuth.git /workspace/azimuth || (cd /workspace/azimuth && git pull)
-        cd /workspace/azimuth
-        pip install datasets transformers einops tqdm
-        mkdir -p /workspace/data/conversational_corpus /workspace/checkpoints
-        python scripts/download_conversational_hf_data.py --output-dir /workspace/data/conversational_corpus --all
-        python scripts/train_azimuth_conversational.py \
-          --steps 100000 \
-          --batch-size 64 \
-          --seq-len 1024 \
-          --d-model 1024 \
-          --n-layers 24 \
-          --n-heads 16 \
-          --lr 2e-4 \
-          --device cuda \
-          --eval-every 2000 \
-          --checkpoint-every 5000 \
-          --data-path /workspace/data/conversational_corpus/azimuth_conversational_binary.pt \
-          2>&1 | tee /workspace/checkpoints/training.log
-      "
     deploy:
       resources:
         reservations:

   azimuth-training:
     image: nvcr.io/nvidia/pytorch:24.01-py3
     container_name: azimuth-training
     environment:
       - NVIDIA_VISIBLE_DEVICES=all
     volumes:
       - /workspace:/workspace
     working_dir: /workspace
     command: |
+      bash -c '
+        # Quoting rules for this whole script: it is one single-quoted bash argument, so it
+        # must never contain a single quote. The embedded Python programs are additionally
+        # wrapped in bash double quotes, so every Python double quote stays backslash-escaped
+        # and every Python backslash is doubled.
+        # Abort on the first failing step so training never starts without freshly built data.
+        set -e
+        echo "============================================================"
+        echo "  AZIMUTH CONVERSATIONAL TRAINING"
+        # The dollar sign is doubled so Compose hands it to bash instead of interpolating it.
+        echo "  GPU: $$(nvidia-smi --query-gpu=name --format=csv,noheader)"
+        echo "============================================================"
+        pip install datasets transformers einops tqdm torch
+        mkdir -p /workspace/data /workspace/checkpoints
+        # Download and convert conversational data from HuggingFace
+        python -c "
+      # Builds byte-level next-token pairs and writes them to /workspace/data/train.pt.
+      # These lines are indented to the YAML block scalar indentation, which YAML strips,
+      # so Python still sees its top-level statements at column 0.
+      import torch
+      from datasets import load_dataset
+      from pathlib import Path
+      print(\"Downloading conversational datasets from HuggingFace...\")
+      samples = []
+      # OpenAssistant: first 20000 rows, keeping texts longer than 50 characters
+      print(\"  Loading OpenAssistant/oasst1...\")
+      ds = load_dataset(\"OpenAssistant/oasst1\", split=\"train\")
+      for s in list(ds)[:20000]:
+          text = s.get(\"text\", \"\")
+          if len(text) > 50:
+              b = list(text.encode(\"utf-8\"))
+              if len(b) > 10:
+                  # input_ids drops the last byte and labels drops the first, so labels[i]
+                  # is the byte that follows input_ids[i].
+                  samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})
+      # Alpaca: first 30000 rows rendered as a User/Assistant exchange
+      print(\"  Loading tatsu-lab/alpaca...\")
+      ds = load_dataset(\"tatsu-lab/alpaca\", split=\"train\")
+      for s in list(ds)[:30000]:
+          # Read the fields into locals first: reusing the enclosing quote character inside
+          # an f-string expression is a SyntaxError before Python 3.12.
+          instruction = s.get(\"instruction\", \"\")
+          output = s.get(\"output\", \"\")
+          text = f\"User: {instruction}\\nAssistant: {output}\"
+          if len(text) > 50:
+              b = list(text.encode(\"utf-8\"))
+              samples.append({\"input_ids\": torch.tensor(b[:-1], dtype=torch.long), \"labels\": torch.tensor(b[1:], dtype=torch.long)})
+      print(f\"Total samples: {len(samples)}\")
+      torch.save(samples, \"/workspace/data/train.pt\")
+      print(\"Saved to /workspace/data/train.pt\")
+      "
+        # Training script
+        python -c "
+      # Trains a byte-level causal transformer on /workspace/data/train.pt and writes
+      # checkpoints to /workspace/checkpoints.
+      import torch
+      import torch.nn as nn
+      import torch.nn.functional as F
+      import random
+      import time
+      print(\"============================================================\")
+      print(\"  TRAINING AZIMUTH (Binary-Native Transformer)\")
+      print(\"============================================================\")
+      class AzimuthModel(nn.Module):
+          # Byte vocabulary of 256 with learned positional embeddings, a pre-norm
+          # TransformerEncoder stack, and a linear head back to 256 logits.
+          def __init__(self, d_model=1024, n_layers=24, n_heads=16, max_seq=1024):
+              super().__init__()
+              self.emb = nn.Embedding(256, d_model)
+              self.pos = nn.Embedding(max_seq, d_model)
+              layer = nn.TransformerEncoderLayer(d_model, n_heads, d_model*4, dropout=0.1, batch_first=True, norm_first=True)
+              self.transformer = nn.TransformerEncoder(layer, n_layers)
+              self.head = nn.Linear(d_model, 256)
+              self.d_model = d_model
+          def forward(self, x):
+              # x holds byte ids with shape (B, T); T must not exceed max_seq.
+              B, T = x.shape
+              pos = torch.arange(T, device=x.device)
+              h = self.emb(x) + self.pos(pos)
+              # The square subsequent mask keeps each position from attending to later ones.
+              mask = nn.Transformer.generate_square_subsequent_mask(T, device=x.device)
+              h = self.transformer(h, mask=mask, is_causal=True)
+              return self.head(h)
+      # Hyperparameters, defined up front so the scheduler and the sampling window reuse them
+      STEPS = 100000
+      BATCH = 8
+      SEQ_LEN = 512
+      # Load data
+      data = torch.load(\"/workspace/data/train.pt\")
+      print(f\"Loaded {len(data)} samples\")
+      # Create model
+      model = AzimuthModel(d_model=1024, n_layers=24, n_heads=16).cuda()
+      params = sum(p.numel() for p in model.parameters())
+      print(f\"Model: {params:,} parameters\")
+      opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
+      # The cosine schedule spans exactly the configured number of steps.
+      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=STEPS)
+      # Training loop
+      print(f\"Training for {STEPS} steps...\")
+      print(\"-\" * 60)
+      start = time.time()
+      for step in range(STEPS):
+          # Get batch: BATCH samples drawn at random with replacement
+          batch_x, batch_y = [], []
+          for _ in range(BATCH):
+              s = random.choice(data)
+              x = s[\"input_ids\"][:SEQ_LEN]
+              y = s[\"labels\"][:SEQ_LEN]
+              if len(x) < SEQ_LEN:
+                  # Right-pad with byte 0; the loss below ignores targets equal to 0.
+                  x = F.pad(x, (0, SEQ_LEN - len(x)))
+                  y = F.pad(y, (0, SEQ_LEN - len(y)))
+              batch_x.append(x)
+              batch_y.append(y)
+          x = torch.stack(batch_x).cuda()
+          y = torch.stack(batch_y).cuda()
+          # Forward
+          logits = model(x)
+          loss = F.cross_entropy(logits.view(-1, 256), y.view(-1), ignore_index=0)
+          # Backward
+          opt.zero_grad()
+          loss.backward()
+          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+          opt.step()
+          scheduler.step()
+          # Log every 100 steps, with an ETA in minutes extrapolated from the mean step time
+          if step % 100 == 0:
+              elapsed = time.time() - start
+              eta = elapsed / (step + 1) * (STEPS - step) / 60
+              print(f\"Step {step:6d}/{STEPS} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.2e} | ETA: {eta:.0f}m\")
+          # Checkpoint
+          if step > 0 and step % 5000 == 0:
+              torch.save({\"step\": step, \"model\": model.state_dict()}, f\"/workspace/checkpoints/step_{step}.pt\")
+              print(f\"  Saved checkpoint: step_{step}.pt\")
+          # Generation sample: up to 50 bytes at temperature 0.8, stopping at a newline
+          if step > 0 and step % 2000 == 0:
+              model.eval()
+              prompt = \"User: Hello!\\nAssistant:\"
+              x = torch.tensor([list(prompt.encode())], device=\"cuda\")
+              with torch.no_grad():
+                  for _ in range(50):
+                      # Feed at most the trailing SEQ_LEN bytes of context.
+                      logits = model(x[:, -SEQ_LEN:])
+                      probs = F.softmax(logits[0, -1] / 0.8, dim=-1)
+                      next_byte = torch.multinomial(probs, 1)
+                      x = torch.cat([x, next_byte.unsqueeze(0)], dim=1)
+                      if next_byte.item() == ord(\"\\n\"): break
+              response = bytes(x[0].tolist()).decode(\"utf-8\", errors=\"replace\")
+              print(f\"  Sample: {response[len(prompt):80]}...\")
+              model.train()
+      # Save final
+      torch.save({\"step\": STEPS, \"model\": model.state_dict()}, \"/workspace/checkpoints/final.pt\")
+      print(\"\\n\" + \"=\" * 60)
+      print(\"TRAINING COMPLETE!\")
+      print(f\"Final checkpoint: /workspace/checkpoints/final.pt\")
+      "
+      '
     deploy:
       resources:
         reservations: