Spaces:

Jangai
/

Antigravity

Paused

App Files Files Community

AdriBat1 committed on Jan 3

Commit

6ec2818

1 Parent(s): 938275a

Add DeepSeek-Lite Protocol: 50M params, FineWeb-Edu, TikToken, BFloat16

Browse files

Files changed (4) hide show

remote-gpu-client/examples/deepseek_lite.py +377 -0
remote-gpu-client/examples/inference_tower.py +189 -0
remote-gpu-client/run_deepseek_lite.py +47 -0
remote-gpu-server/requirements.txt +3 -3

remote-gpu-client/examples/deepseek_lite.py ADDED Viewed

	@@ -0,0 +1,377 @@

+"""
+DeepSeek-Lite Protocol: Production LLM Comparison
+Baseline (GPT-2 style) vs mHC (DeepSeek-V3 style)
+Dataset: FineWeb-Edu | Tokenizer: TikToken | ~50M params each
+"""
+import sys
+import traceback
+import os
+import time
+import math
+print("🔬 DeepSeek-Lite Protocol: Production LLM Training")
+try:
+    import torch
+    import torch.nn as nn
+    from torch.nn import functional as F
+    import tiktoken
+    from datasets import load_dataset
+    import matplotlib.pyplot as plt
+    print("🔹 Imports successful")
+    # === CONFIGURATION ===
+    # Hyper-parameters and paths shared by both models trained in this script.
+    EXPERIMENT_NAME = "deepseek_lite_v1"
+    # Model Architecture
+    d_model = 384       # Model dimension
+    n_heads = 6         # Attention heads (64 dim per head)
+    n_layers = 30       # Deep & Narrow
+    context_len = 1024  # Full context window
+    vocab_size = 50257  # GPT-2 vocab
+    dropout = 0.1
+    # Training
+    batch_size = 4            # Per-step batch (small for memory)
+    grad_accum_steps = 32     # Effective batch = 4 * 32 = 128
+    max_lr = 6e-4
+    min_lr = max_lr * 0.1
+    warmup_steps = 200
+    total_steps = 5000
+    eval_interval = 100
+    # Hardware
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # Only probe bf16 support when a CUDA device is present: on CPU-only hosts
+    # the probe can raise (older PyTorch initialises CUDA inside it).
+    if device == 'cuda' and torch.cuda.is_bf16_supported():
+        dtype = torch.bfloat16
+    else:
+        dtype = torch.float16
+    print(f"🔹 Device: {device}, Dtype: {dtype}")
+    # Storage
+    storage_dir = f"/home/user/app/storage/{EXPERIMENT_NAME}"
+    os.makedirs(storage_dir, exist_ok=True)
+    # Chunk config (resumable): number of optimizer steps run per invocation
+    CHUNK_STEPS = 25
+    # === TOKENIZER ===
+    print("📚 Loading TikToken (GPT-4 encoding)...")
+    enc = tiktoken.get_encoding("cl100k_base")  # GPT-4 compatible
+    # cl100k_base produces token ids up to enc.n_vocab - 1, which is far larger
+    # than the GPT-2 vocabulary size configured earlier. Size the embedding table
+    # and LM head from the tokenizer so no token id indexes out of range.
+    vocab_size = enc.n_vocab
+    # === DATASET (Streaming) ===
+    print("📦 Loading FineWeb-Edu (streaming)...")
+    # trust_remote_code is intentionally not passed: it allows code from the
+    # dataset repository to execute locally and is not needed to stream this
+    # dataset.
+    # NOTE(review): the stream is re-opened from its start on every invocation
+    # of this script, so each resumed chunk presumably begins with the same
+    # documents — confirm whether that is acceptable.
+    dataset_iter = iter(load_dataset(
+        "HuggingFaceFW/fineweb-edu",
+        "sample-10BT",
+        split="train",
+        streaming=True,
+    ))
+    # Token buffer for batching
+    token_buffer = []
+    def refill_buffer(min_tokens=100000):
+        """Pull documents from the stream until the buffer holds min_tokens tokens.
+
+        Stops early, without raising, when the dataset iterator is exhausted.
+        """
+        global token_buffer
+        while len(token_buffer) < min_tokens:
+            try:
+                item = next(dataset_iter)
+            except StopIteration:
+                break
+            # encode_ordinary treats special-token strings (e.g. "<|endoftext|>")
+            # found in web text as plain text; encode() would raise ValueError
+            # on them and abort the whole run.
+            token_buffer.extend(enc.encode_ordinary(item['text']))
+    def get_batch():
+        """Return one (inputs, targets) batch and remove its tokens from the buffer.
+
+        Each row is a window of context_len + 1 consecutive tokens; inputs are
+        the first context_len tokens and targets are the same window shifted
+        by one. The used tokens are dropped so later batches draw fresh data
+        from the stream instead of re-sampling the same buffer indefinitely.
+
+        Raises:
+            RuntimeError: if the stream cannot supply enough tokens for a batch.
+        """
+        global token_buffer
+        refill_buffer()
+        window = context_len + 1
+        needed = window * batch_size
+        if len(token_buffer) < needed:
+            raise RuntimeError("Not enough tokens in buffer")
+        rows = torch.tensor(token_buffer[:needed], dtype=torch.long).view(batch_size, window)
+        # Consume the tokens so refill_buffer() keeps advancing through the dataset.
+        del token_buffer[:needed]
+        # The shifted slices are strided views; make them contiguous because
+        # the loss computation flattens the targets with .view(-1).
+        x = rows[:, :-1].contiguous()
+        y = rows[:, 1:].contiguous()
+        return x.to(device), y.to(device)
+    # === MODEL COMPONENTS ===
+    class RMSNorm(nn.Module):
+        """Scale inputs by the reciprocal root-mean-square of their last dimension.
+
+        A learnable per-feature gain (`weight`, initialised to ones) is applied
+        after normalisation; `eps` is added to the mean square before the rsqrt.
+        """
+        def __init__(self, dim, eps=1e-6):
+            super().__init__()
+            self.weight = nn.Parameter(torch.ones(dim))
+            self.eps = eps
+        def forward(self, x):
+            mean_square = x.pow(2).mean(-1, keepdim=True)
+            inv_rms = torch.rsqrt(mean_square + self.eps)
+            return x * inv_rms * self.weight
+    class CausalSelfAttention(nn.Module):
+        """Multi-head self-attention restricted by a lower-triangular mask.
+
+        Queries, keys and values come from one fused bias-free projection;
+        dropout is applied to the attention weights only.
+        """
+        def __init__(self, d_model, n_heads):
+            super().__init__()
+            self.n_heads = n_heads
+            self.head_dim = d_model // n_heads
+            self.c_attn = nn.Linear(d_model, 3 * d_model, bias=False)
+            self.c_proj = nn.Linear(d_model, d_model, bias=False)
+            self.dropout = nn.Dropout(dropout)
+            lower_triangle = torch.tril(torch.ones(context_len, context_len))
+            self.register_buffer("mask", lower_triangle)
+        def forward(self, x):
+            B, T, C = x.shape
+            def split_heads(t):
+                # (B, T, C) -> (B, n_heads, T, head_dim)
+                return t.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
+            q, k, v = map(split_heads, self.c_attn(x).chunk(3, dim=-1))
+            scores = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
+            # Zero entries of the mask are blocked before the softmax.
+            blocked = self.mask[:T, :T] == 0
+            scores = scores.masked_fill(blocked, float('-inf'))
+            weights = self.dropout(F.softmax(scores, dim=-1))
+            merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
+            return self.c_proj(merged)
+    class MLP(nn.Module):
+        """Position-wise feed-forward net: expand 4x, GELU, project back, dropout."""
+        def __init__(self, d_model):
+            super().__init__()
+            hidden = 4 * d_model
+            self.c_fc = nn.Linear(d_model, hidden, bias=False)
+            self.gelu = nn.GELU()
+            self.c_proj = nn.Linear(hidden, d_model, bias=False)
+            self.dropout = nn.Dropout(dropout)
+        def forward(self, x):
+            h = self.c_fc(x)
+            h = self.gelu(h)
+            h = self.c_proj(h)
+            return self.dropout(h)
+    # === BLOCK VARIANTS ===
+    class BlockBaseline(nn.Module):
+        """Pre-norm transformer block: LayerNorm feeds each sub-layer, whose
+        output is added back onto the unnormalised residual stream."""
+        def __init__(self, d_model, n_heads):
+            super().__init__()
+            self.ln1 = nn.LayerNorm(d_model)
+            self.attn = CausalSelfAttention(d_model, n_heads)
+            self.ln2 = nn.LayerNorm(d_model)
+            self.mlp = MLP(d_model)
+        def forward(self, x):
+            attn_out = self.attn(self.ln1(x))
+            x = x + attn_out
+            mlp_out = self.mlp(self.ln2(x))
+            return x + mlp_out
+    class BlockMHC(nn.Module):
+        """"mHC" block variant: each sub-layer output is blended with its input
+        through learnable scalars (alpha * x + beta * sublayer(x)) and the blend
+        is then RMS-normalised. Sub-layers receive the un-normalised input."""
+        def __init__(self, d_model, n_heads):
+            super().__init__()
+            self.attn = CausalSelfAttention(d_model, n_heads)
+            self.mlp = MLP(d_model)
+            self.ln1 = RMSNorm(d_model)
+            self.ln2 = RMSNorm(d_model)
+            # Learnable mixing coefficients, initialised to favour the input path
+            self.alpha1 = nn.Parameter(torch.tensor(0.9))
+            self.beta1 = nn.Parameter(torch.tensor(0.1))
+            self.alpha2 = nn.Parameter(torch.tensor(0.9))
+            self.beta2 = nn.Parameter(torch.tensor(0.1))
+        def forward(self, x):
+            # Attention sub-layer
+            blended = self.alpha1 * x + self.beta1 * self.attn(x)
+            x = self.ln1(blended)
+            # MLP sub-layer
+            blended = self.alpha2 * x + self.beta2 * self.mlp(x)
+            return self.ln2(blended)
+    # === GPT MODEL ===
+    class GPT(nn.Module):
+        """Decoder-only language model with learned token and position embeddings.
+
+        Args:
+            arch_type: 'baseline' builds BlockBaseline layers with a final
+                LayerNorm; any other value builds BlockMHC layers with a final
+                RMSNorm.
+
+        The output projection shares its weight matrix with the token embedding.
+        """
+        def __init__(self, arch_type='baseline'):
+            super().__init__()
+            self.arch_type = arch_type
+            self.wte = nn.Embedding(vocab_size, d_model)
+            self.wpe = nn.Embedding(context_len, d_model)
+            Block = BlockBaseline if arch_type == 'baseline' else BlockMHC
+            self.blocks = nn.ModuleList([Block(d_model, n_heads) for _ in range(n_layers)])
+            self.ln_f = nn.LayerNorm(d_model) if arch_type == 'baseline' else RMSNorm(d_model)
+            self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
+            # Weight tying
+            self.wte.weight = self.lm_head.weight
+            # Count params
+            n_params = sum(p.numel() for p in self.parameters())
+            print(f"🔧 {arch_type.upper()}: {n_params/1e6:.2f}M params")
+        def forward(self, idx, targets=None):
+            """Return (logits, loss); loss is None when targets is not given.
+
+            Raises:
+                ValueError: if the sequence is longer than the position table.
+            """
+            B, T = idx.shape
+            if T > context_len:
+                raise ValueError(f"Sequence length {T} exceeds context_len {context_len}")
+            tok_emb = self.wte(idx)
+            # Build positions on the input's device so the model follows its data
+            # rather than a module-level global.
+            pos_emb = self.wpe(torch.arange(T, device=idx.device))
+            x = tok_emb + pos_emb
+            for block in self.blocks:
+                x = block(x)
+            x = self.ln_f(x)
+            logits = self.lm_head(x)
+            loss = None
+            if targets is not None:
+                # Flatten using the logits' own width so the reshape always
+                # matches the head that produced them.
+                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+            return logits, loss
+    # === LR SCHEDULER ===
+    def get_lr(step):
+        """Learning rate for `step`: linear warmup to max_lr over warmup_steps,
+        then cosine decay to min_lr at total_steps, and min_lr afterwards."""
+        in_warmup = step < warmup_steps
+        if in_warmup:
+            return max_lr * (step + 1) / warmup_steps
+        past_schedule = step > total_steps
+        if past_schedule:
+            return min_lr
+        progress = (step - warmup_steps) / (total_steps - warmup_steps)
+        cosine_term = 1 + math.cos(math.pi * progress)
+        return min_lr + 0.5 * (max_lr - min_lr) * cosine_term
+    # === HISTORY ===
+    # Logged metrics for both models plus the resume pointer ('current_step').
+    history = {
+        'steps': [],
+        'loss_a': [], 'loss_b': [],
+        'ppl_a': [], 'ppl_b': [],
+        'grad_a': [], 'grad_b': []
+    }
+    current_step = 0
+    history_path = os.path.join(storage_dir, 'history.pt')
+    ckpt_a_path = os.path.join(storage_dir, 'model_a.pt')
+    ckpt_b_path = os.path.join(storage_dir, 'model_b.pt')
+    if os.path.exists(history_path):
+        print(f"🔄 Resuming from {history_path}")
+        history = torch.load(history_path)
+        # Resume from the step stored together with the checkpoints. Metrics are
+        # only logged at step 1 and every eval_interval steps, so the last logged
+        # step lags behind the real progress and would make every chunk restart
+        # from the same step. It is kept only as a fallback for history files
+        # written before 'current_step' was recorded.
+        if 'current_step' in history:
+            current_step = history['current_step']
+        elif history['steps']:
+            current_step = history['steps'][-1]
+        print(f"   Last step: {current_step}")
+    # === INIT MODELS ===
+    print("🏗️ Building models...")
+    model_a = GPT('baseline').to(device)
+    model_b = GPT('mhc').to(device)
+    opt_a = torch.optim.AdamW(model_a.parameters(), lr=max_lr, betas=(0.9, 0.95), weight_decay=0.1)
+    opt_b = torch.optim.AdamW(model_b.parameters(), lr=max_lr, betas=(0.9, 0.95), weight_decay=0.1)
+    # Load checkpoints
+    if os.path.exists(ckpt_a_path):
+        model_a.load_state_dict(torch.load(ckpt_a_path))
+        opt_a.load_state_dict(torch.load(os.path.join(storage_dir, 'opt_a.pt')))
+    if os.path.exists(ckpt_b_path):
+        model_b.load_state_dict(torch.load(ckpt_b_path))
+        opt_b.load_state_dict(torch.load(os.path.join(storage_dir, 'opt_b.pt')))
+    # === TRAINING CHUNK ===
+    print(f"🚀 Training: Steps {current_step} -> {current_step + CHUNK_STEPS} (Target: {total_steps})")
+    model_a.train()
+    model_b.train()
+    scaler = torch.cuda.amp.GradScaler()
+    start_time = time.time()
+    # Last optimizer step that actually finished; this is what gets persisted.
+    last_step = current_step
+    for step_offset in range(CHUNK_STEPS):
+        step = current_step + step_offset + 1
+        if step > total_steps:
+            break
+        # Update LR
+        lr = get_lr(step)
+        for opt in [opt_a, opt_b]:
+            for pg in opt.param_groups:
+                pg['lr'] = lr
+        # Gradient Accumulation: both models see identical micro-batches
+        loss_accum_a = 0.0
+        loss_accum_b = 0.0
+        opt_a.zero_grad()
+        opt_b.zero_grad()
+        for micro_step in range(grad_accum_steps):
+            x, y = get_batch()
+            with torch.cuda.amp.autocast(dtype=dtype):
+                _, loss_a = model_a(x, y)
+                _, loss_b = model_b(x, y)
+                # Scale so the accumulated gradient is the mean over micro-batches
+                loss_a = loss_a / grad_accum_steps
+                loss_b = loss_b / grad_accum_steps
+            loss_accum_a += loss_a.item()
+            loss_accum_b += loss_b.item()
+            scaler.scale(loss_a).backward()
+            scaler.scale(loss_b).backward()
+        # Grad clip (gradients must be unscaled before measuring their norm)
+        scaler.unscale_(opt_a)
+        scaler.unscale_(opt_b)
+        grad_norm_a = torch.nn.utils.clip_grad_norm_(model_a.parameters(), 1.0)
+        grad_norm_b = torch.nn.utils.clip_grad_norm_(model_b.parameters(), 1.0)
+        scaler.step(opt_a)
+        scaler.step(opt_b)
+        scaler.update()
+        last_step = step
+        # Log
+        if step % eval_interval == 0 or step == 1:
+            ppl_a = math.exp(loss_accum_a)
+            ppl_b = math.exp(loss_accum_b)
+            print(f"Step {step}: LossA={loss_accum_a:.4f} LossB={loss_accum_b:.4f} | "
+                  f"PPL_A={ppl_a:.2f} PPL_B={ppl_b:.2f} | "
+                  f"GradA={grad_norm_a:.2f} GradB={grad_norm_b:.2f} | LR={lr:.2e}")
+            history['steps'].append(step)
+            history['loss_a'].append(loss_accum_a)
+            history['loss_b'].append(loss_accum_b)
+            history['ppl_a'].append(ppl_a)
+            history['ppl_b'].append(ppl_b)
+            history['grad_a'].append(grad_norm_a.item())
+            history['grad_b'].append(grad_norm_b.item())
+            torch.save(history, history_path)
+            # Dashboard
+            fig, axes = plt.subplots(1, 3, figsize=(18, 5))
+            ax1 = axes[0]
+            ax1.plot(history['steps'], history['loss_a'], label='Baseline', marker='o')
+            ax1.plot(history['steps'], history['loss_b'], label='mHC', marker='x')
+            ax1.set_xlabel('Steps')
+            ax1.set_ylabel('Loss')
+            ax1.set_title('Training Loss')
+            ax1.legend()
+            ax1.grid(True)
+            ax2 = axes[1]
+            ax2.plot(history['steps'], history['ppl_a'], label='Baseline', marker='o')
+            ax2.plot(history['steps'], history['ppl_b'], label='mHC', marker='x')
+            ax2.set_xlabel('Steps')
+            ax2.set_ylabel('Perplexity')
+            ax2.set_title('Perplexity')
+            ax2.legend()
+            ax2.grid(True)
+            ax3 = axes[2]
+            ax3.plot(history['steps'], history['grad_a'], label='Baseline', color='red')
+            ax3.plot(history['steps'], history['grad_b'], label='mHC', color='green')
+            ax3.set_xlabel('Steps')
+            ax3.set_ylabel('Gradient Norm')
+            ax3.set_title('Gradient Health')
+            ax3.legend()
+            ax3.grid(True)
+            plt.tight_layout()
+            plt.savefig(os.path.join(storage_dir, 'dashboard.png'))
+            plt.close()
+    elapsed = time.time() - start_time
+    print(f"🏁 Chunk done in {elapsed:.2f}s. Step: {last_step}")
+    # Save checkpoints
+    torch.save(model_a.state_dict(), ckpt_a_path)
+    torch.save(opt_a.state_dict(), os.path.join(storage_dir, 'opt_a.pt'))
+    torch.save(model_b.state_dict(), ckpt_b_path)
+    torch.save(opt_b.state_dict(), os.path.join(storage_dir, 'opt_b.pt'))
+    # Record the true progress next to the weights so the next chunk resumes
+    # exactly where this one stopped.
+    history['current_step'] = last_step
+    torch.save(history, history_path)
+    print("💾 Saved checkpoints.")
+    if last_step < total_steps:
+        print("CONTINUE_TRAINING")
+    else:
+        print("TRAINING_COMPLETE")
+        import shutil
+        dashboard_path = os.path.join(storage_dir, 'dashboard.png')
+        # Copy into the working directory without spawning a shell; skip the
+        # copy when no dashboard was ever rendered.
+        if os.path.exists(dashboard_path):
+            shutil.copy(dashboard_path, ".")
+            print("✅ Dashboard ready for download.")
+except Exception as e:
+    print(f"\n❌ FATAL ERROR: {e}")
+    traceback.print_exc()

remote-gpu-client/examples/inference_tower.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import sys
+import traceback
+import os
+print("🔮 Tower of Babel Inference (120-Layer Models)")
+try:
+    import torch
+    import torch.nn as nn
+    from torch.nn import functional as F
+    import requests
+    # --- Config (must match training) ---
+    block_size = 256
+    n_embd = 128
+    n_head = 4
+    n_layer = 120  # Tower config!
+    dropout = 0.1
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    storage_dir = "/home/user/app/storage/tower_120L"
+    ckpt_path_a = os.path.join(storage_dir, 'ckpt_a.pt')
+    ckpt_path_b = os.path.join(storage_dir, 'ckpt_b.pt')
+    # Vocab: rebuilt from the corpus text; every distinct character is a token
+    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+    # Bound the download and fail loudly on HTTP errors; otherwise an error page
+    # would silently be turned into a wrong character vocabulary.
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    data = response.text
+    chars = sorted(list(set(data)))
+    vocab_size = len(chars)
+    stoi = { ch:i for i,ch in enumerate(chars) }
+    itos = { i:ch for i,ch in enumerate(chars) }
+    # Characters missing from the vocabulary are mapped to id 0
+    encode = lambda s: [stoi.get(c, 0) for c in s]
+    decode = lambda l: ''.join([itos[i] for i in l])
+    # --- Model Classes ---
+    class Head(nn.Module):
+        """Single causal self-attention head producing head_size features."""
+        def __init__(self, head_size):
+            super().__init__()
+            self.key = nn.Linear(n_embd, head_size, bias=False)
+            self.query = nn.Linear(n_embd, head_size, bias=False)
+            self.value = nn.Linear(n_embd, head_size, bias=False)
+            causal = torch.tril(torch.ones(block_size, block_size))
+            self.register_buffer('tril', causal)
+            self.dropout = nn.Dropout(dropout)
+        def forward(self, x):
+            B, T, C = x.shape
+            keys = self.key(x)
+            queries = self.query(x)
+            values = self.value(x)
+            # Scores are scaled by the input width C, not by head_size.
+            scores = queries @ keys.transpose(-2, -1) * C**-0.5
+            hidden = self.tril[:T, :T] == 0
+            scores = scores.masked_fill(hidden, float('-inf'))
+            attn = self.dropout(F.softmax(scores, dim=-1))
+            return attn @ values
+    class MultiHeadAttention(nn.Module):
+        """Runs several Head modules in parallel, concatenates their outputs on
+        the feature axis, then applies a linear projection and dropout."""
+        def __init__(self, num_heads, head_size):
+            super().__init__()
+            self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+            self.proj = nn.Linear(n_embd, n_embd)
+            self.dropout = nn.Dropout(dropout)
+        def forward(self, x):
+            per_head = [head(x) for head in self.heads]
+            merged = torch.cat(per_head, dim=-1)
+            projected = self.proj(merged)
+            return self.dropout(projected)
+    class FeedForward(nn.Module):
+        """Two-layer ReLU MLP with a 4x hidden expansion followed by dropout."""
+        def __init__(self, n_embd):
+            super().__init__()
+            hidden = 4 * n_embd
+            layers = [
+                nn.Linear(n_embd, hidden),
+                nn.ReLU(),
+                nn.Linear(hidden, n_embd),
+                nn.Dropout(dropout),
+            ]
+            self.net = nn.Sequential(*layers)
+        def forward(self, x):
+            return self.net(x)
+    class BlockStandard(nn.Module):
+        """Pre-LayerNorm transformer block with additive residual connections."""
+        def __init__(self, n_embd, n_head):
+            super().__init__()
+            per_head = n_embd // n_head
+            self.sa = MultiHeadAttention(n_head, per_head)
+            self.ffwd = FeedForward(n_embd)
+            self.ln1 = nn.LayerNorm(n_embd)
+            self.ln2 = nn.LayerNorm(n_embd)
+        def forward(self, x):
+            attended = self.sa(self.ln1(x))
+            x = x + attended
+            transformed = self.ffwd(self.ln2(x))
+            return x + transformed
+    class RMSNorm(nn.Module):
+        """RMS normalisation over the last dimension with a learnable gain.
+
+        The normalisation is computed in float32 and cast back to the input
+        dtype before the gain is applied.
+        """
+        def __init__(self, dim, eps=1e-6):
+            super().__init__()
+            self.weight = nn.Parameter(torch.ones(dim))
+            self.eps = eps
+        def _norm(self, x):
+            mean_square = x.pow(2).mean(-1, keepdim=True)
+            return x * torch.rsqrt(mean_square + self.eps)
+        def forward(self, x):
+            normed = self._norm(x.float())
+            return normed.type_as(x) * self.weight
+    class BlockMHC(nn.Module):
+        """"mHC" block: blends each sub-layer's output with its input through
+        learnable scalars (alpha * x + beta * sublayer(x)), then RMS-normalises."""
+        def __init__(self, n_embd, n_head):
+            super().__init__()
+            per_head = n_embd // n_head
+            self.sa = MultiHeadAttention(n_head, per_head)
+            self.ffwd = FeedForward(n_embd)
+            # Attention-path mixing scalars and norm
+            self.alpha1 = nn.Parameter(torch.tensor(0.9))
+            self.beta1 = nn.Parameter(torch.tensor(0.1))
+            self.ln1 = RMSNorm(n_embd)
+            # Feed-forward-path mixing scalars and norm
+            self.alpha2 = nn.Parameter(torch.tensor(0.9))
+            self.beta2 = nn.Parameter(torch.tensor(0.1))
+            self.ln2 = RMSNorm(n_embd)
+        def forward(self, x):
+            x = self.ln1(self.alpha1 * x + self.beta1 * self.sa(x))
+            x = self.ln2(self.alpha2 * x + self.beta2 * self.ffwd(x))
+            return x
+    class GPT(nn.Module):
+        """Character-level decoder-only model used for sampling.
+
+        Args:
+            arch_type: 'standard' (BlockStandard + LayerNorm) or 'mhc'
+                (BlockMHC + RMSNorm).
+
+        Raises:
+            ValueError: if arch_type is neither of the supported values.
+        """
+        def __init__(self, arch_type='standard'):
+            super().__init__()
+            self.arch_type = arch_type
+            self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+            self.position_embedding_table = nn.Embedding(block_size, n_embd)
+            if arch_type == 'standard':
+                self.blocks = nn.Sequential(*[BlockStandard(n_embd, n_head) for _ in range(n_layer)])
+                self.ln_f = nn.LayerNorm(n_embd)
+            elif arch_type == 'mhc':
+                self.blocks = nn.Sequential(*[BlockMHC(n_embd, n_head) for _ in range(n_layer)])
+                self.ln_f = RMSNorm(n_embd)
+            else:
+                # Fail at construction instead of with an AttributeError on the
+                # first forward pass.
+                raise ValueError(f"Unknown arch_type: {arch_type!r} (expected 'standard' or 'mhc')")
+            self.lm_head = nn.Linear(n_embd, vocab_size)
+        def forward(self, idx, targets=None):
+            """Return (logits, None); targets is accepted but not used."""
+            B, T = idx.shape
+            tok_emb = self.token_embedding_table(idx)
+            # Positions live on the same device as the input tokens.
+            pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
+            x = tok_emb + pos_emb
+            x = self.blocks(x)
+            x = self.ln_f(x)
+            logits = self.lm_head(x)
+            return logits, None
+        def generate(self, idx, max_new_tokens):
+            """Append max_new_tokens tokens to idx, sampling each from the softmax
+            over the logits at the last position."""
+            for _ in range(max_new_tokens):
+                # Crop the context to the size of the position table
+                idx_cond = idx[:, -block_size:]
+                logits, _ = self(idx_cond)
+                logits = logits[:, -1, :]
+                probs = F.softmax(logits, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1)
+                idx = torch.cat((idx, idx_next), dim=1)
+            return idx
+    # --- Load Models ---
+    def _load_model(banner, arch_type, ckpt_path):
+        # Announce, build, restore weights and switch to eval mode.
+        print(banner)
+        model = GPT(arch_type=arch_type).to(device)
+        state = torch.load(ckpt_path, map_location=device)
+        model.load_state_dict(state)
+        model.eval()
+        return model
+    model_a = _load_model("📦 Loading Model A (Standard, 120L)...", 'standard', ckpt_path_a)
+    model_b = _load_model("📦 Loading Model B (mHC, 120L)...", 'mhc', ckpt_path_b)
+    # --- Inference ---
+    PROMPT = "ROMEO:"
+    MAX_TOKENS = 400
+    print(f"\n🎭 Prompt: '{PROMPT}'")
+    print(f"🔢 Max Tokens: {MAX_TOKENS}")
+    context = torch.tensor([encode(PROMPT)], dtype=torch.long, device=device)
+    def _sample_and_print(heading, model):
+        # Print a framed heading, then the decoded sample continuing the prompt.
+        rule = "=" * 60
+        print("\n" + rule)
+        print(heading)
+        print(rule)
+        with torch.no_grad():
+            tokens = model.generate(context.clone(), max_new_tokens=MAX_TOKENS)
+        print(decode(tokens[0].tolist()))
+        return tokens
+    out_a = _sample_and_print("MODEL A (Standard GPT, 120 Layers)", model_a)
+    out_b = _sample_and_print("MODEL B (mHC GPT, 120 Layers)", model_b)
+    print("\n✅ Inference Complete.")
+except Exception as e:
+    print(f"\n❌ FATAL ERROR: {e}")
+    traceback.print_exc()

remote-gpu-client/run_deepseek_lite.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import os
+import sys
+import time
+from antigravity_sdk.client import RemoteGPU
+# Training script sent to the remote GPU; the path is relative to the current
+# working directory.
+SCRIPT_PATH = "examples/deepseek_lite.py"
+# Upper bound on the number of remote invocations of the training script.
+MAX_LOOPS = 250  # 5000 steps / 25 per chunk = 200 loops + buffer
+def main():
+    """Run the training script remotely, chunk by chunk, until it finishes.
+
+    The script's output is scanned for status markers: TRAINING_COMPLETE ends
+    the loop, CONTINUE_TRAINING triggers another run, FATAL or any unrecognised
+    output stops it. Exits with status 1 if the script file is missing.
+    """
+    if not os.path.exists(SCRIPT_PATH):
+        print(f"❌ Script not found: {SCRIPT_PATH}")
+        sys.exit(1)
+    # The training script contains non-ASCII characters (emoji); read it as
+    # UTF-8 explicitly so platforms with another default encoding do not fail
+    # or corrupt it.
+    with open(SCRIPT_PATH, 'r', encoding='utf-8') as f:
+        code = f.read()
+    print("🔬 DeepSeek-Lite Protocol: Production LLM Training")
+    print("📊 50M params | 30 Layers | FineWeb-Edu | TikToken")
+    print("-" * 50)
+    gpu = RemoteGPU()
+    for i in range(MAX_LOOPS):
+        print(f"\n🌀 Loop {i+1}/{MAX_LOOPS}...")
+        result = gpu.run(code, download_files=True, verbose=True)
+        output = result.output
+        if "TRAINING_COMPLETE" in output:
+            print("\n✅ Training Finished!")
+            break
+        elif "CONTINUE_TRAINING" in output:
+            print("⏳ Chunk complete. Resuming...")
+            time.sleep(2)
+        elif "FATAL" in output:
+            print("❌ Fatal Error. Stopping.")
+            break
+        else:
+            print("⚠️ Unknown status. Stopping safely.")
+            print(f"Last output: {output[-500:]}")
+            break
+    else:
+        # Loop budget exhausted without a terminal status; say so explicitly.
+        print(f"⚠️ Reached MAX_LOOPS ({MAX_LOOPS}) before training completed.")
+    if os.path.exists("dashboard.png"):
+        print("\n📊 Success! Saved dashboard.png")
+if __name__ == "__main__":
+    main()

remote-gpu-server/requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
-# Force Rebuild 2
 fastapi
 uvicorn
 python-multipart
 gradio
@@ -11,4 +10,5 @@ matplotlib
 seaborn
 scikit-learn
 pandas

+# Force Rebuild 3 - DeepSeek-Lite
 fastapi
 uvicorn
 python-multipart
 gradio
 seaborn
 scikit-learn
 pandas
+tiktoken
+datasets