Upload main.ipynb with huggingface_hub
main.ipynb  +46 -14
CHANGED
@@ -521,7 +521,13 @@
  " return self.final_norm(x)\n",
  "\n",
  " def forward(self, z_t: torch.Tensor, t: torch.Tensor) -> torch.Tensor:\n",
- " \"\"\"
+ " \"\"\"Forward pass returning hidden states [B, L, D].\n",
+ " Used by DataParallel \u2014 logit projection done outside for memory efficiency.\n",
+ " For full logits (sampling), use forward_full().\"\"\"\n",
+ " return self.forward_hidden(z_t, t)\n",
+ "\n",
+ " def forward_full(self, z_t: torch.Tensor, t: torch.Tensor) -> torch.Tensor:\n",
+ " \"\"\"Full forward pass returning logits [B, L, V]. Used for sampling.\"\"\"\n",
  " hidden = self.forward_hidden(z_t, t)\n",
  " logits = self.output_proj(hidden)\n",
  " logits[:, :, self.config.mask_token_id] = -1e9\n",
@@ -600,7 +606,7 @@
  " alpha_next = self.noise_schedule.alpha(t_next)\n",
  "\n",
  " t_batch = torch.full((batch_size,), t_now.item(), device=device)\n",
- " logits = self.
+ " logits = self.forward_full(x, t_batch)\n",
  " probs = F.softmax(logits / temperature, dim=-1)\n",
  "\n",
  " unmask_prob = ((alpha_next - alpha_now) / (1.0 - alpha_now + 1e-8)).clamp(0, 1)\n",
@@ -616,7 +622,7 @@
  " is_masked = (x == self.config.mask_token_id)\n",
  " if is_masked.any():\n",
  " t_batch = torch.full((batch_size,), 1e-5, device=device)\n",
- " logits = self.
+ " logits = self.forward_full(x, t_batch)\n",
  " probs = F.softmax(logits / temperature, dim=-1)\n",
  " flat_probs = probs.reshape(-1, self.config.vocab_size)\n",
  " sampled = torch.multinomial(flat_probs, 1).reshape(batch_size, seq_len)\n",
@@ -633,12 +639,12 @@
  "print(f\"Unique parameters (weight tying): {unique_params / 1e6:.1f}M\")\n",
  "\n",
  "# Multi-GPU support (Kaggle T4 x2)\n",
+ "model_unwrapped = model\n",
  "if torch.cuda.device_count() > 1:\n",
  " print(f\"\\nUsing {torch.cuda.device_count()} GPUs with DataParallel!\")\n",
- "
- " model_unwrapped = model.module\n",
+ " model_dp = nn.DataParallel(model)\n",
  "else:\n",
- "
+ " model_dp = model\n",
  "\n",
  "# Quick memory test\n",
  "with torch.no_grad():\n",
@@ -952,17 +958,17 @@
  "\n",
  "\n",
  "@torch.no_grad()\n",
- "def generate_samples(
+ "def generate_samples(mdl, tokenizer, num_samples=4, seq_len=128, temperature=0.8):\n",
  " \"\"\"Generate and print text samples.\"\"\"\n",
- "
- " tokens =
+ " mdl.eval()\n",
+ " tokens = mdl.sample(num_samples, seq_len, temperature=temperature)\n",
  " texts = []\n",
  " for i in range(num_samples):\n",
  " text = tokenizer.decode(tokens[i].cpu().tolist(), skip_special_tokens=True)\n",
  " texts.append(text)\n",
  " print(f\"\\n--- Sample {i+1} ---\")\n",
  " print(text[:500])\n",
- "
+ " mdl.train()\n",
  " return texts\n",
  "\n",
  "\n",
@@ -1867,12 +1873,38 @@
  " tokens_processed += batch.numel()\n",
  "\n",
  " with autocast('cuda', dtype=torch.float16):\n",
- "
- "
+ " # Noise + mask on this batch\n",
+ " B, L = batch.shape\n",
+ " t = model_unwrapped.noise_schedule.sample_t(B, batch.device)\n",
+ " z_t, mask = model_unwrapped.noise_schedule.forward_process(batch, t, config.mask_token_id)\n",
+ "\n",
+ " # Forward pass through DataParallel (this splits across GPUs)\n",
+ " hidden = model_dp(z_t, t) # [B, L, D] \u2014 uses forward_hidden via DataParallel\n",
+ "\n",
+ " # Loss computation (cheap, single GPU is fine)\n",
+ " masked_hidden = hidden[mask]\n",
+ " masked_targets = batch[mask]\n",
+ "\n",
+ " if masked_hidden.shape[0] > 0:\n",
+ " masked_logits = F.linear(masked_hidden, model_unwrapped.output_proj.weight)\n",
+ " masked_logits[:, config.mask_token_id] = -1e9\n",
+ " ce_loss = F.cross_entropy(masked_logits, masked_targets, reduction='none')\n",
+ " weight = model_unwrapped.noise_schedule.loss_weight(t)\n",
+ " weight_expanded = weight[:, None].expand(B, L)[mask]\n",
+ " result_loss = (ce_loss * weight_expanded).mean()\n",
+ "\n",
+ " with torch.no_grad():\n",
+ " preds = masked_logits.argmax(dim=-1)\n",
+ " result_acc = (preds == masked_targets).float().mean().item()\n",
+ " else:\n",
+ " result_loss = torch.tensor(0.0, device=batch.device)\n",
+ " result_acc = 1.0\n",
+ "\n",
+ " loss = result_loss / config.grad_accum_steps\n",
  "\n",
  " scaler.scale(loss).backward()\n",
- " step_loss +=
- " step_acc +=
+ " step_loss += result_loss.item() / config.grad_accum_steps\n",
+ " step_acc += result_acc / config.grad_accum_steps\n",
  "\n",
  " # Gradient clipping and optimizer step\n",
  " scaler.unscale_(optimizer)\n",
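For orientation only, and not part of the committed notebook: a minimal sketch of the pattern this change adopts, using a toy TinyDenoiser stand-in for the real model. forward returns hidden states so nn.DataParallel gathers only [B, L, D] tensors, while the vocabulary projection and masked cross-entropy run once outside the replicated forward; forward_full keeps the logits path for sampling. All names below are placeholders, not the notebook's actual classes.

# Hypothetical sketch; runs on CPU or GPU, no dependency on the notebook.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyDenoiser(nn.Module):
    def __init__(self, vocab_size=100, d_model=32, mask_token_id=0):
        super().__init__()
        self.mask_token_id = mask_token_id
        self.embed = nn.Embedding(vocab_size, d_model)
        self.output_proj = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, z_t, t):
        # Hidden states only [B, L, D]: cheap for DataParallel to gather.
        return self.embed(z_t) + t[:, None, None]

    def forward_full(self, z_t, t):
        # Full logits [B, L, V] for sampling; mask token suppressed.
        logits = self.output_proj(self.forward(z_t, t))
        logits[:, :, self.mask_token_id] = -1e9
        return logits

model = TinyDenoiser()
model_dp = nn.DataParallel(model) if torch.cuda.device_count() > 1 else model

z_t = torch.randint(1, 100, (4, 16))          # noised tokens
t = torch.rand(4)                             # diffusion timesteps
targets = torch.randint(1, 100, (4, 16))      # clean tokens
mask = torch.rand(4, 16) < 0.5                # positions replaced by [MASK]

hidden = model_dp(z_t, t)                     # replicated across GPUs if available
masked_logits = F.linear(hidden[mask], model.output_proj.weight)
loss = F.cross_entropy(masked_logits, targets[mask])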