Upload generate_images_direct.py
generate_images_direct.py  +446 -0
ADDED
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6b7a883f-d686-4cd8-b625-7633d078f373",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n",
      "Loading VAE...\n",
      "Loading tokenizer and text encoder...\n",
      "Loading trained UNet...\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Enter your text prompt (e.g., 'A friendly dragon'): A childran in Cyberpunk\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🎨 Generating 256x256 images...\n",
      "Generating: A childran in Cyberpunk\n",
      "Text embeddings shape: torch.Size([1, 77, 768]), device: cuda:0\n",
      "Initial latents shape: torch.Size([1, 4, 32, 32]), device: cuda:0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Denoising A childran in Cyberpunk: 100%|███████████████████████████████████████████████| 50/50 [00:06<00:00, 7.63it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Final latents shape: torch.Size([1, 4, 32, 32])\n",
      "✅ Saved: output/generated_256_1_A_childran_in_Cyberpunk.png\n"
     ]
    }
   ],
   "source": [
    "import argparse\n",
    "import os\n",
    "import random\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n",
    "from diffusers import AutoencoderKL, DDPMScheduler\n",
    "from transformers import CLIPTokenizer, CLIPTextModel\n",
    "\n",
    "def seed_everything(seed=42):\n",
    "    # Seed every RNG the pipeline touches so generations are reproducible\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed(seed)\n",
    "    torch.cuda.manual_seed_all(seed)\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "\n",
    "seed_everything(42)\n",
    "\n",
    "# Sinusoidal timestep embedding for diffusion steps\n",
    "def get_timestep_embedding(timesteps, embedding_dim):\n",
    "    half_dim = embedding_dim // 2\n",
    "    emb = torch.exp(\n",
    "        torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) *\n",
    "        -(torch.log(torch.tensor(10000.0)) / half_dim)\n",
    "    )\n",
    "    emb = timesteps.float()[:, None] * emb[None, :]\n",
    "    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)\n",
    "    if embedding_dim % 2 == 1:  # Handle odd embedding dimensions\n",
    "        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=1)\n",
    "    return emb\n",
    "\n",
    "# Residual block with time and context embeddings\n",
    "class ResidualBlock(nn.Module):\n",
    "    def __init__(self, in_channels, out_channels, time_emb_dim, context_dim=None):\n",
    "        super().__init__()\n",
    "        self.norm1 = nn.GroupNorm(min(32, in_channels), in_channels)\n",
    "        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, padding=1)\n",
    "        self.norm2 = nn.GroupNorm(min(32, out_channels), out_channels)\n",
    "        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1)\n",
    "        self.time_mlp = nn.Linear(time_emb_dim, out_channels)\n",
    "        self.context_proj = nn.Linear(context_dim, out_channels) if context_dim else None\n",
    "        self.shortcut = nn.Conv2d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()\n",
    "\n",
    "    def forward(self, x, t_emb, context=None):\n",
    "        h = self.norm1(x)\n",
    "        h = F.silu(h)\n",
    "        h = self.conv1(h)\n",
    "\n",
    "        # Add time embedding\n",
    "        t_proj = self.time_mlp(t_emb)[:, :, None, None]\n",
    "        h = h + t_proj\n",
    "\n",
    "        # Add context embedding if available\n",
    "        if self.context_proj is not None and context is not None:\n",
    "            context_pooled = context.mean(dim=1)  # [batch, context_dim]\n",
    "            context_proj = self.context_proj(context_pooled)[:, :, None, None]\n",
    "            h = h + context_proj\n",
    "\n",
    "        h = self.norm2(h)\n",
    "        h = F.silu(h)\n",
    "        h = self.conv2(h)\n",
    "\n",
    "        return h + self.shortcut(x)\n",
    "\n",
    "# Cross-attention to integrate text embeddings\n",
    "class CrossAttention(nn.Module):\n",
    "    def __init__(self, channels, context_dim):\n",
    "        super().__init__()\n",
    "        self.channels = channels\n",
    "        self.query = nn.Linear(channels, channels)\n",
    "        self.key = nn.Linear(context_dim, channels)\n",
    "        self.value = nn.Linear(context_dim, channels)\n",
    "        self.out = nn.Linear(channels, channels)\n",
    "        self.norm = nn.LayerNorm(channels)\n",
    "\n",
    "    def forward(self, x, context):\n",
    "        if context is None:\n",
    "            return x\n",
    "\n",
    "        B, C, H, W = x.shape\n",
    "        x_flat = x.permute(0, 2, 3, 1).reshape(B, H * W, C)\n",
    "        x_norm = self.norm(x_flat)\n",
    "\n",
    "        q = self.query(x_norm)  # [B, H*W, C]\n",
    "        k = self.key(context)  # [B, seq_len, C]\n",
    "        v = self.value(context)  # [B, seq_len, C]\n",
    "\n",
    "        scale = (C ** -0.5)\n",
    "        attn_weights = torch.bmm(q, k.transpose(1, 2)) * scale\n",
    "        attn_weights = F.softmax(attn_weights, dim=-1)\n",
    "        attn_out = torch.bmm(attn_weights, v)\n",
    "        attn_out = self.out(attn_out)\n",
    "\n",
    "        attn_out = attn_out.reshape(B, H, W, C).permute(0, 3, 1, 2)\n",
    "        return x + attn_out\n",
    "\n",
    "# Self-attention block for image features\n",
    "class AttentionBlock(nn.Module):\n",
    "    def __init__(self, channels):\n",
    "        super().__init__()\n",
    "        self.norm = nn.GroupNorm(min(32, channels), channels)\n",
    "        self.qkv = nn.Conv2d(channels, channels * 3, 1)\n",
    "        self.proj = nn.Conv2d(channels, channels, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        B, C, H, W = x.shape\n",
    "        h = self.norm(x)\n",
    "        qkv = self.qkv(h).reshape(B, 3, C, H * W)\n",
    "        q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]\n",
    "\n",
    "        scale = (C ** -0.5)\n",
    "        attn = torch.bmm(q.transpose(1, 2), k) * scale\n",
    "        attn = F.softmax(attn, dim=-1)\n",
    "\n",
    "        out = torch.bmm(v, attn.transpose(1, 2))\n",
    "        out = out.reshape(B, C, H, W)\n",
    "        return self.proj(out) + x\n",
    "\n",
    "# U-Net that denoises 32x32 latents (the VAE decodes these to 256x256 images)\n",
    "class UNetConditional(nn.Module):\n",
    "    def __init__(self, in_channels=4, base_channels=128, context_dim=768):\n",
    "        super().__init__()\n",
    "        self.time_emb_dim = base_channels * 4\n",
    "        from types import SimpleNamespace\n",
    "        self.config = SimpleNamespace()\n",
    "        self.config._diffusers_version = \"0.34.0\"\n",
    "        self.config.in_channels = in_channels\n",
    "        self.config.out_channels = in_channels\n",
    "        self.config.sample_size = 256  # Target image size; the UNet itself runs on 32x32 latents\n",
    "        self.config.layers_per_block = 2\n",
    "        self.config.block_out_channels = [base_channels, base_channels * 2, base_channels * 4, base_channels * 8]\n",
    "        self.config.attention_head_dim = 8\n",
    "        self.config.cross_attention_dim = context_dim\n",
    "\n",
    "        # Time embedding MLP\n",
    "        self.time_mlp = nn.Sequential(\n",
    "            nn.Linear(base_channels, self.time_emb_dim),\n",
    "            nn.SiLU(),\n",
    "            nn.Linear(self.time_emb_dim, self.time_emb_dim),\n",
    "        )\n",
    "\n",
    "        # Input projection\n",
    "        self.input_conv = nn.Conv2d(in_channels, base_channels, 3, padding=1)\n",
    "\n",
    "        # Encoder\n",
    "        self.down1 = ResidualBlock(base_channels, base_channels * 2, self.time_emb_dim, context_dim)\n",
    "        self.downsample1 = nn.Conv2d(base_channels * 2, base_channels * 2, 3, stride=2, padding=1)\n",
    "        self.cross1 = CrossAttention(base_channels * 2, context_dim)\n",
    "\n",
    "        self.down2 = ResidualBlock(base_channels * 2, base_channels * 4, self.time_emb_dim, context_dim)\n",
    "        self.downsample2 = nn.Conv2d(base_channels * 4, base_channels * 4, 3, stride=2, padding=1)\n",
    "        self.cross2 = CrossAttention(base_channels * 4, context_dim)\n",
    "\n",
    "        self.down3 = ResidualBlock(base_channels * 4, base_channels * 8, self.time_emb_dim, context_dim)\n",
    "        self.downsample3 = nn.Conv2d(base_channels * 8, base_channels * 8, 3, stride=2, padding=1)\n",
    "        self.cross3 = CrossAttention(base_channels * 8, context_dim)\n",
    "\n",
    "        # Middle\n",
    "        self.middle1 = ResidualBlock(base_channels * 8, base_channels * 8, self.time_emb_dim, context_dim)\n",
    "        self.middle_attn = AttentionBlock(base_channels * 8)\n",
    "        self.middle2 = ResidualBlock(base_channels * 8, base_channels * 8, self.time_emb_dim, context_dim)\n",
    "\n",
    "        # Decoder\n",
    "        self.up3 = ResidualBlock(base_channels * 16, base_channels * 4, self.time_emb_dim, context_dim)\n",
    "        self.upsample3 = nn.ConvTranspose2d(base_channels * 4, base_channels * 4, 4, stride=2, padding=1)\n",
    "        self.cross_up3 = CrossAttention(base_channels * 4, context_dim)\n",
    "\n",
    "        self.up2 = ResidualBlock(base_channels * 8, base_channels * 2, self.time_emb_dim, context_dim)\n",
    "        self.upsample2 = nn.ConvTranspose2d(base_channels * 2, base_channels * 2, 4, stride=2, padding=1)\n",
    "        self.cross_up2 = CrossAttention(base_channels * 2, context_dim)\n",
    "\n",
    "        self.up1 = ResidualBlock(base_channels * 4, base_channels, self.time_emb_dim, context_dim)\n",
    "        self.upsample1 = nn.ConvTranspose2d(base_channels, base_channels, 4, stride=2, padding=1)\n",
    "\n",
    "        # Output\n",
    "        self.output_conv = nn.Sequential(\n",
    "            nn.GroupNorm(min(32, base_channels), base_channels),\n",
    "            nn.SiLU(),\n",
    "            nn.Conv2d(base_channels, in_channels, 3, padding=1)\n",
    "        )\n",
    "\n",
    "    def forward(self, x, t, context, cfg_scale=1.0):\n",
    "        t_emb = get_timestep_embedding(t, self.time_emb_dim // 4)\n",
    "        t_emb = self.time_mlp(t_emb)\n",
    "\n",
    "        def denoise(x, t_emb, context):\n",
    "            h = self.input_conv(x)\n",
    "\n",
    "            # Encoder\n",
    "            h1 = self.down1(h, t_emb, context)\n",
    "            h1_cross = self.cross1(h1, context)\n",
    "            h1_down = self.downsample1(h1_cross)\n",
    "\n",
    "            h2 = self.down2(h1_down, t_emb, context)\n",
    "            h2_cross = self.cross2(h2, context)\n",
    "            h2_down = self.downsample2(h2_cross)\n",
    "\n",
    "            h3 = self.down3(h2_down, t_emb, context)\n",
    "            h3_cross = self.cross3(h3, context)\n",
    "            h3_down = self.downsample3(h3_cross)\n",
    "\n",
    "            # Middle\n",
    "            h_mid = self.middle1(h3_down, t_emb, context)\n",
    "            h_mid = self.middle_attn(h_mid)\n",
    "            h_mid = self.middle2(h_mid, t_emb, context)\n",
    "\n",
    "            # Decoder with skip connections (resized to match spatial dims)\n",
    "            h3_cross_resized = F.interpolate(h3_cross, size=h_mid.shape[-2:], mode='nearest')\n",
    "            h = self.up3(torch.cat([h_mid, h3_cross_resized], dim=1), t_emb, context)\n",
    "            h = self.upsample3(h)\n",
    "            h = self.cross_up3(h, context)\n",
    "\n",
    "            h2_cross_resized = F.interpolate(h2_cross, size=h.shape[-2:], mode='nearest')\n",
    "            h = self.up2(torch.cat([h, h2_cross_resized], dim=1), t_emb, context)\n",
    "            h = self.upsample2(h)\n",
    "            h = self.cross_up2(h, context)\n",
    "\n",
    "            h1_cross_resized = F.interpolate(h1_cross, size=h.shape[-2:], mode='nearest')\n",
    "            h = self.up1(torch.cat([h, h1_cross_resized], dim=1), t_emb, context)\n",
    "            h = self.upsample1(h)\n",
    "\n",
    "            return self.output_conv(h)\n",
    "\n",
    "        # Classifier-free guidance: blend unconditional and conditional predictions\n",
    "        if cfg_scale == 1.0 or context is None:\n",
    "            return denoise(x, t_emb, context)\n",
    "\n",
    "        uncond = denoise(x, t_emb, context=None)\n",
    "        cond = denoise(x, t_emb, context)\n",
    "        return uncond + cfg_scale * (cond - uncond)\n",
    "\n",
    "def generate_images_direct(unet_path=\"KahabMinGenT2Im-v1.pt\", device=\"cuda\", output_dir=\"output\", prompt=None, timesteps=50):\n",
    "    \"\"\"Generate 256x256 images with the custom UNet and a user-specified text prompt.\"\"\"\n",
    "    seed_everything(42)\n",
    "    print(f\"Using device: {device}\")\n",
    "    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists\n",
    "\n",
    "    # Load components\n",
    "    print(\"Loading VAE...\")\n",
    "    vae = AutoencoderKL.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"vae\").to(device).eval().requires_grad_(False)\n",
    "\n",
    "    print(\"Loading tokenizer and text encoder...\")\n",
    "    tokenizer = CLIPTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
    "    text_encoder = CLIPTextModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device).eval().requires_grad_(False)\n",
    "\n",
    "    print(\"Loading trained UNet...\")\n",
    "    unet = UNetConditional(in_channels=4, base_channels=128, context_dim=768)\n",
    "    checkpoint = torch.load(unet_path, map_location=device, weights_only=True)\n",
    "    unet.load_state_dict(checkpoint['model_state_dict'])\n",
    "    unet = unet.to(device).eval()\n",
    "\n",
    "    # Create scheduler\n",
    "    scheduler = DDPMScheduler(num_train_timesteps=1000)\n",
    "\n",
    "    # Get prompt from user if not provided\n",
    "    if prompt is None:\n",
    "        # Check if running in Jupyter\n",
    "        if 'ipykernel' in sys.modules:\n",
    "            prompt = input(\"Enter your text prompt (e.g., 'A friendly dragon'): \").strip()\n",
    "        else:\n",
    "            prompt = \"\"  # Falls back to the default below\n",
    "        if not prompt:\n",
    "            prompt = \"A friendly dragon\"  # Default prompt if empty\n",
    "\n",
    "    test_prompts = [prompt]\n",
    "\n",
    "    print(\"🎨 Generating 256x256 images...\")\n",
    "    for i, prompt in enumerate(test_prompts):\n",
    "        print(f\"Generating: {prompt}\")\n",
    "        try:\n",
    "            with torch.no_grad():\n",
    "                # Encode prompt\n",
    "                inputs = tokenizer(\n",
    "                    prompt,\n",
    "                    padding=\"max_length\",\n",
    "                    truncation=True,\n",
    "                    max_length=77,\n",
    "                    return_tensors=\"pt\"\n",
    "                )\n",
    "                inputs = {k: v.to(device) for k, v in inputs.items()}\n",
    "                text_embeddings = text_encoder(**inputs).last_hidden_state\n",
    "                print(f\"Text embeddings shape: {text_embeddings.shape}, device: {text_embeddings.device}\")\n",
    "\n",
    "                # Random starting latents for 256x256 output (256 / 8 = 32 after VAE downsampling)\n",
    "                latents = torch.randn(1, 4, 32, 32, device=device, dtype=torch.float32)\n",
    "                print(f\"Initial latents shape: {latents.shape}, device: {latents.device}\")\n",
    "\n",
    "                # Set timesteps\n",
    "                scheduler.set_timesteps(timesteps)\n",
    "\n",
    "                # Denoising loop\n",
    "                for t in tqdm(scheduler.timesteps, desc=f\"Denoising {prompt}\"):\n",
    "                    t_tensor = torch.tensor([t], device=device, dtype=torch.long)\n",
    "                    noise_pred = unet(latents, t_tensor, context=text_embeddings)\n",
    "                    latents = scheduler.step(noise_pred, t, latents).prev_sample\n",
    "\n",
    "                print(f\"Final latents shape: {latents.shape}\")\n",
    "\n",
    "                # Decode latents to image (undo the SD VAE scaling factor first)\n",
    "                latents = latents / 0.18215\n",
    "                images = vae.decode(latents).sample\n",
    "                images = (images / 2 + 0.5).clamp(0, 1)  # Denormalize from [-1, 1] to [0, 1]\n",
    "                images = images.cpu().permute(0, 2, 3, 1).numpy()\n",
    "                image = Image.fromarray((images[0] * 255).astype(np.uint8))\n",
    "\n",
    "                # Save\n",
    "                filename = f\"{output_dir}/generated_256_{i+1}_{prompt.replace(' ', '_')}.png\"\n",
    "                image.save(filename)\n",
    "                print(f\"✅ Saved: {filename}\")\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"❌ Error generating '{prompt}': {e}\")\n",
    "            print(f\"Error type: {type(e).__name__}\")\n",
    "            continue\n",
    "\n",
    "def main():\n",
    "    # Use sensible defaults inside Jupyter; parse CLI arguments otherwise\n",
    "    if 'ipykernel' in sys.modules:\n",
    "        generate_images_direct(\n",
    "            unet_path=\"KahabMinGenT2Im-v1.pt\",\n",
    "            device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
    "            output_dir=\"output\",\n",
    "            prompt=None\n",
    "        )\n",
    "    else:\n",
    "        parser = argparse.ArgumentParser(description=\"Generate images with custom UNet and text prompt\")\n",
    "        parser.add_argument(\"--unet_path\", type=str, default=\"KahabMinGenT2Im-v1.pt\", help=\"Path to UNet checkpoint\")\n",
    "        parser.add_argument(\"--device\", type=str, default=\"cuda\" if torch.cuda.is_available() else \"cpu\", help=\"Device to use (cuda or cpu)\")\n",
    "        parser.add_argument(\"--output_dir\", type=str, default=\"output\", help=\"Output directory for generated images\")\n",
    "        parser.add_argument(\"--prompt\", type=str, default=None, help=\"Text prompt for image generation\")\n",
    "        args = parser.parse_args()\n",
    "\n",
    "        generate_images_direct(\n",
    "            unet_path=args.unet_path,\n",
    "            device=args.device,\n",
    "            output_dir=args.output_dir,\n",
    "            prompt=args.prompt\n",
    "        )\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a86f43b-1e8e-4ead-bcf5-c8ff9f065782",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
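
For reference, a minimal usage sketch of the cell's entry point, assuming the definitions above have already been run (or extracted into an importable generate_images_direct.py module) and the KahabMinGenT2Im-v1.pt checkpoint is present in the working directory:

    import torch

    # Mirrors the Jupyter branch of main(), but with an explicit prompt instead of input()
    generate_images_direct(
        unet_path="KahabMinGenT2Im-v1.pt",
        device="cuda" if torch.cuda.is_available() else "cpu",
        output_dir="output",
        prompt="A friendly dragon",
        timesteps=50,
    )
    # Writes output/generated_256_1_A_friendly_dragon.png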