alpha31476 commited on
Commit
cb656a6
·
verified ·
1 Parent(s): 87ef7b5
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. Vaani/SDFT/_2.ipynb +675 -0
  3. Vaani/SDFT/_2_.py +345 -0
  4. Vaani/SDFT/_2_DDP.py +316 -0
  5. Vaani/SDFT/checkpoints/checkpoint.pth +3 -0
  6. Vaani/SDFT/download_model.py +13 -0
  7. Vaani/SDFT/vaani-stablediffusion-finetune-kaggle.ipynb +650 -0
  8. Vaani/VaaniLDM/ddpm_ckpt_epoch31.pt +3 -0
  9. Vaani/VaaniLDM/ddpm_ckpt_epoch32.pt +3 -0
  10. Vaani/VaaniLDM/ldmH_ckpt_epoch24.pt +3 -0
  11. Vaani/VaaniLDM/ldmH_ckpt_epoch25.pt +3 -0
  12. Vaani/VaaniLDM/samples/x0_0.png +2 -2
  13. Vaani/VaaniLDM/samples/x0_1.png +0 -0
  14. Vaani/VaaniLDM/samples/x0_10.png +0 -0
  15. Vaani/VaaniLDM/samples/x0_100.png +0 -0
  16. Vaani/VaaniLDM/samples/x0_101.png +0 -0
  17. Vaani/VaaniLDM/samples/x0_102.png +0 -0
  18. Vaani/VaaniLDM/samples/x0_103.png +0 -0
  19. Vaani/VaaniLDM/samples/x0_104.png +0 -0
  20. Vaani/VaaniLDM/samples/x0_105.png +0 -0
  21. Vaani/VaaniLDM/samples/x0_106.png +0 -0
  22. Vaani/VaaniLDM/samples/x0_107.png +0 -0
  23. Vaani/VaaniLDM/samples/x0_108.png +0 -0
  24. Vaani/VaaniLDM/samples/x0_109.png +0 -0
  25. Vaani/VaaniLDM/samples/x0_11.png +0 -0
  26. Vaani/VaaniLDM/samples/x0_110.png +0 -0
  27. Vaani/VaaniLDM/samples/x0_111.png +0 -0
  28. Vaani/VaaniLDM/samples/x0_112.png +0 -0
  29. Vaani/VaaniLDM/samples/x0_113.png +0 -0
  30. Vaani/VaaniLDM/samples/x0_114.png +0 -0
  31. Vaani/VaaniLDM/samples/x0_115.png +0 -0
  32. Vaani/VaaniLDM/samples/x0_116.png +0 -0
  33. Vaani/VaaniLDM/samples/x0_117.png +0 -0
  34. Vaani/VaaniLDM/samples/x0_118.png +0 -0
  35. Vaani/VaaniLDM/samples/x0_119.png +0 -0
  36. Vaani/VaaniLDM/samples/x0_12.png +0 -0
  37. Vaani/VaaniLDM/samples/x0_120.png +0 -0
  38. Vaani/VaaniLDM/samples/x0_121.png +0 -0
  39. Vaani/VaaniLDM/samples/x0_122.png +0 -0
  40. Vaani/VaaniLDM/samples/x0_123.png +0 -0
  41. Vaani/VaaniLDM/samples/x0_124.png +0 -0
  42. Vaani/VaaniLDM/samples/x0_125.png +0 -0
  43. Vaani/VaaniLDM/samples/x0_126.png +0 -0
  44. Vaani/VaaniLDM/samples/x0_127.png +0 -0
  45. Vaani/VaaniLDM/samples/x0_128.png +0 -0
  46. Vaani/VaaniLDM/samples/x0_129.png +0 -0
  47. Vaani/VaaniLDM/samples/x0_13.png +0 -0
  48. Vaani/VaaniLDM/samples/x0_130.png +0 -0
  49. Vaani/VaaniLDM/samples/x0_131.png +0 -0
  50. Vaani/VaaniLDM/samples/x0_132.png +0 -0
.gitattributes CHANGED
@@ -135,3 +135,4 @@ Vaani/output_image2.png filter=lfs diff=lfs merge=lfs -text
135
  Vaani/sampleJSON.csv filter=lfs diff=lfs merge=lfs -text
136
  Vaani/sampleJSON.json filter=lfs diff=lfs merge=lfs -text
137
  tools/__pycache__/pynvml.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
 
 
135
  Vaani/sampleJSON.csv filter=lfs diff=lfs merge=lfs -text
136
  Vaani/sampleJSON.json filter=lfs diff=lfs merge=lfs -text
137
  tools/__pycache__/pynvml.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
138
+ Vaani/VaaniLDM/samplesH/x0_0.png filter=lfs diff=lfs merge=lfs -text
Vaani/SDFT/_2.ipynb ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "aab59bea",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "text/plain": [
12
+ "'cuda'"
13
+ ]
14
+ },
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "output_type": "execute_result"
18
+ }
19
+ ],
20
+ "source": [
21
+ "import torch\n",
22
+ "import torch.optim as optim\n",
23
+ "from torch.utils.data import Dataset, DataLoader\n",
24
+ "from torchvision import transforms\n",
25
+ "from torchvision.transforms import v2\n",
26
+ "from PIL import Image\n",
27
+ "from diffusers import StableDiffusionPipeline\n",
28
+ "from diffusers.optimization import get_scheduler\n",
29
+ "from torch import nn\n",
30
+ "import torch.nn.functional as F\n",
31
+ "import os\n",
32
+ "import pandas as pd\n",
33
+ "from tqdm import trange, tqdm\n",
34
+ "\n",
35
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
36
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
37
+ "device"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 2,
43
+ "id": "8f13b66f",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# import torch\n",
48
+ "# import torch.nn as nn\n",
49
+ "# import torch.nn.functional as F\n",
50
+ "\n",
51
+ "# audio_embed_dim = 1280\n",
52
+ "# output_dim = 768\n",
53
+ "# device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
54
+ "\n",
55
+ "# context_projector = nn.Sequential(\n",
56
+ "# nn.Linear(audio_embed_dim, 320),\n",
57
+ "# nn.SiLU(),\n",
58
+ "# nn.Linear(320, output_dim)\n",
59
+ "# ).to(device).half()\n",
60
+ "\n",
61
+ "# # Dummy input\n",
62
+ "# audio_embedding = dummy_audio = torch.zeros(10, 1500, 1280, device=device, dtype=torch.float16)\n",
63
+ "# print(audio_embedding.shape) # [10, 1500, 1280]\n",
64
+ "\n",
65
+ "# # Project audio to [10, 1500, 768]\n",
66
+ "# projected = context_projector(audio_embedding)\n",
67
+ "# print(projected.shape) # [10, 1500, 768]\n",
68
+ "\n",
69
+ "# # Compute attention scores: reduce feature dim to scalar per time step\n",
70
+ "# attn_scores = projected.mean(dim=2) # [10, 1500]\n",
71
+ "# attn_weights = F.softmax(attn_scores, dim=1) # [10, 1500]\n",
72
+ "# attn_weights = attn_weights.unsqueeze(2) # [10, 1500, 1]\n",
73
+ "\n",
74
+ "# # Weighted average\n",
75
+ "# pooled = (projected * attn_weights).sum(dim=1, keepdim=True) # [10, 1, 768]\n",
76
+ "# print(pooled.shape) # Final shape: [10, 1, 768]\n"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "id": "d32b7d9d",
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "# === Helpers ===\n",
87
+ "def walkDIR(folder_path, include=None):\n",
88
+ " file_list = []\n",
89
+ " for root, _, files in os.walk(folder_path):\n",
90
+ " for file in files:\n",
91
+ " if include is None or any(file.endswith(ext) for ext in include):\n",
92
+ " file_list.append(os.path.join(root, file))\n",
93
+ " print(\"Files found:\", len(file_list))\n",
94
+ " return file_list\n",
95
+ "\n",
96
+ "# === Dataset Class ===\n",
97
+ "class VaaniDataset(torch.utils.data.Dataset):\n",
98
+ " def __init__(self, files_paths, im_size):\n",
99
+ " self.files_paths = files_paths\n",
100
+ " self.im_size = im_size\n",
101
+ "\n",
102
+ " def __len__(self):\n",
103
+ " return len(self.files_paths)\n",
104
+ "\n",
105
+ " def __getitem__(self, idx):\n",
106
+ " # image = tv.io.read_image(self.files_paths[idx], mode=tv.io.ImageReadMode.RGB)\n",
107
+ " image = Image.open(self.files_paths[idx]).convert(\"RGB\")\n",
108
+ " image = v2.ToImage()(image)\n",
109
+ " # image = tv.io.decode_image(self.files_paths[idx], mode=tv.io.ImageReadMode.RGB)\n",
110
+ " image = v2.Resize((self.im_size, self.im_size))(image)\n",
111
+ " image = v2.ToDtype(torch.float32, scale=True)(image)\n",
112
+ " # image = 2*image - 1\n",
113
+ " return image\n",
114
+ "\n",
115
+ "\n",
116
+ "def create_dataloader(dataset, batch_size, debug=False, val_split=0.1, num_workers=4):\n",
117
+ " if debug:\n",
118
+ " s = 0.001\n",
119
+ " dataset, _ = torch.utils.data.random_split(dataset, [s, 1-s], torch.manual_seed(42))\n",
120
+ " print(\"Length of Train dataset:\", len(dataset))\n",
121
+ "\n",
122
+ " train_dataloader = DataLoader(\n",
123
+ " dataset, \n",
124
+ " batch_size=batch_size, \n",
125
+ " shuffle=True, \n",
126
+ " num_workers=num_workers,\n",
127
+ " pin_memory=True,\n",
128
+ " drop_last=True,\n",
129
+ " persistent_workers=True\n",
130
+ " )\n",
131
+ " \n",
132
+ " images = next(iter(train_dataloader))\n",
133
+ " print('Total Batches:', len(train_dataloader))\n",
134
+ " print('BATCH SHAPE:', images.shape)\n",
135
+ " return train_dataloader\n",
136
+ "\n",
137
+ "# === Audio Context Projector ===\n",
138
+ "# class AudioContextProjector(nn.Module):\n",
139
+ "# def __init__(self, audio_embed_dim):\n",
140
+ "# super().__init__()\n",
141
+ "# self.audio_embed_dim = audio_embed_dim\n",
142
+ "# self.context_projector = nn.Sequential(\n",
143
+ "# nn.Linear(audio_embed_dim, 320),\n",
144
+ "# nn.SiLU(),\n",
145
+ "# nn.Linear(320, 1)\n",
146
+ "# )\n",
147
+ "\n",
148
+ "# def forward(self, audio_embedding):\n",
149
+ "# if audio_embedding.size(-1) != self.audio_embed_dim:\n",
150
+ "# raise ValueError(f\"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}\")\n",
151
+ "# weights = self.context_projector(audio_embedding) # [B, T, 1]\n",
152
+ "# weights = torch.softmax(weights, dim=1) # [B, T, 1]\n",
153
+ "# pooled = (audio_embedding * weights).sum(dim=1) # [B, 1280]\n",
154
+ "# return pooled.unsqueeze(1) # [B, 1, 1280]\n",
155
+ "# class AudioContextProjector(nn.Module):\n",
156
+ "# def __init__(self, audio_embed_dim=1280, output_dim=768): # Add output_dim for flexibility\n",
157
+ "# super().__init__()\n",
158
+ "# self.audio_embed_dim = audio_embed_dim\n",
159
+ "# self.context_projector = nn.Sequential(\n",
160
+ "# nn.Linear(audio_embed_dim, 320),\n",
161
+ "# nn.SiLU(),\n",
162
+ "# nn.Linear(320, output_dim) # Output 768 to match UNet's expectation\n",
163
+ "# )\n",
164
+ "\n",
165
+ "# def forward(self, audio_embedding):\n",
166
+ "# if audio_embedding.size(-1) != self.audio_embed_dim:\n",
167
+ "# raise ValueError(f\"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}\")\n",
168
+ "# weights = self.context_projector(audio_embedding) # [B, T, 768]\n",
169
+ "# weights = torch.softmax(pooled, dim=1) # [B, T, 768]\n",
170
+ "# pooled = (audio_embedding * weights).sum(dim=1) # [B, 768]\n",
171
+ "# return pooled.unsqueeze(1) # [B, 1, 768]\n",
172
+ "class AudioContextProjector(nn.Module):\n",
173
+ " def __init__(self, audio_embed_dim=1280, output_dim=768):\n",
174
+ " super().__init__()\n",
175
+ " self.audio_embed_dim = audio_embed_dim\n",
176
+ " self.output_dim = output_dim\n",
177
+ " self.context_projector = nn.Sequential(\n",
178
+ " nn.Linear(audio_embed_dim, 320),\n",
179
+ " nn.SiLU(),\n",
180
+ " nn.Linear(320, output_dim) # Output 768 to match UNet's expectation\n",
181
+ " )\n",
182
+ "\n",
183
+ " def forward(self, audio_embedding):\n",
184
+ " if audio_embedding.size(-1) != self.audio_embed_dim:\n",
185
+ " raise ValueError(f\"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}\")\n",
186
+ "\n",
187
+ " # Project to [B, T, 768]\n",
188
+ " projected = self.context_projector(audio_embedding) # [B, T, 768]\n",
189
+ "\n",
190
+ " # Compute scalar attention scores per timestep\n",
191
+ " attn_scores = projected.mean(dim=2) # [B, T]\n",
192
+ " attn_weights = F.softmax(attn_scores, dim=1) # [B, T]\n",
193
+ " attn_weights = attn_weights.unsqueeze(2) # [B, T, 1]\n",
194
+ "\n",
195
+ " # Apply attention to the projected embeddings\n",
196
+ " pooled = (projected * attn_weights).sum(dim=1, keepdim=True) # [B, 1, 768]\n",
197
+ " return pooled\n",
198
+ "\n",
199
+ "\n",
200
+ "\n",
201
+ "# === Inference Function ===\n",
202
+ "def run_inference(pipe, unet, vae, device, context_hidden_states, save_path=\"inference_output.png\"):\n",
203
+ " pipe.unet = unet\n",
204
+ " pipe.vae = vae\n",
205
+ " pipe.to(device)\n",
206
+ "\n",
207
+ " batch_size = 1\n",
208
+ " latents = torch.randn((batch_size, pipe.unet.in_channels, 64, 64), device=device, dtype=torch.float16)\n",
209
+ " # latents = torch.randn((batch_size, pipe.unet.config.in_channels, 64, 64), device=device, dtype=torch.float16)\n",
210
+ " pipe.scheduler.set_timesteps(50)\n",
211
+ " latents = latents * pipe.scheduler.init_noise_sigma\n",
212
+ " \n",
213
+ " expected_shape = (batch_size, 1, 768) # Adjust based on model\n",
214
+ " if context_hidden_states.shape != expected_shape:\n",
215
+ " raise ValueError(f\"Expected context_hidden_states shape {expected_shape}, got {context_hidden_states.shape}\")\n",
216
+ " \n",
217
+ " for t in pipe.scheduler.timesteps:\n",
218
+ " with torch.no_grad():\n",
219
+ " noise_pred = pipe.unet(latents, t, encoder_hidden_states=context_hidden_states).sample\n",
220
+ " latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample\n",
221
+ "\n",
222
+ " # latents = 1 / 0.18215 * latents\n",
223
+ " latents = 1 / pipe.vae.config.scaling_factor * latents\n",
224
+ " with torch.no_grad():\n",
225
+ " image = pipe.vae.decode(latents).sample\n",
226
+ "\n",
227
+ " image = (image / 2 + 0.5).clamp(0, 1)\n",
228
+ " image = image.cpu().permute(0, 2, 3, 1).numpy()[0]\n",
229
+ " image = Image.fromarray((image * 255).astype(\"uint8\"))\n",
230
+ " image.save(save_path)\n",
231
+ " print(f\"Inference image saved to {save_path}\")\n",
232
+ "\n",
233
+ "\n",
234
+ "# === Load Pipeline ===\n",
235
+ "def load_pipeline(model_id, device):\n",
236
+ " pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)\n",
237
+ " unet = pipe.unet\n",
238
+ " vae = pipe.vae\n",
239
+ " return pipe, unet, vae\n",
240
+ "\n",
241
+ "# === Freeze Layers Function ===\n",
242
+ "def freeze_vae_layers(vae):\n",
243
+ " vae.encoder.requires_grad_(False)\n",
244
+ " vae.quant_conv.requires_grad_(False)\n",
245
+ " vae.decoder.requires_grad_(True)\n",
246
+ " vae.post_quant_conv.requires_grad_(True)\n",
247
+ "\n",
248
+ "def freeze_unet_layers(unet):\n",
249
+ " for name, param in unet.named_parameters():\n",
250
+ " if \"attn2\" in name or \"conv2\" in name:\n",
251
+ " param.requires_grad = True\n",
252
+ " else:\n",
253
+ " param.requires_grad = False\n",
254
+ "\n",
255
+ "# === Optimizer Setup ===\n",
256
+ "def setup_optimizer(vae, unet, projector, lr):\n",
257
+ " params_to_optimize = list(filter(lambda p: p.requires_grad, vae.parameters())) + \\\n",
258
+ " list(filter(lambda p: p.requires_grad, unet.parameters())) + \\\n",
259
+ " list(filter(lambda p: p.requires_grad, projector.parameters()))\n",
260
+ " optimizer = optim.AdamW(params_to_optimize, lr=lr)\n",
261
+ " return optimizer\n",
262
+ "\n",
263
+ "\n",
264
+ "# === Gradient Accumulation Function ===\n",
265
+ "def accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader):\n",
266
+ " loss = loss / gradient_accumulation_steps\n",
267
+ " loss.backward()\n",
268
+ " if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(dataloader):\n",
269
+ " optimizer.step()\n",
270
+ " optimizer.zero_grad()\n",
271
+ "\n",
272
+ "# === Save Checkpoint Function ===\n",
273
+ "def save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path):\n",
274
+ " # checkpoint_path = f\"{save_dir}/checkpoint.pth\"\n",
275
+ " torch.save({\n",
276
+ " 'epoch': epoch,\n",
277
+ " 'unet': unet.state_dict(),\n",
278
+ " 'vae': vae.state_dict(),\n",
279
+ " 'projector': projector.state_dict(),\n",
280
+ " 'optimizer': optimizer.state_dict(),\n",
281
+ " }, checkpoint_path)\n",
282
+ " print(f\"Checkpoint saved to {checkpoint_path}\")\n",
283
+ "\n",
284
+ "# === Resume from Checkpoint Function ===\n",
285
+ "def resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer):\n",
286
+ " if os.path.exists(checkpoint_path):\n",
287
+ " checkpoint = torch.load(checkpoint_path, map_location='cpu')\n",
288
+ " unet.load_state_dict(checkpoint['unet'])\n",
289
+ " vae.load_state_dict(checkpoint['vae'])\n",
290
+ " projector.load_state_dict(checkpoint['projector'])\n",
291
+ " optimizer.load_state_dict(checkpoint['optimizer'])\n",
292
+ " start_epoch = checkpoint['epoch'] + 1\n",
293
+ " print(f\"Resuming training from epoch {start_epoch}...\")\n",
294
+ " return start_epoch\n",
295
+ " else:\n",
296
+ " print(\"No checkpoint found, starting from scratch.\")\n",
297
+ " return 0\n",
298
+ "\n",
299
+ "\n",
300
+ "# === Training Loop Function ===\n",
301
+ "def train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs, samples_path, checkpoint_path, pipe, projector):\n",
302
+ " start_epoch = resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer)\n",
303
+ "\n",
304
+ " for epoch in trange(start_epoch, num_epochs, colour='red', desc=f'{device}-training', ncols=100):\n",
305
+ " unet.train()\n",
306
+ " vae.train()\n",
307
+ " projector.train()\n",
308
+ " total_loss = 0\n",
309
+ " step = 0\n",
310
+ " \n",
311
+ " for image in tqdm(dataloader, colour='green', desc=f'{device}-batch', ncols=100):\n",
312
+ " # print(\"step:\", step)\n",
313
+ " image = image.to(device, dtype=torch.float16)\n",
314
+ "\n",
315
+ " latents = vae.encode(image).latent_dist.sample() * 0.18215\n",
316
+ " noise = torch.randn_like(latents)\n",
317
+ " # timesteps = torch.randint(0, 1000, (latents.shape[0],), device=device).long()\n",
318
+ " timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()\n",
319
+ "\n",
320
+ " # === Use dummy audio embedding ===\n",
321
+ " dummy_audio = torch.zeros(image.size(0), 1500, 1280, device=device, dtype=torch.float16)\n",
322
+ " context_hidden_states = projector(dummy_audio)\n",
323
+ "\n",
324
+ " # print(\"Model IP\")\n",
325
+ " noise_pred = unet(latents + noise, timesteps, encoder_hidden_states=context_hidden_states).sample\n",
326
+ " # print(\"Model OP\")\n",
327
+ "\n",
328
+ " loss = nn.MSELoss()(noise_pred, noise)\n",
329
+ " total_loss += loss.item()\n",
330
+ "\n",
331
+ " step += 1\n",
332
+ " accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader)\n",
333
+ "\n",
334
+ " avg_loss = total_loss / len(dataloader)\n",
335
+ " print(f\"Epoch {epoch + 1} | Avg Loss: {avg_loss:.6f}\")\n",
336
+ "\n",
337
+ " save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path)\n",
338
+ " run_inference(pipe, unet, vae, device, context_hidden_states, save_path=f\"{samples_path}/inference_epoch{epoch + 1}.png\")\n",
339
+ "\n",
340
+ " print(\"\\n✅ Fine-tuning complete.\")"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 4,
346
+ "id": "9ad5f6a3",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "# === Main Function ===\n",
351
+ "def main():\n",
352
+ " model_id = \"runwayml/stable-diffusion-v1-5\"\n",
353
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
354
+ " lr = 1e-5\n",
355
+ " num_epochs = 10\n",
356
+ " batch_size = 16\n",
357
+ " debug = False\n",
358
+ " gradient_accumulation_steps = 1\n",
359
+ " \n",
360
+ " os.makedirs(f\"./checkpoints\", exist_ok=True)\n",
361
+ " os.makedirs(f\"./samples\", exist_ok=True)\n",
362
+ " checkpoint_path = f\"./checkpoints/checkpoint.pth\"\n",
363
+ " samples_path = f\"./samples\"\n",
364
+ " image_dir = \"/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images\"\n",
365
+ "\n",
366
+ " pipe, unet, vae = load_pipeline(model_id, device)\n",
367
+ " freeze_vae_layers(vae)\n",
368
+ " freeze_unet_layers(unet)\n",
369
+ " projector = AudioContextProjector(audio_embed_dim=1280, output_dim=768).to(device).half()\n",
370
+ " optimizer = setup_optimizer(vae, unet, projector, lr)\n",
371
+ "\n",
372
+ " # === Dataset & Dataloader ===\n",
373
+ " files = walkDIR(image_dir, include=['.png', '.jpeg', '.jpg'])\n",
374
+ " dataset = VaaniDataset(files_paths=files, im_size=256)\n",
375
+ " image = dataset[2]\n",
376
+ " print('IMAGE SHAPE:', image.shape, \"Dataset len:\", len(dataset))\n",
377
+ " dataloader = create_dataloader(dataset, batch_size, debug=debug)\n",
378
+ "\n",
379
+ " train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs, samples_path, checkpoint_path, pipe, projector)"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 5,
385
+ "id": "e71b4ba9",
386
+ "metadata": {},
387
+ "outputs": [
388
+ {
389
+ "name": "stderr",
390
+ "output_type": "stream",
391
+ "text": [
392
+ "Couldn't connect to the Hub: (MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/models/runwayml/stable-diffusion-v1-5 (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7fd9a9445c40>: Failed to resolve \\'huggingface.co\\' ([Errno -2] Name or service not known)\"))'), '(Request ID: bcd4fcc3-8634-4bfe-8454-3b4dbdcc1222)').\n",
393
+ "Will try to load from local cache.\n"
394
+ ]
395
+ },
396
+ {
397
+ "data": {
398
+ "application/vnd.jupyter.widget-view+json": {
399
+ "model_id": "1014662fa9c44a00b0e9e6b3d1e9747d",
400
+ "version_major": 2,
401
+ "version_minor": 0
402
+ },
403
+ "text/plain": [
404
+ "Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s]"
405
+ ]
406
+ },
407
+ "metadata": {},
408
+ "output_type": "display_data"
409
+ },
410
+ {
411
+ "name": "stdout",
412
+ "output_type": "stream",
413
+ "text": [
414
+ "Files found: 128807\n",
415
+ "IMAGE SHAPE: torch.Size([3, 256, 256]) Dataset len: 128807\n",
416
+ "Total Batches: 8050\n",
417
+ "BATCH SHAPE: torch.Size([16, 3, 256, 256])\n",
418
+ "No checkpoint found, starting from scratch.\n"
419
+ ]
420
+ },
421
+ {
422
+ "name": "stderr",
423
+ "output_type": "stream",
424
+ "text": [
425
+ "cuda-training: 0%|\u001b[31m \u001b[0m| 0/10 [00:00<?, ?it/s]\u001b[0m"
426
+ ]
427
+ },
428
+ {
429
+ "name": "stdout",
430
+ "output_type": "stream",
431
+ "text": [
432
+ "step: 0\n",
433
+ "Model IP\n",
434
+ "Model OP\n"
435
+ ]
436
+ },
437
+ {
438
+ "name": "stderr",
439
+ "output_type": "stream",
440
+ "text": []
441
+ },
442
+ {
443
+ "name": "stdout",
444
+ "output_type": "stream",
445
+ "text": [
446
+ "step: 1\n",
447
+ "Model IP\n",
448
+ "Model OP\n"
449
+ ]
450
+ },
451
+ {
452
+ "name": "stderr",
453
+ "output_type": "stream",
454
+ "text": []
455
+ },
456
+ {
457
+ "name": "stdout",
458
+ "output_type": "stream",
459
+ "text": [
460
+ "step: 2\n",
461
+ "Model IP\n"
462
+ ]
463
+ },
464
+ {
465
+ "name": "stderr",
466
+ "output_type": "stream",
467
+ "text": []
468
+ },
469
+ {
470
+ "name": "stdout",
471
+ "output_type": "stream",
472
+ "text": [
473
+ "Model OP\n",
474
+ "step: 3\n",
475
+ "Model IP\n",
476
+ "Model OP\n"
477
+ ]
478
+ },
479
+ {
480
+ "name": "stderr",
481
+ "output_type": "stream",
482
+ "text": []
483
+ },
484
+ {
485
+ "name": "stdout",
486
+ "output_type": "stream",
487
+ "text": [
488
+ "step: 4\n",
489
+ "Model IP\n",
490
+ "Model OP\n"
491
+ ]
492
+ },
493
+ {
494
+ "name": "stderr",
495
+ "output_type": "stream",
496
+ "text": []
497
+ },
498
+ {
499
+ "name": "stdout",
500
+ "output_type": "stream",
501
+ "text": [
502
+ "step: 5\n",
503
+ "Model IP\n"
504
+ ]
505
+ },
506
+ {
507
+ "name": "stderr",
508
+ "output_type": "stream",
509
+ "text": []
510
+ },
511
+ {
512
+ "name": "stdout",
513
+ "output_type": "stream",
514
+ "text": [
515
+ "Model OP\n",
516
+ "step: 6\n",
517
+ "Model IP\n"
518
+ ]
519
+ },
520
+ {
521
+ "name": "stderr",
522
+ "output_type": "stream",
523
+ "text": []
524
+ },
525
+ {
526
+ "name": "stdout",
527
+ "output_type": "stream",
528
+ "text": [
529
+ "Model OP\n",
530
+ "step: 7\n",
531
+ "Model IP\n"
532
+ ]
533
+ },
534
+ {
535
+ "name": "stderr",
536
+ "output_type": "stream",
537
+ "text": []
538
+ },
539
+ {
540
+ "name": "stdout",
541
+ "output_type": "stream",
542
+ "text": [
543
+ "Model OP\n",
544
+ "step: 8\n",
545
+ "Model IP\n",
546
+ "Model OP\n"
547
+ ]
548
+ },
549
+ {
550
+ "name": "stderr",
551
+ "output_type": "stream",
552
+ "text": []
553
+ },
554
+ {
555
+ "name": "stdout",
556
+ "output_type": "stream",
557
+ "text": [
558
+ "step: 9\n",
559
+ "Model IP\n",
560
+ "Model OP\n"
561
+ ]
562
+ },
563
+ {
564
+ "name": "stderr",
565
+ "output_type": "stream",
566
+ "text": []
567
+ },
568
+ {
569
+ "name": "stdout",
570
+ "output_type": "stream",
571
+ "text": [
572
+ "step: 10\n",
573
+ "Model IP\n",
574
+ "Model OP\n"
575
+ ]
576
+ },
577
+ {
578
+ "name": "stderr",
579
+ "output_type": "stream",
580
+ "text": []
581
+ },
582
+ {
583
+ "name": "stdout",
584
+ "output_type": "stream",
585
+ "text": [
586
+ "step: 11\n",
587
+ "Model IP\n"
588
+ ]
589
+ },
590
+ {
591
+ "name": "stderr",
592
+ "output_type": "stream",
593
+ "text": []
594
+ },
595
+ {
596
+ "name": "stdout",
597
+ "output_type": "stream",
598
+ "text": [
599
+ "Model OP\n",
600
+ "step: 12\n",
601
+ "Model IP\n"
602
+ ]
603
+ },
604
+ {
605
+ "name": "stderr",
606
+ "output_type": "stream",
607
+ "text": []
608
+ },
609
+ {
610
+ "name": "stdout",
611
+ "output_type": "stream",
612
+ "text": [
613
+ "Model OP\n",
614
+ "step: 13\n",
615
+ "Model IP\n",
616
+ "Model OP\n"
617
+ ]
618
+ },
619
+ {
620
+ "name": "stderr",
621
+ "output_type": "stream",
622
+ "text": [
623
+ "cuda-batch: 0%|\u001b[32m \u001b[0m| 14/8050 [00:06<1:02:22, 2.15it/s]\u001b[0m\n",
624
+ "cuda-training: 0%|\u001b[31m \u001b[0m| 0/10 [00:06<?, ?it/s]\u001b[0m\n"
625
+ ]
626
+ },
627
+ {
628
+ "name": "stdout",
629
+ "output_type": "stream",
630
+ "text": [
631
+ "step: 14\n"
632
+ ]
633
+ },
634
+ {
635
+ "ename": "KeyboardInterrupt",
636
+ "evalue": "",
637
+ "output_type": "error",
638
+ "traceback": [
639
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
640
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
641
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[34m__name__\u001b[39m == \u001b[33m\"\u001b[39m\u001b[33m__main__\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
642
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 30\u001b[39m, in \u001b[36mmain\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 27\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m'\u001b[39m\u001b[33mIMAGE SHAPE:\u001b[39m\u001b[33m'\u001b[39m, image.shape, \u001b[33m\"\u001b[39m\u001b[33mDataset len:\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mlen\u001b[39m(dataset))\n\u001b[32m 28\u001b[39m dataloader = create_dataloader(dataset, batch_size, debug=debug)\n\u001b[32m---> \u001b[39m\u001b[32m30\u001b[39m \u001b[43mtrain_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdataloader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43munet\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvae\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient_accumulation_steps\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msamples_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcheckpoint_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprojector\u001b[49m\u001b[43m)\u001b[49m\n",
643
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 228\u001b[39m, in \u001b[36mtrain_loop\u001b[39m\u001b[34m(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs, samples_path, checkpoint_path, pipe, projector)\u001b[39m\n\u001b[32m 226\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m image \u001b[38;5;129;01min\u001b[39;00m tqdm(dataloader, colour=\u001b[33m'\u001b[39m\u001b[33mgreen\u001b[39m\u001b[33m'\u001b[39m, desc=\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdevice\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m-batch\u001b[39m\u001b[33m'\u001b[39m, ncols=\u001b[32m100\u001b[39m):\n\u001b[32m 227\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mstep:\u001b[39m\u001b[33m\"\u001b[39m, step)\n\u001b[32m--> \u001b[39m\u001b[32m228\u001b[39m image = \u001b[43mimage\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfloat16\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 230\u001b[39m latents = vae.encode(image).latent_dist.sample() * \u001b[32m0.18215\u001b[39m\n\u001b[32m 231\u001b[39m noise = torch.randn_like(latents)\n",
644
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
645
+ ]
646
+ }
647
+ ],
648
+ "source": [
649
+ "if __name__ == \"__main__\":\n",
650
+ " main()"
651
+ ]
652
+ }
653
+ ],
654
+ "metadata": {
655
+ "kernelspec": {
656
+ "display_name": "Python 3",
657
+ "language": "python",
658
+ "name": "python3"
659
+ },
660
+ "language_info": {
661
+ "codemirror_mode": {
662
+ "name": "ipython",
663
+ "version": 3
664
+ },
665
+ "file_extension": ".py",
666
+ "mimetype": "text/x-python",
667
+ "name": "python",
668
+ "nbconvert_exporter": "python",
669
+ "pygments_lexer": "ipython3",
670
+ "version": "3.12.2"
671
+ }
672
+ },
673
+ "nbformat": 4,
674
+ "nbformat_minor": 5
675
+ }
Vaani/SDFT/_2_.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.optim as optim
3
+ from torch.utils.data import Dataset, DataLoader
4
+ from torchvision import transforms
5
+ from torchvision.transforms import v2
6
+ from PIL import Image
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.optimization import get_scheduler
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ import os
12
+ import pandas as pd
13
+ from tqdm import trange, tqdm
14
+
15
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
16
+ device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ device
18
+
19
+
20
+ # import torch
21
+ # import torch.nn as nn
22
+ # import torch.nn.functional as F
23
+
24
+ # audio_embed_dim = 1280
25
+ # output_dim = 768
26
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
27
+
28
+ # context_projector = nn.Sequential(
29
+ # nn.Linear(audio_embed_dim, 320),
30
+ # nn.SiLU(),
31
+ # nn.Linear(320, output_dim)
32
+ # ).to(device).half()
33
+
34
+ # # Dummy input
35
+ # audio_embedding = dummy_audio = torch.zeros(10, 1500, 1280, device=device, dtype=torch.float16)
36
+ # print(audio_embedding.shape) # [10, 1500, 1280]
37
+
38
+ # # Project audio to [10, 1500, 768]
39
+ # projected = context_projector(audio_embedding)
40
+ # print(projected.shape) # [10, 1500, 768]
41
+
42
+ # # Compute attention scores: reduce feature dim to scalar per time step
43
+ # attn_scores = projected.mean(dim=2) # [10, 1500]
44
+ # attn_weights = F.softmax(attn_scores, dim=1) # [10, 1500]
45
+ # attn_weights = attn_weights.unsqueeze(2) # [10, 1500, 1]
46
+
47
+ # # Weighted average
48
+ # pooled = (projected * attn_weights).sum(dim=1, keepdim=True) # [10, 1, 768]
49
+ # print(pooled.shape) # Final shape: [10, 1, 768]
50
+
51
+
52
+
53
+ # === Helpers ===
54
def walkDIR(folder_path, include=None):
    """Recursively collect file paths under folder_path.

    include: optional iterable of filename suffixes; when given, only
    files ending with one of them are kept. Prints the match count and
    returns the list of joined paths.
    """
    matches = []
    suffixes = None if include is None else tuple(include)
    for root, _, filenames in os.walk(folder_path):
        for name in filenames:
            if suffixes is None or name.endswith(suffixes):
                matches.append(os.path.join(root, name))
    print("Files found:", len(matches))
    return matches
62
+
63
+ # === Dataset Class ===
64
class VaaniDataset(torch.utils.data.Dataset):
    """Image-only dataset: loads a file as RGB, resizes to a square,
    and scales pixel values into [0, 1] as float32."""

    def __init__(self, files_paths, im_size):
        # files_paths: list of image file paths; im_size: output side length.
        self.files_paths = files_paths
        self.im_size = im_size

    def __len__(self):
        return len(self.files_paths)

    def __getitem__(self, idx):
        # convert("RGB") guarantees 3 channels for palette/grayscale sources.
        pil_image = Image.open(self.files_paths[idx]).convert("RGB")
        transform = v2.Compose([
            v2.ToImage(),
            v2.Resize((self.im_size, self.im_size)),
            v2.ToDtype(torch.float32, scale=True),
        ])
        return transform(pil_image)
81
+
82
+
83
def create_dataloader(dataset, batch_size, debug=False, val_split=0.1, num_workers=4):
    """Build the training DataLoader.

    debug: keep only a 0.1% random subset for quick runs.
    val_split: currently unused; kept for interface compatibility.
    Returns a shuffling DataLoader that drops the last partial batch.
    """
    if debug:
        s = 0.001
        # FIX: use a dedicated generator so the split is reproducible
        # without clobbering torch's global RNG state (torch.manual_seed
        # as the positional generator arg seeded the global RNG as a side effect).
        gen = torch.Generator().manual_seed(42)
        dataset, _ = torch.utils.data.random_split(dataset, [s, 1 - s], generator=gen)
    print("Length of Train dataset:", len(dataset))

    train_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
        # FIX: persistent_workers=True raises ValueError when num_workers == 0.
        persistent_workers=num_workers > 0,
    )

    images = next(iter(train_dataloader))
    print('Total Batches:', len(train_dataloader))
    print('BATCH SHAPE:', images.shape)
    return train_dataloader
103
+
104
+ # === Audio Context Projector ===
105
+ # class AudioContextProjector(nn.Module):
106
+ # def __init__(self, audio_embed_dim):
107
+ # super().__init__()
108
+ # self.audio_embed_dim = audio_embed_dim
109
+ # self.context_projector = nn.Sequential(
110
+ # nn.Linear(audio_embed_dim, 320),
111
+ # nn.SiLU(),
112
+ # nn.Linear(320, 1)
113
+ # )
114
+
115
+ # def forward(self, audio_embedding):
116
+ # if audio_embedding.size(-1) != self.audio_embed_dim:
117
+ # raise ValueError(f"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}")
118
+ # weights = self.context_projector(audio_embedding) # [B, T, 1]
119
+ # weights = torch.softmax(weights, dim=1) # [B, T, 1]
120
+ # pooled = (audio_embedding * weights).sum(dim=1) # [B, 1280]
121
+ # return pooled.unsqueeze(1) # [B, 1, 1280]
122
+ # class AudioContextProjector(nn.Module):
123
+ # def __init__(self, audio_embed_dim=1280, output_dim=768): # Add output_dim for flexibility
124
+ # super().__init__()
125
+ # self.audio_embed_dim = audio_embed_dim
126
+ # self.context_projector = nn.Sequential(
127
+ # nn.Linear(audio_embed_dim, 320),
128
+ # nn.SiLU(),
129
+ # nn.Linear(320, output_dim) # Output 768 to match UNet's expectation
130
+ # )
131
+
132
+ # def forward(self, audio_embedding):
133
+ # if audio_embedding.size(-1) != self.audio_embed_dim:
134
+ # raise ValueError(f"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}")
135
+ # weights = self.context_projector(audio_embedding) # [B, T, 768]
136
+ # weights = torch.softmax(pooled, dim=1) # [B, T, 768]
137
+ # pooled = (audio_embedding * weights).sum(dim=1) # [B, 768]
138
+ # return pooled.unsqueeze(1) # [B, 1, 768]
139
class AudioContextProjector(nn.Module):
    """Pools a (B, T, audio_embed_dim) frame sequence into one
    (B, 1, output_dim) conditioning token.

    A two-layer MLP projects each frame into the UNet's cross-attention
    width; frames are then combined by softmax attention over time.
    """

    def __init__(self, audio_embed_dim=1280, output_dim=768):
        super().__init__()
        self.audio_embed_dim = audio_embed_dim
        self.output_dim = output_dim
        self.context_projector = nn.Sequential(
            nn.Linear(audio_embed_dim, 320),
            nn.SiLU(),
            nn.Linear(320, output_dim),
        )

    def forward(self, audio_embedding):
        if audio_embedding.size(-1) != self.audio_embed_dim:
            raise ValueError(f"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}")

        frames = self.context_projector(audio_embedding)   # [B, T, output_dim]
        scores = frames.mean(dim=2)                        # [B, T] scalar score per frame
        weights = F.softmax(scores, dim=1).unsqueeze(2)    # [B, T, 1]
        # Weighted sum over time keeps a singleton sequence axis.
        return (frames * weights).sum(dim=1, keepdim=True)  # [B, 1, output_dim]
165
+
166
+
167
+
168
+ # === Inference Function ===
169
def run_inference(pipe, unet, vae, device, context_hidden_states, save_path="inference_output.png"):
    """Run a 50-step denoising loop with the given UNet/VAE and save one image.

    context_hidden_states must be [1, 1, 768] — the pooled audio token
    produced by AudioContextProjector.
    """
    pipe.unet = unet
    pipe.vae = vae
    pipe.to(device)

    batch_size = 1
    # FIX: read in_channels from the config — direct attribute access on
    # the UNet is deprecated in recent diffusers releases.
    latents = torch.randn(
        (batch_size, pipe.unet.config.in_channels, 64, 64),
        device=device, dtype=torch.float16,
    )
    pipe.scheduler.set_timesteps(50)
    latents = latents * pipe.scheduler.init_noise_sigma

    expected_shape = (batch_size, 1, 768)  # matches AudioContextProjector output
    if context_hidden_states.shape != expected_shape:
        raise ValueError(f"Expected context_hidden_states shape {expected_shape}, got {context_hidden_states.shape}")

    for t in pipe.scheduler.timesteps:
        with torch.no_grad():
            noise_pred = pipe.unet(latents, t, encoder_hidden_states=context_hidden_states).sample
        latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample

    # Undo the latent scaling (equivalent to 1 / 0.18215 for SD 1.5).
    latents = 1 / pipe.vae.config.scaling_factor * latents
    with torch.no_grad():
        image = pipe.vae.decode(latents).sample

    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
    image = Image.fromarray((image * 255).astype("uint8"))
    image.save(save_path)
    print(f"Inference image saved to {save_path}")
199
+
200
+
201
+ # === Load Pipeline ===
202
def load_pipeline(model_id, device):
    """Load the Stable Diffusion pipeline in fp16 and return it plus its UNet and VAE."""
    pipeline = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    pipeline = pipeline.to(device)
    return pipeline, pipeline.unet, pipeline.vae
207
+
208
+ # === Freeze Layers Function ===
209
def freeze_vae_layers(vae):
    """Freeze the VAE's encoder side; keep the decoder side trainable."""
    for frozen_part in (vae.encoder, vae.quant_conv):
        frozen_part.requires_grad_(False)
    for trainable_part in (vae.decoder, vae.post_quant_conv):
        trainable_part.requires_grad_(True)
214
+
215
def freeze_unet_layers(unet):
    """Train only cross-attention ('attn2') and 'conv2' parameters; freeze the rest."""
    for name, param in unet.named_parameters():
        param.requires_grad = ("attn2" in name) or ("conv2" in name)
221
+
222
+ # === Optimizer Setup ===
223
def setup_optimizer(vae, unet, projector, lr):
    """Build AdamW over every trainable parameter of the three modules."""
    trainable = [
        p
        for module in (vae, unet, projector)
        for p in module.parameters()
        if p.requires_grad
    ]
    return optim.AdamW(trainable, lr=lr)
229
+
230
+
231
+ # === Gradient Accumulation Function ===
232
def accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader):
    """Backprop a scaled loss; step and zero the optimizer every N micro-batches.

    Also flushes on the dataloader's final batch so a short epoch never
    leaves unapplied gradients behind.
    """
    (loss / gradient_accumulation_steps).backward()
    at_boundary = (step + 1) % gradient_accumulation_steps == 0
    at_epoch_end = (step + 1) == len(dataloader)
    if at_boundary or at_epoch_end:
        optimizer.step()
        optimizer.zero_grad()
238
+
239
+ # === Save Checkpoint Function ===
240
def save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path):
    """Serialize the epoch index plus all model/optimizer states into one file."""
    state = {
        'epoch': epoch,
        'unet': unet.state_dict(),
        'vae': vae.state_dict(),
        'projector': projector.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(state, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")
250
+
251
+ # === Resume from Checkpoint Function ===
252
def resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer):
    """Restore states from checkpoint_path if it exists; return the epoch to start at."""
    if not os.path.exists(checkpoint_path):
        print("No checkpoint found, starting from scratch.")
        return 0
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    unet.load_state_dict(checkpoint['unet'])
    vae.load_state_dict(checkpoint['vae'])
    projector.load_state_dict(checkpoint['projector'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming training from epoch {start_epoch}...")
    return start_epoch
265
+
266
+
267
+ # === Training Loop Function ===
268
def train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs, samples_path, checkpoint_path, pipe, projector):
    """Fine-tune UNet/VAE/projector on images with a (dummy) audio condition.

    Per batch: encode images to latents, noise them according to the
    scheduler, predict the noise from the noisy latents, and regress it
    with MSE. Saves a checkpoint and one sample image after each epoch.
    """
    start_epoch = resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer)
    criterion = nn.MSELoss()

    for epoch in trange(start_epoch, num_epochs, colour='red', desc=f'{device}-training', dynamic_ncols=True):
        unet.train()
        vae.train()
        projector.train()
        total_loss = 0
        step = 0

        for image in tqdm(dataloader, colour='green', desc=f'{device}-batch', dynamic_ncols=True):
            image = image.to(device, dtype=torch.float16)

            latents = vae.encode(image).latent_dist.sample() * 0.18215
            noise = torch.randn_like(latents)
            timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()

            # Dummy (silent) audio conditioning until real embeddings are wired in.
            dummy_audio = torch.zeros(image.size(0), 1500, 1280, device=device, dtype=torch.float16)
            context_hidden_states = projector(dummy_audio)

            # FIX: the forward diffusion process must follow the noise
            # schedule. `latents + noise` ignores the per-timestep alphas and
            # trains the UNet on inputs it never sees at sampling time.
            noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)
            noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states=context_hidden_states).sample

            loss = criterion(noise_pred, noise)
            total_loss += loss.item()

            step += 1
            accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader)

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1} | Avg Loss: {avg_loss:.6f}")

        save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path)
        run_inference(pipe, unet, vae, device, context_hidden_states, save_path=f"{samples_path}/inference_epoch{epoch + 1}.png")

    print("\n✅ Fine-tuning complete.")
308
+
309
+
310
+ # === Main Function ===
311
def main():
    """Entry point: configure, build all components, and launch fine-tuning."""
    model_id = "runwayml/stable-diffusion-v1-5"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-5
    num_epochs = 10
    batch_size = 16
    debug = False
    gradient_accumulation_steps = 1

    checkpoint_dir = "./checkpoints"
    samples_path = "./samples"
    os.makedirs(checkpoint_dir, exist_ok=True)
    os.makedirs(samples_path, exist_ok=True)
    checkpoint_path = f"{checkpoint_dir}/checkpoint.pth"
    image_dir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"

    pipe, unet, vae = load_pipeline(model_id, device)
    freeze_vae_layers(vae)
    freeze_unet_layers(unet)
    projector = AudioContextProjector(audio_embed_dim=1280, output_dim=768).to(device).half()
    optimizer = setup_optimizer(vae, unet, projector, lr)

    # Dataset & dataloader.
    files = walkDIR(image_dir, include=['.png', '.jpeg', '.jpg'])
    dataset = VaaniDataset(files_paths=files, im_size=256)
    sample = dataset[2]
    print('IMAGE SHAPE:', sample.shape, "Dataset len:", len(dataset))
    dataloader = create_dataloader(dataset, batch_size, debug=debug)

    train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps,
               device, num_epochs, samples_path, checkpoint_path, pipe, projector)
340
+
341
+
342
+ if __name__ == "__main__":
343
+ main()
344
+
345
+
Vaani/SDFT/_2_DDP.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.optim as optim
3
+ from torch.utils.data import Dataset, DataLoader
4
+ from torchvision import transforms
5
+ from torchvision.transforms import v2
6
+ from PIL import Image
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.optimization import get_scheduler
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ import os
12
+ import pandas as pd
13
+ from tqdm import trange, tqdm
14
+ # DDP Imports
15
+ import torch.distributed as dist
16
+ from torch.nn.parallel import DistributedDataParallel as DDP
17
+ from torch.utils.data.distributed import DistributedSampler
18
+ import torch.multiprocessing as mp
19
+
20
+ # Set CUDA_VISIBLE_DEVICES
21
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
22
+ device = "cuda" if torch.cuda.is_available() else "cpu"
23
+
24
+ # === Helpers ===
25
def walkDIR(folder_path, include=None):
    """Walk folder_path recursively and return paths of matching files."""
    suffixes = tuple(include) if include is not None else None
    found = [
        os.path.join(root, fname)
        for root, _, fnames in os.walk(folder_path)
        for fname in fnames
        if suffixes is None or fname.endswith(suffixes)
    ]
    print("Files found:", len(found))
    return found
33
+
34
+ # === Dataset Class ===
35
class VaaniDataset(torch.utils.data.Dataset):
    """Yields RGB images resized to (im_size, im_size), scaled to [0, 1] float32."""

    def __init__(self, files_paths, im_size):
        self.files_paths = files_paths
        self.im_size = im_size

    def __len__(self):
        return len(self.files_paths)

    def __getitem__(self, idx):
        rgb = Image.open(self.files_paths[idx]).convert("RGB")
        as_tensor = v2.ToImage()(rgb)
        resized = v2.Resize((self.im_size, self.im_size))(as_tensor)
        return v2.ToDtype(torch.float32, scale=True)(resized)
49
+
50
+ # === Modified create_dataloader for DDP and single GPU ===
51
def create_dataloader(dataset, batch_size, debug=False, val_split=0.1, num_workers=4, rank=None, is_distributed=False):
    """Build the training DataLoader, sharded via DistributedSampler under DDP.

    val_split is currently unused and kept only for interface compatibility.
    """
    if debug:
        s = 0.001
        # FIX: dedicated generator keeps the split reproducible without the
        # global-RNG side effect of torch.manual_seed(42).
        gen = torch.Generator().manual_seed(42)
        dataset, _ = torch.utils.data.random_split(dataset, [s, 1 - s], generator=gen)
    prefix = f"Rank {rank}: " if rank is not None else ""
    print(f"{prefix}Length of Train dataset: {len(dataset)}")

    # DistributedSampler shards the dataset across ranks; plain shuffle otherwise.
    sampler = DistributedSampler(dataset, shuffle=True) if is_distributed else None
    train_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(sampler is None),
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
        # FIX: persistent_workers=True is invalid (raises) with num_workers == 0.
        persistent_workers=num_workers > 0,
    )

    images = next(iter(train_dataloader))
    print(f"{prefix}Total Batches: {len(train_dataloader)}")
    print(f"{prefix}BATCH SHAPE: {images.shape}")
    return train_dataloader
78
+
79
+ # === Audio Context Projector ===
80
class AudioContextProjector(nn.Module):
    """Attention-pools (B, T, audio_embed_dim) audio frames into a single
    (B, 1, output_dim) cross-attention token."""

    def __init__(self, audio_embed_dim=1280, output_dim=768):
        super().__init__()
        self.audio_embed_dim = audio_embed_dim
        self.output_dim = output_dim
        self.context_projector = nn.Sequential(
            nn.Linear(audio_embed_dim, 320),
            nn.SiLU(),
            nn.Linear(320, output_dim),
        )

    def forward(self, audio_embedding):
        if audio_embedding.size(-1) != self.audio_embed_dim:
            raise ValueError(f"Expected audio embedding dim {self.audio_embed_dim}, got {audio_embedding.size(-1)}")
        per_frame = self.context_projector(audio_embedding)    # [B, T, output_dim]
        attn = F.softmax(per_frame.mean(dim=2), dim=1)         # [B, T]
        # Weighted temporal sum, keeping a length-1 sequence axis.
        return torch.sum(per_frame * attn.unsqueeze(-1), dim=1, keepdim=True)
100
+
101
+ # === Inference Function ===
102
def run_inference(pipe, unet, vae, device, context_hidden_states, save_path="inference_output.png", rank=0):
    """Denoise from random latents and save one sample image (rank 0 only)."""
    if rank != 0:  # only one process writes samples
        return
    # Unwrap DDP so the pipeline holds the raw modules.
    pipe.unet = unet.module if isinstance(unet, DDP) else unet
    pipe.vae = vae.module if isinstance(vae, DDP) else vae
    pipe.to(device)

    batch_size = 1
    # FIX: in_channels must come from the config; the bare attribute is
    # deprecated in recent diffusers releases.
    latents = torch.randn((batch_size, pipe.unet.config.in_channels, 64, 64), device=device, dtype=torch.float16)
    pipe.scheduler.set_timesteps(50)
    latents = latents * pipe.scheduler.init_noise_sigma

    expected_shape = (batch_size, 1, 768)  # pooled audio token shape
    if context_hidden_states.shape != expected_shape:
        raise ValueError(f"Expected context_hidden_states shape {expected_shape}, got {context_hidden_states.shape}")

    for t in pipe.scheduler.timesteps:
        with torch.no_grad():
            noise_pred = pipe.unet(latents, t, encoder_hidden_states=context_hidden_states).sample
        latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample

    latents = 1 / pipe.vae.config.scaling_factor * latents
    with torch.no_grad():
        image = pipe.vae.decode(latents).sample

    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
    image = Image.fromarray((image * 255).astype("uint8"))
    image.save(save_path)
    # The early return above guarantees rank == 0 here, so no rank prefix is needed.
    print(f"Inference image saved to {save_path}")
132
+
133
+ # === Load Pipeline ===
134
def load_pipeline(model_id, device):
    """Load the SD pipeline in fp16 on `device`; return (pipe, unet, vae)."""
    pipeline = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    pipeline = pipeline.to(device)
    return pipeline, pipeline.unet, pipeline.vae
139
+
140
+ # === Freeze Layers Function ===
141
def freeze_vae_layers(vae):
    """Encoder side frozen, decoder side trainable."""
    vae.encoder.requires_grad_(False)
    vae.quant_conv.requires_grad_(False)
    for part in (vae.decoder, vae.post_quant_conv):
        part.requires_grad_(True)
146
+
147
def freeze_unet_layers(unet):
    """Only 'attn2' / 'conv2' parameters stay trainable; everything else is frozen."""
    for param_name, param in unet.named_parameters():
        keep = "attn2" in param_name or "conv2" in param_name
        param.requires_grad = keep
153
+
154
+ # === Optimizer Setup ===
155
def setup_optimizer(vae, unet, projector, lr):
    """AdamW over the trainable parameters of vae, unet, and projector (in that order)."""
    params = []
    for module in (vae, unet, projector):
        params.extend(p for p in module.parameters() if p.requires_grad)
    return optim.AdamW(params, lr=lr)
161
+
162
+ # === Gradient Accumulation Function ===
163
def accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader):
    """Scale-and-backprop; apply the optimizer every N micro-batches or at epoch end."""
    scaled = loss / gradient_accumulation_steps
    scaled.backward()
    next_step = step + 1
    if next_step % gradient_accumulation_steps == 0 or next_step == len(dataloader):
        optimizer.step()
        optimizer.zero_grad()
169
+
170
+ # === Save Checkpoint Function ===
171
def save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path, rank=0):
    """Write a full training checkpoint; no-op on non-zero DDP ranks."""
    if rank != 0:
        return

    def raw_state(model):
        # Store the inner module's state so checkpoints load without a DDP prefix.
        return model.module.state_dict() if isinstance(model, DDP) else model.state_dict()

    torch.save({
        'epoch': epoch,
        'unet': raw_state(unet),
        'vae': raw_state(vae),
        'projector': raw_state(projector),
        'optimizer': optimizer.state_dict(),
    }, checkpoint_path)
    # Guard above guarantees rank == 0 here.
    print(f"Checkpoint saved to {checkpoint_path}")
182
+
183
+ # === Resume from Checkpoint Function ===
184
def resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer, rank=0):
    """Restore model/optimizer state if a checkpoint exists; return the start epoch.

    FIX: every rank now restores (the previous code returned early for
    rank != 0, which left non-zero replicas out of sync with rank 0 after a
    resume). State dicts are loaded into the underlying module when the model
    is DDP-wrapped, matching how save_checkpoint stores them.
    """
    if not os.path.exists(checkpoint_path):
        print(f"{'Rank ' + str(rank) + ': ' if rank != 0 else ''}No checkpoint found, starting from scratch.")
        return 0
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    for model, key in ((unet, 'unet'), (vae, 'vae'), (projector, 'projector')):
        target = model.module if isinstance(model, DDP) else model
        target.load_state_dict(checkpoint[key])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"{'Rank ' + str(rank) + ': ' if rank != 0 else ''}Resuming training from epoch {start_epoch}...")
    return start_epoch
199
+
200
+ # === Training Loop Function ===
201
def train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs, samples_path, checkpoint_path, pipe, projector, rank=0, is_distributed=False):
    """Fine-tune under (optional) DDP; rank 0 writes checkpoints and samples each epoch."""
    start_epoch = resume_from_checkpoint(checkpoint_path, unet, vae, projector, optimizer, rank)
    criterion = nn.MSELoss()

    for epoch in trange(start_epoch, num_epochs, colour='red', desc=f"{'Rank ' + str(rank) + ' ' if rank != 0 else ''}{device}-training", dynamic_ncols=True):
        unet.train()
        vae.train()
        projector.train()
        total_loss = 0
        step = 0

        # DistributedSampler only reshuffles per epoch when told the epoch number.
        if is_distributed and isinstance(dataloader.sampler, DistributedSampler):
            dataloader.sampler.set_epoch(epoch)

        for image in tqdm(dataloader, colour='green', desc=f"{'Rank ' + str(rank) + ' ' if rank != 0 else ''}{device}-batch", dynamic_ncols=True):
            image = image.to(device, dtype=torch.float16)

            # NOTE(review): when is_distributed, `vae` is a DDP wrapper and
            # DDP does not forward .encode — this likely needs
            # vae.module.encode under DDP; confirm before multi-GPU runs.
            latents = vae.encode(image).latent_dist.sample() * 0.18215
            noise = torch.randn_like(latents)
            timesteps = torch.randint(0, pipe.scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()

            # Dummy (silent) audio conditioning placeholder.
            dummy_audio = torch.zeros(image.size(0), 1500, 1280, device=device, dtype=torch.float16)
            context_hidden_states = projector(dummy_audio)

            # FIX: noising must follow the scheduler's alphas; `latents + noise`
            # ignores the timestep and trains on off-schedule inputs.
            noisy_latents = pipe.scheduler.add_noise(latents, noise, timesteps)
            noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states=context_hidden_states).sample
            loss = criterion(noise_pred, noise)
            total_loss += loss.item()

            step += 1
            accumulate_gradients(optimizer, loss, gradient_accumulation_steps, step, dataloader)

        if is_distributed:
            # Average the epoch loss across all replicas.
            total_loss_tensor = torch.tensor(total_loss, device=device)
            dist.all_reduce(total_loss_tensor, op=dist.ReduceOp.SUM)
            avg_loss = total_loss_tensor.item() / (len(dataloader) * dist.get_world_size())
        else:
            avg_loss = total_loss / len(dataloader)

        if rank == 0:
            print(f"Epoch {epoch + 1} | Avg Loss: {avg_loss:.6f}")

        save_checkpoint(epoch, unet, vae, projector, optimizer, checkpoint_path, rank)
        run_inference(pipe, unet, vae, device, context_hidden_states,
                      save_path=f"{samples_path}/inference_epoch{epoch + 1}{'_rank' + str(rank) if rank != 0 else ''}.png",
                      rank=rank)

    if rank == 0:
        print("✅ Fine-tuning complete.")
250
+
251
+ # === DDP Setup Function ===
252
def setup_ddp(rank, world_size):
    """Join the NCCL process group on localhost and pin this process to its GPU."""
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'  # all ranks must agree on this rendezvous port
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
257
+
258
+ # === Main Function ===
259
def main(rank=0, world_size=1, is_distributed=False):
    """Per-process entry point: set up (optional) DDP, build components, train."""
    if is_distributed:
        setup_ddp(rank, world_size)
        device = torch.device(f"cuda:{rank}")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_id = "runwayml/stable-diffusion-v1-5"
    lr = 1e-5
    num_epochs = 10
    batch_size = 16
    debug = False
    gradient_accumulation_steps = 1

    # Only rank 0 creates directories; others wait at the barrier.
    if rank == 0:
        os.makedirs("./checkpoints", exist_ok=True)
        os.makedirs("./samples", exist_ok=True)
    if is_distributed:
        dist.barrier()

    checkpoint_path = f"./checkpoints/checkpoint{'_rank' + str(rank) if is_distributed else ''}.pth"
    samples_path = "./samples"
    image_dir = "/scratch/IITB/ai-at-ieor/23m1521/datasets/Vaani/Images"

    pipe, unet, vae = load_pipeline(model_id, device)
    freeze_vae_layers(vae)
    freeze_unet_layers(unet)
    projector = AudioContextProjector(audio_embed_dim=1280, output_dim=768).to(device).half()

    if is_distributed:
        unet = DDP(unet, device_ids=[rank])
        vae = DDP(vae, device_ids=[rank])
        projector = DDP(projector, device_ids=[rank])

    optimizer = setup_optimizer(vae, unet, projector, lr)

    files = walkDIR(image_dir, include=['.png', '.jpeg', '.jpg'])
    dataset = VaaniDataset(files_paths=files, im_size=256)
    if rank == 0:
        sample = dataset[2]
        print(f"{'Rank ' + str(rank) + ': ' if rank != 0 else ''}IMAGE SHAPE: {sample.shape}, Dataset len: {len(dataset)}")

    dataloader = create_dataloader(dataset, batch_size, debug=debug, rank=rank, is_distributed=is_distributed)

    train_loop(dataloader, unet, vae, optimizer, gradient_accumulation_steps, device, num_epochs,
               samples_path, checkpoint_path, pipe, projector, rank, is_distributed)

    if is_distributed:
        dist.destroy_process_group()
308
+
309
# === Entry Point ===
if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    print(f"Detected {n_gpus} GPU(s)")
    if n_gpus > 1:
        # mp.spawn prepends the rank, so args supplies (world_size, is_distributed).
        mp.spawn(main, args=(n_gpus, True), nprocs=n_gpus, join=True)
    else:
        main(rank=0, world_size=1, is_distributed=False)
Vaani/SDFT/checkpoints/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79469e5ae61b7894df2b96cdb09b873f9d0e2282f8b85d4195c5dbd16e182891
3
+ size 2866661866
Vaani/SDFT/download_model.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, StableDiffusion3Pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

# pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
# pipe
# del pipe

# Downloading/caching the checkpoint is the intended side effect here;
# the pipeline object itself is not used afterwards.
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16)
pipe
# del pipe
Vaani/SDFT/vaani-stablediffusion-finetune-kaggle.ipynb ADDED
@@ -0,0 +1,650 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {
7
+ "trusted": true
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "import torch\n",
12
+ "from torch import nn\n",
13
+ "from torch.utils.data import Dataset, DataLoader\n",
14
+ "from transformers import CLIPTextModel, CLIPTokenizer\n",
15
+ "from diffusers import StableDiffusionPipeline, UNet2DConditionModel\n",
16
+ "from diffusers.optimization import get_scheduler\n",
17
+ "from accelerate import Accelerator\n",
18
+ "import torchaudio"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 5,
24
+ "metadata": {
25
+ "trusted": true
26
+ },
27
+ "outputs": [
28
+ {
29
+ "name": "stderr",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "Couldn't connect to the Hub: (MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/models/runwayml/stable-diffusion-v1-5 (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x7f99cc2c77d0>: Failed to resolve \\'huggingface.co\\' ([Errno -2] Name or service not known)\"))'), '(Request ID: 85a7f948-b1d1-4bb4-be97-0eaea2bfd0f8)').\n",
33
+ "Will try to load from local cache.\n",
34
+ "Loading pipeline components...: 100%|██████████| 7/7 [00:43<00:00, 6.22s/it]\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
40
+ "pipe = StableDiffusionPipeline.from_pretrained(\"runwayml/stable-diffusion-v1-5\", torch_dtype=torch.float16).to(device)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 6,
46
+ "metadata": {
47
+ "trusted": true
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "unet = pipe.unet\n",
52
+ "vae = pipe.vae\n",
53
+ "tokenizer = pipe.tokenizer"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "metadata": {
60
+ "trusted": true
61
+ },
62
+ "outputs": [],
63
+ "source": [
64
+ "# Your text prompt\n",
65
+ "prompt = \"a photo of an astronaut riding a horse on mars\"\n",
66
+ "\n",
67
+ "# Generate image\n",
68
+ "with torch.autocast(\"cuda\"):\n",
69
+ " image = pipe(prompt).images[0]\n",
70
+ "\n",
71
+ "# Show or save the result\n",
72
+ "image.show() # Opens in default image viewer\n",
73
+ "image.save(\"astronaut_horse_mars.png\")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "markdown",
78
+ "metadata": {},
79
+ "source": [
80
+ "<hr style=\"height:4px;border:none;color:#ff0000;background-color:#ff0000;\">"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 8,
86
+ "metadata": {
87
+ "execution": {
88
+ "iopub.execute_input": "2025-05-14T14:30:58.653987Z",
89
+ "iopub.status.busy": "2025-05-14T14:30:58.653745Z",
90
+ "iopub.status.idle": "2025-05-14T14:30:58.658276Z",
91
+ "shell.execute_reply": "2025-05-14T14:30:58.657649Z",
92
+ "shell.execute_reply.started": "2025-05-14T14:30:58.653970Z"
93
+ },
94
+ "trusted": true
95
+ },
96
+ "outputs": [],
97
+ "source": [
98
+ "import torch\n",
99
+ "from torch.utils.data import Dataset, DataLoader\n",
100
+ "from torchvision import transforms\n",
101
+ "from torchvision.transforms import v2\n",
102
+ "from PIL import Image\n",
103
+ "from diffusers import StableDiffusionPipeline\n",
104
+ "from diffusers.optimization import get_scheduler\n",
105
+ "from accelerate import Accelerator\n",
106
+ "from torch import nn\n",
107
+ "import os\n",
108
+ "import pandas as pd\n",
109
+ "from tqdm import trange, tqdm"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 9,
115
+ "metadata": {
116
+ "execution": {
117
+ "iopub.execute_input": "2025-05-14T14:30:58.659588Z",
118
+ "iopub.status.busy": "2025-05-14T14:30:58.658976Z",
119
+ "iopub.status.idle": "2025-05-14T14:31:23.063776Z",
120
+ "shell.execute_reply": "2025-05-14T14:31:23.063145Z",
121
+ "shell.execute_reply.started": "2025-05-14T14:30:58.659571Z"
122
+ },
123
+ "trusted": true
124
+ },
125
+ "outputs": [
126
+ {
127
+ "name": "stdout",
128
+ "output_type": "stream",
129
+ "text": [
130
+ "Files found: 128807\n"
131
+ ]
132
+ },
133
+ {
134
+ "data": {
135
+ "text/html": [
136
+ "<div>\n",
137
+ "<style scoped>\n",
138
+ " .dataframe tbody tr th:only-of-type {\n",
139
+ " vertical-align: middle;\n",
140
+ " }\n",
141
+ "\n",
142
+ " .dataframe tbody tr th {\n",
143
+ " vertical-align: top;\n",
144
+ " }\n",
145
+ "\n",
146
+ " .dataframe thead th {\n",
147
+ " text-align: right;\n",
148
+ " }\n",
149
+ "</style>\n",
150
+ "<table border=\"1\" class=\"dataframe\">\n",
151
+ " <thead>\n",
152
+ " <tr style=\"text-align: right;\">\n",
153
+ " <th></th>\n",
154
+ " <th>image_path</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <th>0</th>\n",
160
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
161
+ " </tr>\n",
162
+ " <tr>\n",
163
+ " <th>1</th>\n",
164
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>2</th>\n",
168
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
169
+ " </tr>\n",
170
+ " <tr>\n",
171
+ " <th>3</th>\n",
172
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>4</th>\n",
176
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>...</th>\n",
180
+ " <td>...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>128802</th>\n",
184
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
185
+ " </tr>\n",
186
+ " <tr>\n",
187
+ " <th>128803</th>\n",
188
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>128804</th>\n",
192
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>128805</th>\n",
196
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
197
+ " </tr>\n",
198
+ " <tr>\n",
199
+ " <th>128806</th>\n",
200
+ " <td>/kaggle/input/vaani-images-tar/Images/IISc_Vaa...</td>\n",
201
+ " </tr>\n",
202
+ " </tbody>\n",
203
+ "</table>\n",
204
+ "<p>128807 rows × 1 columns</p>\n",
205
+ "</div>"
206
+ ],
207
+ "text/plain": [
208
+ " image_path\n",
209
+ "0 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
210
+ "1 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
211
+ "2 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
212
+ "3 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
213
+ "4 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
214
+ "... ...\n",
215
+ "128802 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
216
+ "128803 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
217
+ "128804 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
218
+ "128805 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
219
+ "128806 /kaggle/input/vaani-images-tar/Images/IISc_Vaa...\n",
220
+ "\n",
221
+ "[128807 rows x 1 columns]"
222
+ ]
223
+ },
224
+ "execution_count": 9,
225
+ "metadata": {},
226
+ "output_type": "execute_result"
227
+ }
228
+ ],
229
+ "source": [
230
+ "IMAGES_PATH = r\"/kaggle/input/vaani-images-tar/Images\"\n",
231
+ "\n",
232
+ "def walkDIR(folder_path, include=None):\n",
233
+ " file_list = []\n",
234
+ " for root, _, files in os.walk(folder_path):\n",
235
+ " for file in files:\n",
236
+ " if include is None or any(file.endswith(ext) for ext in include):\n",
237
+ " file_list.append(os.path.join(root, file))\n",
238
+ " print(\"Files found:\", len(file_list))\n",
239
+ " return file_list\n",
240
+ "\n",
241
+ "files = walkDIR(IMAGES_PATH, include=['.png', '.jpeg', '.jpg'])\n",
242
+ "df = pd.DataFrame(files, columns=['image_path'])\n",
243
+ "df"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 10,
249
+ "metadata": {
250
+ "execution": {
251
+ "iopub.execute_input": "2025-05-14T14:31:23.065017Z",
252
+ "iopub.status.busy": "2025-05-14T14:31:23.064553Z",
253
+ "iopub.status.idle": "2025-05-14T14:31:23.086417Z",
254
+ "shell.execute_reply": "2025-05-14T14:31:23.085628Z",
255
+ "shell.execute_reply.started": "2025-05-14T14:31:23.064991Z"
256
+ },
257
+ "trusted": true
258
+ },
259
+ "outputs": [
260
+ {
261
+ "name": "stdout",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "IMAGE SHAPE: torch.Size([3, 256, 256])\n"
265
+ ]
266
+ },
267
+ {
268
+ "data": {
269
+ "text/plain": [
270
+ "128807"
271
+ ]
272
+ },
273
+ "execution_count": 10,
274
+ "metadata": {},
275
+ "output_type": "execute_result"
276
+ }
277
+ ],
278
+ "source": [
279
+ "class VaaniDataset(torch.utils.data.Dataset):\n",
280
+ " def __init__(self, files_paths, im_size):\n",
281
+ " self.files_paths = files_paths\n",
282
+ " self.im_size = im_size\n",
283
+ "\n",
284
+ " def __len__(self):\n",
285
+ " return len(self.files_paths)\n",
286
+ "\n",
287
+ " def __getitem__(self, idx):\n",
288
+ " # image = tv.io.read_image(self.files_paths[idx], mode=tv.io.ImageReadMode.RGB)\n",
289
+ " image = Image.open(self.files_paths[idx]).convert(\"RGB\")\n",
290
+ " image = v2.ToImage()(image)\n",
291
+ " # image = tv.io.decode_image(self.files_paths[idx], mode=tv.io.ImageReadMode.RGB)\n",
292
+ " image = v2.Resize((self.im_size, self.im_size))(image)\n",
293
+ " image = v2.ToDtype(torch.float32, scale=True)(image)\n",
294
+ " # image = 2*image - 1\n",
295
+ " return image\n",
296
+ "\n",
297
+ "dataset = VaaniDataset(files_paths=files, im_size=256)\n",
298
+ "image = dataset[2]\n",
299
+ "print('IMAGE SHAPE:', image.shape)\n",
300
+ "len(dataset)"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "code",
305
+ "execution_count": 11,
306
+ "metadata": {
307
+ "execution": {
308
+ "iopub.execute_input": "2025-05-14T14:31:23.087483Z",
309
+ "iopub.status.busy": "2025-05-14T14:31:23.087211Z",
310
+ "iopub.status.idle": "2025-05-14T14:31:23.468810Z",
311
+ "shell.execute_reply": "2025-05-14T14:31:23.465992Z",
312
+ "shell.execute_reply.started": "2025-05-14T14:31:23.087458Z"
313
+ },
314
+ "trusted": true
315
+ },
316
+ "outputs": [
317
+ {
318
+ "name": "stdout",
319
+ "output_type": "stream",
320
+ "text": [
321
+ "Length of Train dataset: 129\n",
322
+ "BATCH SHAPE: torch.Size([2, 3, 256, 256])\n"
323
+ ]
324
+ }
325
+ ],
326
+ "source": [
327
+ "debug = True\n",
328
+ "\n",
329
+ "if debug:\n",
330
+ " s = 0.001\n",
331
+ " dataset, _ = torch.utils.data.random_split(dataset, [s, 1-s], torch.manual_seed(42))\n",
332
+ " print(\"Length of Train dataset:\", len(dataset))\n",
333
+ "\n",
334
+ "BATCH_SIZE = 2\n",
335
+ "\n",
336
+ "dataloader = torch.utils.data.DataLoader(\n",
337
+ " dataset, \n",
338
+ " batch_size=BATCH_SIZE, \n",
339
+ " shuffle=True, \n",
340
+ " num_workers=4,\n",
341
+ " pin_memory=True,\n",
342
+ " drop_last=True,\n",
343
+ " persistent_workers=True\n",
344
+ ")\n",
345
+ "\n",
346
+ "images = next(iter(dataloader))\n",
347
+ "print('BATCH SHAPE:', images.shape)"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": 17,
353
+ "metadata": {
354
+ "execution": {
355
+ "iopub.execute_input": "2025-05-14T14:31:59.796334Z",
356
+ "iopub.status.busy": "2025-05-14T14:31:59.795660Z",
357
+ "iopub.status.idle": "2025-05-14T14:31:59.800889Z",
358
+ "shell.execute_reply": "2025-05-14T14:31:59.800295Z",
359
+ "shell.execute_reply.started": "2025-05-14T14:31:59.796311Z"
360
+ },
361
+ "trusted": true
362
+ },
363
+ "outputs": [
364
+ {
365
+ "data": {
366
+ "text/plain": [
367
+ "64"
368
+ ]
369
+ },
370
+ "execution_count": 17,
371
+ "metadata": {},
372
+ "output_type": "execute_result"
373
+ }
374
+ ],
375
+ "source": [
376
+ "len(dataloader)"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 12,
382
+ "metadata": {
383
+ "execution": {
384
+ "iopub.execute_input": "2025-05-14T14:31:23.470858Z",
385
+ "iopub.status.busy": "2025-05-14T14:31:23.470503Z",
386
+ "iopub.status.idle": "2025-05-14T14:31:28.213003Z",
387
+ "shell.execute_reply": "2025-05-14T14:31:28.212168Z",
388
+ "shell.execute_reply.started": "2025-05-14T14:31:23.470801Z"
389
+ },
390
+ "scrolled": true,
391
+ "trusted": true
392
+ },
393
+ "outputs": [
394
+ {
395
+ "data": {
396
+ "application/vnd.jupyter.widget-view+json": {
397
+ "model_id": "28c0c220b2cf45968b4abdecf3936bc9",
398
+ "version_major": 2,
399
+ "version_minor": 0
400
+ },
401
+ "text/plain": [
402
+ "Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s]"
403
+ ]
404
+ },
405
+ "metadata": {},
406
+ "output_type": "display_data"
407
+ }
408
+ ],
409
+ "source": [
410
+ "# Load pretrained Stable Diffusion\n",
411
+ "pipe = StableDiffusionPipeline.from_pretrained(\n",
412
+ " \"runwayml/stable-diffusion-v1-5\", \n",
413
+ " torch_dtype=torch.float16\n",
414
+ ").to(\"cuda\")\n",
415
+ "\n",
416
+ "unet = pipe.unet\n",
417
+ "vae = pipe.vae\n",
418
+ "scheduler = pipe.scheduler"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 15,
424
+ "metadata": {
425
+ "execution": {
426
+ "iopub.execute_input": "2025-05-14T14:31:39.847880Z",
427
+ "iopub.status.busy": "2025-05-14T14:31:39.847601Z",
428
+ "iopub.status.idle": "2025-05-14T14:31:39.868068Z",
429
+ "shell.execute_reply": "2025-05-14T14:31:39.867331Z",
430
+ "shell.execute_reply.started": "2025-05-14T14:31:39.847863Z"
431
+ },
432
+ "trusted": true
433
+ },
434
+ "outputs": [],
435
+ "source": [
436
+ "# Optimizer and scheduler\n",
437
+ "optimizer = torch.optim.AdamW(unet.parameters(), lr=1e-5)\n",
438
+ "lr_scheduler = get_scheduler(\"linear\", optimizer=optimizer, num_warmup_steps=100, num_training_steps=1000)\n",
439
+ "\n",
440
+ "\n",
441
+ "accelerator = Accelerator()\n",
442
+ "unet, optimizer, dataloader = accelerator.prepare(unet, optimizer, dataloader)"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 16,
448
+ "metadata": {
449
+ "execution": {
450
+ "iopub.execute_input": "2025-05-14T14:31:42.759171Z",
451
+ "iopub.status.busy": "2025-05-14T14:31:42.758886Z",
452
+ "iopub.status.idle": "2025-05-14T14:31:42.763012Z",
453
+ "shell.execute_reply": "2025-05-14T14:31:42.762387Z",
454
+ "shell.execute_reply.started": "2025-05-14T14:31:42.759152Z"
455
+ },
456
+ "trusted": true
457
+ },
458
+ "outputs": [],
459
+ "source": [
460
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 19,
466
+ "metadata": {
467
+ "execution": {
468
+ "iopub.execute_input": "2025-05-14T14:40:23.644302Z",
469
+ "iopub.status.busy": "2025-05-14T14:40:23.643598Z",
470
+ "iopub.status.idle": "2025-05-14T14:40:23.648831Z",
471
+ "shell.execute_reply": "2025-05-14T14:40:23.648151Z",
472
+ "shell.execute_reply.started": "2025-05-14T14:40:23.644244Z"
473
+ },
474
+ "trusted": true
475
+ },
476
+ "outputs": [],
477
+ "source": [
478
+ "EPOCHS = 100"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "metadata": {
485
+ "execution": {
486
+ "iopub.execute_input": "2025-05-14T14:40:35.244187Z",
487
+ "iopub.status.busy": "2025-05-14T14:40:35.243686Z"
488
+ },
489
+ "scrolled": true,
490
+ "trusted": true
491
+ },
492
+ "outputs": [
493
+ {
494
+ "name": "stderr",
495
+ "output_type": "stream",
496
+ "text": [
497
+ " 84%|\u001b[32m████████████████████████████████████████████████████ \u001b[0m| 84/100 [39:16<07:28, 28.02s/it]\u001b[0m"
498
+ ]
499
+ }
500
+ ],
501
+ "source": [
502
+ "# Start training loop\n",
503
+ "for epoch in trange(EPOCHS, ncols=100, colour='green'):\n",
504
+ " for step, images in enumerate(dataloader):\n",
505
+ " images = images.to(device, dtype=torch.float16)\n",
506
+ "\n",
507
+ " # Encode images to latents\n",
508
+ " latents = vae.encode(images).latent_dist.sample()\n",
509
+ " latents = latents * 0.18215\n",
510
+ "\n",
511
+ " # Sample noise and timesteps\n",
512
+ " noise = torch.randn_like(latents)\n",
513
+ " timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()\n",
514
+ " noisy_latents = scheduler.add_noise(latents, noise, timesteps)\n",
515
+ "\n",
516
+ " # Use zeroed audio embedding (like a null conditioning vector)\n",
517
+ " batch_size = images.shape[0]\n",
518
+ " cond_dim = pipe.text_encoder.config.hidden_size # 768 for SD 1.5\n",
519
+ " null_emb = torch.zeros((batch_size, 77, cond_dim), device=device, dtype=torch.float16)\n",
520
+ "\n",
521
+ " # Predict noise\n",
522
+ " noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states=null_emb).sample\n",
523
+ "\n",
524
+ " # Loss and backward\n",
525
+ " loss = nn.MSELoss()(noise_pred, noise)\n",
526
+ " accelerator.backward(loss)\n",
527
+ " optimizer.step()\n",
528
+ " lr_scheduler.step()\n",
529
+ " optimizer.zero_grad()\n",
530
+ "\n",
531
+ " # if step % 10 == 0:\n",
532
+ " # print(f\"Epoch {epoch}, Step {step}, Loss: {loss.item():.4f}\")"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "markdown",
537
+ "metadata": {},
538
+ "source": [
539
+ "# Sampling"
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": null,
545
+ "metadata": {
546
+ "trusted": true
547
+ },
548
+ "outputs": [],
549
+ "source": [
550
+ "import torch\n",
551
+ "from diffusers import StableDiffusionPipeline, DDIMScheduler\n",
552
+ "from PIL import Image\n",
553
+ "\n",
554
+ "# Load pretrained (or fine-tuned) Stable Diffusion\n",
555
+ "pipe = StableDiffusionPipeline.from_pretrained(\n",
556
+ " \"runwayml/stable-diffusion-v1-5\",\n",
557
+ " torch_dtype=torch.float16,\n",
558
+ ")\n",
559
+ "pipe.to(\"cuda\")\n",
560
+ "\n",
561
+ "# Optionally load fine-tuned weights\n",
562
+ "pipe.unet.load_state_dict(torch.load(\"path/to/fine_tuned_unet.pth\"))"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": null,
568
+ "metadata": {
569
+ "trusted": true
570
+ },
571
+ "outputs": [],
572
+ "source": [
573
+ "# Prepare dummy zero embedding\n",
574
+ "batch_size = 1\n",
575
+ "seq_len = 77 # number of tokens (CLIP text length)\n",
576
+ "embed_dim = pipe.text_encoder.config.hidden_size # 768 for CLIP\n",
577
+ "null_emb = torch.zeros((batch_size, seq_len, embed_dim), device=\"cuda\", dtype=torch.float16)\n",
578
+ "\n",
579
+ "# Sample initial noise\n",
580
+ "latents = torch.randn((batch_size, pipe.unet.in_channels, 64, 64), device=\"cuda\", dtype=torch.float16)\n",
581
+ "\n",
582
+ "# Use DDIM or default scheduler\n",
583
+ "pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)\n",
584
+ "\n",
585
+ "# Denoising loop\n",
586
+ "num_inference_steps = 50\n",
587
+ "pipe.scheduler.set_timesteps(num_inference_steps)\n",
588
+ "latents = latents * pipe.scheduler.init_noise_sigma\n",
589
+ "\n",
590
+ "for t in pipe.scheduler.timesteps:\n",
591
+ " # Predict noise using zero embedding\n",
592
+ " with torch.no_grad():\n",
593
+ " noise_pred = pipe.unet(latents, t, encoder_hidden_states=null_emb).sample\n",
594
+ "\n",
595
+ " # Compute the previous noisy sample\n",
596
+ " latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample\n",
597
+ "\n",
598
+ "# Decode latents to image\n",
599
+ "latents = 1 / 0.18215 * latents\n",
600
+ "with torch.no_grad():\n",
601
+ " image = pipe.vae.decode(latents).sample\n",
602
+ "\n",
603
+ "# Convert to PIL\n",
604
+ "image = (image / 2 + 0.5).clamp(0, 1)\n",
605
+ "image = image.cpu().permute(0, 2, 3, 1).numpy()[0]\n",
606
+ "image = Image.fromarray((image * 255).astype(\"uint8\"))\n",
607
+ "\n",
608
+ "# Save or show image\n",
609
+ "image.save(\"zero_condition_output.png\")\n",
610
+ "image.show()"
611
+ ]
612
+ }
613
+ ],
614
+ "metadata": {
615
+ "kaggle": {
616
+ "accelerator": "nvidiaTeslaT4",
617
+ "dataSources": [
618
+ {
619
+ "datasetId": 6964433,
620
+ "sourceId": 11161218,
621
+ "sourceType": "datasetVersion"
622
+ }
623
+ ],
624
+ "dockerImageVersionId": 31041,
625
+ "isGpuEnabled": true,
626
+ "isInternetEnabled": true,
627
+ "language": "python",
628
+ "sourceType": "notebook"
629
+ },
630
+ "kernelspec": {
631
+ "display_name": "Python 3",
632
+ "language": "python",
633
+ "name": "python3"
634
+ },
635
+ "language_info": {
636
+ "codemirror_mode": {
637
+ "name": "ipython",
638
+ "version": 3
639
+ },
640
+ "file_extension": ".py",
641
+ "mimetype": "text/x-python",
642
+ "name": "python",
643
+ "nbconvert_exporter": "python",
644
+ "pygments_lexer": "ipython3",
645
+ "version": "3.12.2"
646
+ }
647
+ },
648
+ "nbformat": 4,
649
+ "nbformat_minor": 4
650
+ }
Vaani/VaaniLDM/ddpm_ckpt_epoch31.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:339fa2458d3ead55689b3e219be0223ddc515874a4c03bb67bce527527076073
3
+ size 593243562
Vaani/VaaniLDM/ddpm_ckpt_epoch32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32a6ea3e6f6558014f9eb11da6263abf02f130fdd77643889cf088f6d7077359
3
+ size 593243626
Vaani/VaaniLDM/ldmH_ckpt_epoch24.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0abb3e97bfae10d4689aeeed14bd6fcd48472be5729870a4d179a74ff67982c7
3
+ size 2476368170
Vaani/VaaniLDM/ldmH_ckpt_epoch25.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd5c35e1948cb47ae74cf4a09e43cf6428d623bb3bb3bec0594057a195b7953
3
+ size 2476368234
Vaani/VaaniLDM/samples/x0_0.png CHANGED

Git LFS Details

  • SHA256: 5051b0c57b98915bbd30f8f413daa87c96e3bc117dd72adf55fee55c33d75516
  • Pointer size: 131 Bytes
  • Size of remote file: 421 kB

Git LFS Details

  • SHA256: a40423339bf5053f537333df664a4945fb160181e05ae139856e2b67cd77cc16
  • Pointer size: 131 Bytes
  • Size of remote file: 426 kB
Vaani/VaaniLDM/samples/x0_1.png CHANGED
Vaani/VaaniLDM/samples/x0_10.png CHANGED
Vaani/VaaniLDM/samples/x0_100.png CHANGED
Vaani/VaaniLDM/samples/x0_101.png CHANGED
Vaani/VaaniLDM/samples/x0_102.png CHANGED
Vaani/VaaniLDM/samples/x0_103.png CHANGED
Vaani/VaaniLDM/samples/x0_104.png CHANGED
Vaani/VaaniLDM/samples/x0_105.png CHANGED
Vaani/VaaniLDM/samples/x0_106.png CHANGED
Vaani/VaaniLDM/samples/x0_107.png CHANGED
Vaani/VaaniLDM/samples/x0_108.png CHANGED
Vaani/VaaniLDM/samples/x0_109.png CHANGED
Vaani/VaaniLDM/samples/x0_11.png CHANGED
Vaani/VaaniLDM/samples/x0_110.png CHANGED
Vaani/VaaniLDM/samples/x0_111.png CHANGED
Vaani/VaaniLDM/samples/x0_112.png CHANGED
Vaani/VaaniLDM/samples/x0_113.png CHANGED
Vaani/VaaniLDM/samples/x0_114.png CHANGED
Vaani/VaaniLDM/samples/x0_115.png CHANGED
Vaani/VaaniLDM/samples/x0_116.png CHANGED
Vaani/VaaniLDM/samples/x0_117.png CHANGED
Vaani/VaaniLDM/samples/x0_118.png CHANGED
Vaani/VaaniLDM/samples/x0_119.png CHANGED
Vaani/VaaniLDM/samples/x0_12.png CHANGED
Vaani/VaaniLDM/samples/x0_120.png CHANGED
Vaani/VaaniLDM/samples/x0_121.png CHANGED
Vaani/VaaniLDM/samples/x0_122.png CHANGED
Vaani/VaaniLDM/samples/x0_123.png CHANGED
Vaani/VaaniLDM/samples/x0_124.png CHANGED
Vaani/VaaniLDM/samples/x0_125.png CHANGED
Vaani/VaaniLDM/samples/x0_126.png CHANGED
Vaani/VaaniLDM/samples/x0_127.png CHANGED
Vaani/VaaniLDM/samples/x0_128.png CHANGED
Vaani/VaaniLDM/samples/x0_129.png CHANGED
Vaani/VaaniLDM/samples/x0_13.png CHANGED
Vaani/VaaniLDM/samples/x0_130.png CHANGED
Vaani/VaaniLDM/samples/x0_131.png CHANGED
Vaani/VaaniLDM/samples/x0_132.png CHANGED