Spaces:

rahul7star
/

Image2Video

Paused

App Files Files Community

rahul7star committed 21 days ago

Commit

66aa19e

verified ·

1 Parent(s): 2e8da0d

Update app_quant_latent.py

Browse files

Files changed (1) hide show

app_quant_latent.py +98 -42

app_quant_latent.py CHANGED Viewed

@@ -247,55 +247,111 @@ log_system_stats("AFTER PIPELINE BUILD")
 @spaces.GPU
 def generate_image(prompt, height, width, steps, seed):
-    # Pre-change body of generate_image (every line below carries the "-"
-    # removal marker). It runs `pipe` once with output_type="latent", then
-    # iterates the scheduler timesteps again by hand, collecting a CPU copy of
-    # the latents per step, and finally decodes everything through the VAE.
-    # Returns (final_image, latent_imgs, newline-joined log text).
-    global latent_history
-    latent_history = []   # reset every run
-    # The seed is coerced with int() before seeding the CUDA generator.
-    generator = torch.Generator("cuda").manual_seed(int(seed))
-    # Log lines are collected in a local list and joined on return.
-    logs = []
-    def log(msg):
-        logs.append(msg)
-    # Run pipeline manually step by step
-    out = pipe(
-        prompt=prompt,
-        height=height,
-        width=width,
-        num_inference_steps=steps,
-        generator=generator,
-        output_type="latent"
-    )
-    # NOTE(review): assumes the pipeline output exposes `.latents` (and
-    # `.prompt_embeds`, used below) - not visible in this hunk; confirm.
-    latents = out.latents
-    # Denoising loop - MANUAL callback
-    for i, t in enumerate(pipe.scheduler.timesteps):
-        # The UNet output is assigned back to `latents`, so the value stored
-        # in the history and handed to scheduler.step below is the model
-        # output itself.
-        latents = pipe.unet(latents, t, encoder_hidden_states=out.prompt_embeds).sample
-        # Store cloned latent
-        latent_history.append(latents.detach().cpu().clone())
-        # Log GPU memory
-        gpu = torch.cuda.memory_allocated() / 1e9
-        log(f"Step {i+1}/{steps} — GPU: {gpu:.2f} GB")
-        # Step scheduler
-        # NOTE(review): step() receives only the model output and the timestep
-        # here; the added version of this function passes the current sample
-        # as a third argument.
-        latents = pipe.scheduler.step(latents, timestep=t).prev_sample
-    # Decode final image
-    final_image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample[0]
-    # Rescale (x / 2 + 0.5), clamp to [0, 1], move to CPU, reorder axes to
-    # (1, 2, 0) and convert to a NumPy array.
-    final_image = (final_image / 2 + 0.5).clamp(0,1).cpu().permute(1,2,0).numpy()
-    # Convert latents to preview images
-    # Each stored latent gets its own VAE decode, i.e. one extra decode per
-    # recorded step.
-    latent_imgs = []
-    for l in latent_history:
-        img = pipe.vae.decode(l / pipe.vae.config.scaling_factor).sample[0]
-        img = (img / 2 + 0.5).clamp(0,1).cpu().permute(1,2,0).numpy()
-        latent_imgs.append(img)
-    return final_image, latent_imgs, "\n".join(logs)
 # ============================================================

 @spaces.GPU
 def generate_image(prompt, height, width, steps, seed):
+    """Run a manual denoising loop and record the latent at every step.
+
+    Args:
+        prompt: Text prompt passed to the pipeline tokenizer.
+        height: Output height; the latent height is ``height // 8``.
+        width: Output width; the latent width is ``width // 8``.
+        steps: Number of scheduler timesteps.
+        seed: Seed for the CUDA random generator.
+
+    Returns:
+        ``(final_image, latent_imgs, LOGS)``: the decoded PIL image, one
+        grayscale PIL preview per recorded latent (channel 0, min-max
+        normalised), and the shared log buffer. On any exception the error is
+        logged and ``(None, None, LOGS)`` is returned instead.
+    """
+    try:
+        # UI widgets may deliver numbers as floats; the previous version of
+        # this function coerced the seed with int(), and the shape tuple and
+        # set_timesteps need integers as well.
+        seed, steps = int(seed), int(steps)
+        height, width = int(height), int(width)
+        # Inference only: keep autograd off for the text encoder, the UNet
+        # and the VAE so no graph is retained on the GPU.
+        with torch.no_grad():
+            # -----------------------------
+            # 1) Text Embeddings
+            # -----------------------------
+            text_inputs = pipe.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding="max_length",
+                truncation=True,
+                max_length=pipe.tokenizer.model_max_length,
+            ).to("cuda")
+            text_embeddings = pipe.text_encoder(text_inputs.input_ids)[0]
+            # -----------------------------
+            # 2) Scheduler timesteps
+            # -----------------------------
+            # Must run before init_noise_sigma is read: sigma-based
+            # schedulers derive that value from the schedule set here.
+            pipe.scheduler.set_timesteps(steps, device="cuda")
+            timesteps = pipe.scheduler.timesteps
+            # -----------------------------
+            # 3) SEED + LATENT INIT
+            # -----------------------------
+            generator = torch.Generator("cuda").manual_seed(seed)
+            # Unet input size = (B, C, H/8, W/8)
+            latent_shape = (
+                1,
+                pipe.unet.config.in_channels,
+                height // 8,
+                width // 8,
+            )
+            # Match the encoder output dtype so a half-precision pipeline
+            # does not receive float32 latents.
+            latents = torch.randn(
+                latent_shape,
+                generator=generator,
+                device="cuda",
+                dtype=text_embeddings.dtype,
+            )
+            latents = latents * pipe.scheduler.init_noise_sigma
+            latent_history = []
+            log(f"Latent shape: {latent_shape}")
+            # -----------------------------
+            # 4) MANUAL DIFFUSION LOOP
+            # -----------------------------
+            # NOTE(review): no classifier-free guidance is applied; the
+            # prompt is used as a single conditional pass, as before.
+            for i, t in enumerate(timesteps):
+                # Schedulers that need it rescale the sample per timestep;
+                # for the others this call returns the input unchanged.
+                model_input = pipe.scheduler.scale_model_input(latents, t)
+                # Forward UNET
+                noise_pred = pipe.unet(
+                    model_input,
+                    t,
+                    encoder_hidden_states=text_embeddings,
+                ).sample
+                # Save latent copy (the sample that entered this step)
+                latent_history.append(latents.detach().clone().to("cpu"))
+                # Log GPU
+                gpu_gb = torch.cuda.memory_allocated() / 1e9
+                log(f"Step {i+1}/{steps} | t={int(t)} | GPU={gpu_gb:.2f} GB")
+                # Scheduler update
+                latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
+            # -----------------------------
+            # 5) FINAL DECODE (VAE)
+            # -----------------------------
+            decoded = pipe.vae.decode(
+                latents / pipe.vae.config.scaling_factor
+            ).sample
+        # Convert to PIL (decoded keeps its batch dimension of 1)
+        final_image = pipe.image_processor.postprocess(
+            decoded,
+            output_type="pil",
+        )[0]
+        log("✅ Inference finished.")
+        log_system_stats("AFTER INFERENCE")
+        # -----------------------------
+        # Convert latent_history to images for gallery
+        # -----------------------------
+        latent_imgs = []
+        for lat in latent_history:
+            # Normalize channel 0 of each step into a displayable grayscale
+            # image; .float() first because NumPy has no bfloat16.
+            lat_img = lat[0, 0].float().numpy()
+            lo, hi = lat_img.min(), lat_img.max()
+            lat_img = (lat_img - lo) / (hi - lo + 1e-8)
+            latent_imgs.append(Image.fromarray((lat_img * 255).astype("uint8")))
+        return final_image, latent_imgs, LOGS
+    except Exception as e:
+        # UI boundary: report the failure through the log panel instead of
+        # letting the exception escape to the caller.
+        log(f"❌ Inference error: {e}")
+        return None, None, LOGS
 # ============================================================