Update app_quant_latent.py

app_quant_latent.py   CHANGED   (+70 -100)
```diff
@@ -250,107 +250,77 @@ log_system_stats("AFTER PIPELINE BUILD")
 
 @spaces.GPU
 def generate_image(prompt, height, width, steps, seed):
 
-
-
-    # -----------------------------
-    generator = torch.Generator("cuda").manual_seed(seed)
-
-    # Unet input size = (B, C, H/8, W/8)
-    latent_shape = (
-        1,
-        pipe.unet.config.in_channels,
-        height // 8,
-        width // 8
-    )
-
-    latents = torch.randn(latent_shape, generator=generator, device="cuda")
-    latents = latents * pipe.scheduler.init_noise_sigma
-
-    latent_history = []
-    log(f"Latent shape: {latent_shape}")
-
-    # -----------------------------
-    # 2) Text Embeddings
-    # -----------------------------
-    text_inputs = pipe.tokenizer(
-        prompt,
-        return_tensors="pt",
-        padding="max_length",
-        truncation=True,
-        max_length=pipe.tokenizer.model_max_length,
-    ).to("cuda")
-
-    text_embeddings = pipe.text_encoder(text_inputs.input_ids)[0]
-
-    # -----------------------------
-    # 3) Scheduler timesteps
-    # -----------------------------
-    pipe.scheduler.set_timesteps(steps, device="cuda")
-    timesteps = pipe.scheduler.timesteps
-
-    # -----------------------------
-    # 4) MANUAL DIFFUSION LOOP
-    # -----------------------------
-    for i, t in enumerate(timesteps):
-        with torch.no_grad():
-
-            # Forward UNET
-            noise_pred = pipe.unet(
-                latents,
-                t,
-                encoder_hidden_states=text_embeddings
-            ).sample
-
-            # Save latent copy
-            latent_history.append(
-                latents.detach().clone().to("cpu")
-            )
-
-            # Log GPU
-            gpu_gb = torch.cuda.memory_allocated() / 1e9
-            log(f"Step {i+1}/{steps} | t={int(t)} | GPU={gpu_gb:.2f} GB")
-
-            # Scheduler update
-            latents = pipe.scheduler.step(
-                noise_pred,
-                t,
-                latents
-            ).prev_sample
-
-    # -----------------------------
-    # 5) FINAL DECODE (VAE)
-    # -----------------------------
-    with torch.no_grad():
-        latents_final = latents / pipe.vae.config.scaling_factor
-        image = pipe.vae.decode(latents_final).sample[0]
-
-    # Convert to PIL
-    final_image = pipe.image_processor.postprocess(
-        image.unsqueeze(0),
-        output_type="pil"
-    )[0]
-
-    log("✅ Inference finished.")
-    log_system_stats("AFTER INFERENCE")
-
-    # -----------------------------
-    # Convert latent_history to images for gallery
-    # -----------------------------
-    latent_imgs = []
-    for lat in latent_history:
-        # Normalize each latent step into a displayable grayscale image
-        lat_img = lat[0, 0].cpu().numpy()
-        lat_img = (lat_img - lat_img.min()) / (lat_img.max() - lat_img.min() + 1e-8)
-        lat_img = (lat_img * 255).astype("uint8")
-        latent_imgs.append(Image.fromarray(lat_img))
-
-    return final_image, latent_imgs, LOGS
-
-    except Exception as e:
-        log(f"❌ Inference error: {e}")
-        return None, None, LOGS
+    try:
+        generator = torch.Generator(device).manual_seed(int(seed))
+        latent_history = []
+
+        # callback signature expected by ZImagePipeline:
+        # callback_on_step_end(self_pipeline, step_index, timestep, callback_kwargs_dict)
+        def save_latents(self_pipeline, step_idx, timestep, callback_kwargs):
+            # callback_kwargs contains the tensor inputs listed in
+            # callback_on_step_end_tensor_inputs (defaults to ["latents"])
+            try:
+                lat = callback_kwargs.get("latents", None)
+                if lat is not None:
+                    # store a CPU copy to avoid holding GPU memory
+                    latent_history.append(lat.detach().clone().cpu())
+                # the callback must return a dict (it may include overrides); none here:
+                return {}
+            except Exception as e:
+                log(f"⚠️ save_latents error: {e}")
+                return {}
+
+        # Run the pipeline once, using its built-in callback mechanism
+        out = pipe(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            guidance_scale=0.0,
+            generator=generator,
+            callback_on_step_end=save_latents,
+            callback_on_step_end_tensor_inputs=["latents"],  # ensure latents are passed to the callback
+        )
+
+        # out is a ZImagePipelineOutput; the pipeline has already postprocessed its images
+        final_image = out.images[0] if hasattr(out, "images") and len(out.images) > 0 else out
+
+        # Convert the saved latents into displayable images (same postprocessing as the pipeline)
+        latent_images = []
+        try:
+            # Determine the decode device and dtype
+            vae = pipe.vae
+            img_proc = pipe.image_processor
+            vae_device = vae.device if hasattr(vae, "device") else device
+
+            for i, lat_cpu in enumerate(latent_history):
+                try:
+                    # move to the VAE's device and dtype
+                    lat = lat_cpu.to(vae_device).to(vae.dtype)
+
+                    # the pipeline applies this transform before decoding:
+                    lat = (lat / vae.config.scaling_factor) + getattr(vae.config, "shift_factor", 0.0)
+
+                    # decode: vae.decode returns a (batch, C, H, W) tensor, already batched
+                    img_tensor = vae.decode(lat, return_dict=False)[0]
+
+                    # postprocess to PIL with the pipeline's image processor
+                    pil = img_proc.postprocess(img_tensor, output_type="pil")[0]
+                    latent_images.append(pil)
+                except Exception as e:
+                    log(f"⚠️ Failed to decode latent step {i}: {e}")
+        except Exception as e:
+            log(f"⚠️ Error while converting latents: {e}")
+
+        log("✅ Inference finished.")
+        log_system_stats("AFTER INFERENCE")
+
+        return final_image, latent_images, LOGS
+
+    except Exception as e:
+        log(f"❌ Inference error: {e}")
+        return None, [], LOGS
 
```
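For context, the new version drops the hand-rolled denoising loop and leans on the standard diffusers `callback_on_step_end` hook. Below is a minimal, self-contained sketch of the same latent-capture pattern against a stock diffusers pipeline; the checkpoint name and prompt are placeholders for illustration, not what this Space actually loads (it builds `pipe` elsewhere in app_quant_latent.py):

```python
import torch
from diffusers import StableDiffusionPipeline

# Placeholder checkpoint for illustration only.
pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

captured = []

def grab_latents(pipeline, step_index, timestep, callback_kwargs):
    # diffusers passes only the tensors named in callback_on_step_end_tensor_inputs.
    captured.append(callback_kwargs["latents"].detach().cpu())
    # Must return a dict; any entries returned here override pipeline state.
    return {}

image = pipe(
    "a lighthouse at dusk",
    num_inference_steps=20,
    callback_on_step_end=grab_latents,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]

print(f"captured {len(captured)} latent snapshots")  # one per denoising step
```

The contract is the one the diff's comments describe: the callback receives `(pipeline, step_index, timestep, callback_kwargs)`, sees only the tensors named in `callback_on_step_end_tensor_inputs`, and must return a dict, whose entries (if any) override the pipeline's state for the next step.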
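The per-step decode in the new code mirrors the transform diffusers pipelines apply before calling `vae.decode`: latents are stored scaled by `vae.config.scaling_factor` (and, for VAEs that define it, offset by `shift_factor`), so both must be undone first. A standalone sketch of that step, assuming `pipe` and one saved CPU latent as in the code above; the helper name is hypothetical:

```python
import torch

@torch.no_grad()
def decode_latent_to_pil(pipe, lat_cpu):
    """Decode one saved latent snapshot to a PIL image (sketch, not the app's code)."""
    vae = pipe.vae
    # Match the VAE's device and dtype before decoding.
    lat = lat_cpu.to(vae.device, dtype=vae.dtype)
    # Undo the scaling (and optional shift) applied when the latent was produced;
    # shift_factor may be absent or None on older VAE configs.
    shift = getattr(vae.config, "shift_factor", None) or 0.0
    lat = lat / vae.config.scaling_factor + shift
    # return_dict=False yields a (sample,) tuple; sample is already (B, C, H, W).
    img = vae.decode(lat, return_dict=False)[0]
    return pipe.image_processor.postprocess(img, output_type="pil")[0]
```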