Spaces:

MilindChawre
/

stable-diffusion-using-text-inversion

Sleeping

App Files Files Community

MilindChawre committed on Mar 21, 2025

Commit

6304c5b

1 Parent(s): 45b110b

Modifying the app code

Browse files

Files changed (1) hide show

app.py +127 -83

app.py CHANGED Viewed

@@ -67,6 +67,12 @@ def image_loss(images, loss_type, device, elastic_transformer):
     else:
         return torch.tensor(0.0).to(device)
 def generate_images(prompt, concept):
     global pipe, device, elastic_transformer
     if pipe is None:
@@ -74,96 +80,133 @@ def generate_images(prompt, concept):
     if elastic_transformer is None:
         elastic_transformer = init_transformers(device)
-    # Configuration
-    height, width = 384, 384
-    guidance_scale = 8
-    num_inference_steps = 45
-    loss_scale = 10.0
-    # Create scheduler
-    scheduler = LMSDiscreteScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000
-    )
-    pipe.scheduler = scheduler  # Set the scheduler
-    # Create prompt text
     prompt_text = f"{prompt} {concept}"
-    # Predefined seeds for each loss function
-    seeds = {
-        'none': 42,
-        'blue': 123,
-        'elastic': 456,
-        'symmetry': 789,
-        'saturation': 1000
-    }
     loss_functions = ['none', 'blue', 'elastic', 'symmetry', 'saturation']
-    images = []
     progress = gr.Progress()
-    # Generate image for each loss function
     for idx, loss_type in enumerate(loss_functions):
         progress(idx/len(loss_functions), f"Generating {loss_type} image...")
-        generator = torch.manual_seed(seeds[loss_type])
-        # Generate base image
         try:
-            output = pipe(
-                prompt_text,
-                height=height,
-                width=width,
-                num_inference_steps=num_inference_steps,
-                guidance_scale=guidance_scale,
-                generator=generator
             )
-        except Exception as e:
-            print(f"Error generating image: {e}")
-            return None
-        # Apply loss function if not 'none'
-        if loss_type != 'none':
-            try:
-                # Convert PIL image to tensor and move to device
-                image_tensor = T.ToTensor()(output.images[0]).unsqueeze(0).to(device)
-                # Apply loss and update image
-                loss = image_loss(image_tensor, loss_type, device, elastic_transformer)
-                image_tensor = image_tensor - loss_scale * loss
-                # Move back to CPU and convert to PIL
-                image = T.ToPILImage()(image_tensor.cpu().squeeze(0).clamp(0, 1))
-            except Exception as e:
-                print(f"Error applying {loss_type} loss: {e}")
-                image = output.images[0]  # Use original image if loss fails
-        else:
-            image = output.images[0]
-        # Add image with its label
-        try:
-            # Ensure image is in correct format (PIL.Image)
-            if not isinstance(image, Image.Image):
-                print(f"Warning: Converting {loss_type} image to PIL format")
-                image = Image.fromarray(image)
-            # Add tuple of (image, label) to list
-            images.append((image, f"{loss_type.capitalize()} Loss"))
-            print(f"Added {loss_type} image to gallery")  # Debug print
         except Exception as e:
-            print(f"Error adding {loss_type} image to gallery: {e}")
-            continue
-        # Clear GPU memory after each image
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            gc.collect()
-    # Return all generated images
-    print(f"Returning {len(images)} images")
-    if not images:
         return None
-    return images
 def create_interface():
     default_prompts = [
@@ -187,14 +230,13 @@ def create_interface():
             gr.Dropdown(choices=concepts, label="Select SD Concept")
         ],
         outputs=gr.Gallery(
-            label="Generated Images (From Left to Right: Original, Blue Loss, Elastic Loss, Symmetry Loss, Saturation Loss)",
             show_label=True,
             elem_id="gallery",
             columns=5,
             rows=1,
-            height=512,
-            object_fit="contain"
-        ),  # Simplified Gallery definition
         title="Stable Diffusion using Text Inversion",
         description="""Generate images using Stable Diffusion with different style concepts. The output shows 5 images side by side:
         1. Original Image (No Loss)
@@ -204,16 +246,18 @@ def create_interface():
         5. Saturation Loss - Modifies color saturation
         Note: Image generation may take several minutes. Please be patient while the images are being processed.""",
-        flagging_mode="never"  # Updated from allow_flagging
     )
     return interface
 if __name__ == "__main__":
     interface = create_interface()
-    interface.queue(max_size=5)  # Simplified queue configuration
     interface.launch(
         share=True,
         server_name="0.0.0.0",
-        max_threads=1
     )

     else:
         return torch.tensor(0.0).to(device)
+# Update configuration
+height, width = 512, 512
+guidance_scale = 8
+num_inference_steps = 50
+loss_scale = 200
 def generate_images(prompt, concept):
     global pipe, device, elastic_transformer
     if pipe is None:
     if elastic_transformer is None:
         elastic_transformer = init_transformers(device)
+    # Create prompt text and initialize results
     prompt_text = f"{prompt} {concept}"
+    all_images = []  # Changed from images to all_images
+    # Process each loss type
     loss_functions = ['none', 'blue', 'elastic', 'symmetry', 'saturation']
     progress = gr.Progress()
     for idx, loss_type in enumerate(loss_functions):
         progress(idx/len(loss_functions), f"Generating {loss_type} image...")
         try:
+            # Better memory management
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+                torch.cuda.empty_cache()
+            # Move inputs to correct device and dtype
+            # Remove incorrect device movement
+            # text_input = text_input.to(device)  # Remove this line
+            # uncond_input = uncond_input.to(device)  # Remove this line
+            # latents = latents.to(dtype=pipe.vae.dtype, device=device)  # Remove this line
+            # Initialize scheduler and process text first
+            scheduler = LMSDiscreteScheduler(
+                beta_start=0.00085,
+                beta_end=0.012,
+                beta_schedule="scaled_linear",
+                num_train_timesteps=1000
             )
+            scheduler.set_timesteps(num_inference_steps)
+            scheduler.timesteps = scheduler.timesteps.to(device)
+            # Process text embeddings
+            text_input = pipe.tokenizer(
+                [prompt_text],
+                padding='max_length',
+                max_length=pipe.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt"
+            )
+            with torch.no_grad():
+                text_embeddings = pipe.text_encoder(text_input.input_ids.to(device))[0]
+            uncond_input = pipe.tokenizer(
+                [""] * 1,
+                padding="max_length",
+                max_length=text_input.input_ids.shape[-1],
+                return_tensors="pt"
+            )
+            with torch.no_grad():
+                uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(device))[0]
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+            # Generate initial latents with correct dtype
+            generator = torch.manual_seed(idx * 1000)
+            latents = torch.randn(
+                (1, pipe.unet.config.in_channels, height // 8, width // 8),
+                generator=generator,
+            )
+            latents = latents.to(device=device, dtype=pipe.unet.dtype)
+            latents = latents * scheduler.init_noise_sigma
+            # Diffusion process
+            for i, t in enumerate(scheduler.timesteps):
+                latent_model_input = torch.cat([latents] * 2)
+                sigma = scheduler.sigmas[i]
+                latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+                # Move latent_model_input to correct dtype
+                latent_model_input = latent_model_input.to(dtype=pipe.unet.dtype)
+                with torch.no_grad():
+                    noise_pred = pipe.unet(
+                        latent_model_input,
+                        t,
+                        encoder_hidden_states=text_embeddings
+                    )["sample"]
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # Apply loss every 5 steps if not 'none'
+                if loss_type != 'none' and i % 5 == 0:
+                    latents = latents.detach().requires_grad_()
+                    latents_x0 = latents - sigma * noise_pred
+                    # Decode to image space for loss computation
+                    with torch.set_grad_enabled(True):  # Enable gradients for loss computation
+                        denoised_images = pipe.vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5
+                        denoised_images = denoised_images.requires_grad_()  # Enable gradients for images
+                        loss = image_loss(denoised_images, loss_type, device, elastic_transformer)
+                        cond_grad = torch.autograd.grad(loss * loss_scale, latents)[0]
+                    latents = latents.detach() - cond_grad * sigma**2
+                latents = scheduler.step(noise_pred, t, latents).prev_sample
+            # Proper latent to image conversion
+            latents = (1 / 0.18215) * latents
+            with torch.no_grad():
+                image = pipe.vae.decode(latents).sample
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+            image = (image * 255).round().astype("uint8")
+            pil_image = Image.fromarray(image[0])
+            # Add image with its label
+            all_images.append((pil_image, f"{loss_type.capitalize()} Loss"))
         except Exception as e:
+            print(f"Error generating {loss_type} image: {e}")
+            continue  # Continue to next loss type instead of returning None
+    # At the end of the function
+    try:
+        if len(all_images) == 0:
+            raise Exception("No images were generated successfully")
+        return [img for img, _ in all_images]
+    except Exception as e:
+        print(f"Error in generate_images: {e}")
         return None
 def create_interface():
     default_prompts = [
             gr.Dropdown(choices=concepts, label="Select SD Concept")
         ],
         outputs=gr.Gallery(
+            label="Generated Images",
             show_label=True,
             elem_id="gallery",
             columns=5,
             rows=1,
+            height="auto"
+        ),
         title="Stable Diffusion using Text Inversion",
         description="""Generate images using Stable Diffusion with different style concepts. The output shows 5 images side by side:
         1. Original Image (No Loss)
         5. Saturation Loss - Modifies color saturation
         Note: Image generation may take several minutes. Please be patient while the images are being processed.""",
+        cache_examples=False,
+        max_batch_size=1,
+        flagging_mode="never"
     )
     return interface
 if __name__ == "__main__":
     interface = create_interface()
+    interface.queue(max_size=5)  # Remove concurrency_count parameter
     interface.launch(
         share=True,
         server_name="0.0.0.0",
+        server_port=7860
     )