FLUX.MF-Lightning-Fast-Upscaler

Running on Zero

App Files Community

LPX55 committed on Mar 7

Commit

4af365d

verified ·

1 Parent(s): b1c8464

Update optimized.py

Browse files

Files changed (1) hide show

optimized.py +53 -40

optimized.py CHANGED Viewed

@@ -8,70 +8,83 @@ from accelerate import init_empty_weights
 huggingface_token = os.getenv("HUGGINFACE_TOKEN")
-good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae",
-                                         torch_dtype=torch.bfloat16,
-                                         # variant="4bit",
-                                         device_map="balanced",
-                                         use_safetensors=True,
-                                         token=huggingface_token).to("cuda")
-# Load pipeline
 controlnet = FluxControlNetModel.from_pretrained(
     "jasperai/Flux.1-dev-Controlnet-Upscaler",
     torch_dtype=torch.bfloat16
 )
-#with init_empty_weights():
 pipe = FluxControlNetPipeline.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
     controlnet=controlnet,
-    torch_dtype=torch.bfloat16,
-    device_map="balanced",
     vae=good_vae,
-    use_safetensors=True,
     token=huggingface_token
 )
-pipe.enable_model_cpu_offload(device="cuda")
-# Add to your pipeline initialization:
-# pipe.enable_xformers_memory_efficient_attention()
-# pipe.enable_vae_slicing()  # Batch processing of VAE
-# pipe.enable_model_cpu_offload()  # Use with accelerate
 try:
     import xformers
     pipe.enable_xformers_memory_efficient_attention()
 except ImportError:
     print("XFormers missing! Using PyTorch attention instead")
-    # Fallback to PyTorch 2.0+ memory efficient attention
     pipe.enable_sdp_attention()
     torch.backends.cuda.enable_flash_sdp(True)
-# Convert all models to memory-efficient format
-#pipe.to(memory_format=torch.channels_last)
-pipe.to("cuda")
 @spaces.GPU
 def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):
-    # Load control image
-    control_image = control_image.resize((int(w * scale), int(h * scale)), PIL.Image.BICUBIC)
-    # control_image = load_image(control_image)
     w, h = control_image.size
-    # Upscale x1
-    control_image = control_image.resize((int(w * scale), int(h * scale)))
-    print("Size to: " + str(control_image.size[0]) + ", " + str(control_image.size[1]))
-    image = pipe(
-        prompt=prompt,
-        control_image=control_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        num_inference_steps=steps,
-        guidance_scale=guidance_scale,
-        height=control_image.size[1],
-        width=control_image.size[0],
-        torch_dtype=torch.bfloat16,
-        device_map="balanced"
-    ).images[0]
     torch.cuda.empty_cache()
     return image
 # Create Gradio interface
 iface = gr.Interface(
     fn=generate_image,

 huggingface_token = os.getenv("HUGGINFACE_TOKEN")
+good_vae = AutoencoderKL.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    subfolder="vae",
+    torch_dtype=torch.bfloat16,
+    use_safetensors=True,
+    device_map=None,  # Disable automatic mapping
+    token=huggingface_token
+)
 controlnet = FluxControlNetModel.from_pretrained(
     "jasperai/Flux.1-dev-Controlnet-Upscaler",
     torch_dtype=torch.bfloat16
 )
+# Initialize pipeline without automatic device mapping
 pipe = FluxControlNetPipeline.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
     controlnet=controlnet,
     vae=good_vae,
+    torch_dtype=torch.bfloat16,
+    use_safetensors=True,
+    device_map=None,  # Disable automatic device mapping
     token=huggingface_token
 )
+print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
+# Proper CPU offloading sequence
+pipe.enable_model_cpu_offload(device="cuda")  # First enable offloading
+pipe.enable_vae_slicing()  # Then enable memory optimizations
+pipe.enable_attention_slicing(1)
+# Handle xformers/SDP attention after offloading
 try:
     import xformers
     pipe.enable_xformers_memory_efficient_attention()
 except ImportError:
     print("XFormers missing! Using PyTorch attention instead")
     pipe.enable_sdp_attention()
     torch.backends.cuda.enable_flash_sdp(True)
+# Memory format optimization (only after other configs)
+pipe.to(memory_format=torch.channels_last)
+print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
 @spaces.GPU
 def generate_image(prompt, scale, steps, control_image, controlnet_conditioning_scale, guidance_scale):
+    # Clean up input handling
     w, h = control_image.size
+    scale = min(scale, 2.0)  # Cap scale factor
+    # Size calculation with safety limits
+    max_dim = 1536  # Set based on your VRAM
+    target_w = min(int(w * scale), max_dim)
+    target_h = min(int(h * scale), max_dim)
+    control_image = control_image.resize(
+        (target_w, target_h),
+        PIL.Image.BICUBIC
+    )
+    # Generation with memory-friendly parameters
+    with torch.autocast("cuda"):  # Mixed precision
+        image = pipe(
+            prompt=prompt,
+            control_image=control_image,
+            controlnet_conditioning_scale=controlnet_conditioning_scale,
+            num_inference_steps=steps,
+            guidance_scale=guidance_scale,
+            height=target_h,
+            width=target_w,
+            output_type="pil",  # Avoid extra latent decoding steps
+            generator=torch.Generator(device="cuda").manual_seed(0)
+        ).images[0]
+    print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
+    # Aggressive memory cleanup
     torch.cuda.empty_cache()
+    torch.cuda.ipc_collect()
+    print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
     return image
 # Create Gradio interface
 iface = gr.Interface(
     fn=generate_image,