Tianshuo-Xu committed on
Commit
c49775d
·
1 Parent(s): 2292172

Speed up Space by initializing globally and keeping on GPU, remove manual offload

Browse files
Files changed (2) hide show
  1. app.py +10 -5
  2. src/flux/xflux_pipeline.py +3 -3
app.py CHANGED
@@ -223,7 +223,7 @@ def init_generator():
223
  generator = CalligraphyGenerator(
224
  model_name="flux-dev",
225
  device="cuda",
226
- offload=True,
227
  intern_vlm_path=intern_vlm_path,
228
  checkpoint_path=checkpoint_path,
229
  font_descriptions_path='dataset/chirography.json',
@@ -261,9 +261,14 @@ def parse_font_style(font_style: str) -> str:
261
  return None
262
 
263
 
 
 
 
 
 
264
  def _get_generation_duration(text, font, author, num_steps, start_seed, num_images):
265
- """Calculate dynamic GPU duration: 60s for model loading + 2s per step per image"""
266
- return 60 + int(2 * num_steps * num_images)
267
 
268
 
269
  @spaces.GPU(duration=_get_generation_duration)
@@ -273,8 +278,8 @@ def run_generation(text, font, author, num_steps, start_seed, num_images):
273
  All in one GPU session to avoid redundant loading.
274
  """
275
  # Step 1: Load model
276
- logger.info("Loading model...")
277
- gen = init_generator()
278
 
279
  # Step 2: Since we reverted to bf16 load to avoid PyTorch native dtype mix issues, skip wrapping
280
  logger.info("Model weights decompressed to bfloat16 upon load. Skipping dynamic quantization to ensure stability.")
 
223
  generator = CalligraphyGenerator(
224
  model_name="flux-dev",
225
  device="cuda",
226
+ offload=False, # Set to False to let ZeroGPU manage CUDA memory directly instead of manual CPU thrashing
227
  intern_vlm_path=intern_vlm_path,
228
  checkpoint_path=checkpoint_path,
229
  font_descriptions_path='dataset/chirography.json',
 
261
  return None
262
 
263
 
264
+ # Initialize the generator globally BEFORE zeroGPU functions so weights are memory-mapped
265
+ logger.info("Initializing generator globally...")
266
+ generator = init_generator()
267
+
268
+
269
  def _get_generation_duration(text, font, author, num_steps, start_seed, num_images):
270
+ """Calculate dynamic GPU duration: 20s base + 2s per step per image"""
271
+ return 20 + int(2 * num_steps * num_images)
272
 
273
 
274
  @spaces.GPU(duration=_get_generation_duration)
 
278
  All in one GPU session to avoid redundant loading.
279
  """
280
  # Step 1: Load model
281
+ logger.info("Models are already globally initialized and managed by ZeroGPU.")
282
+ gen = generator
283
 
284
  # Step 2: Since we reverted to bf16 load to avoid PyTorch native dtype mix issues, skip wrapping
285
  logger.info("Model weights decompressed to bfloat16 upon load. Skipping dynamic quantization to ensure stability.")
src/flux/xflux_pipeline.py CHANGED
@@ -312,9 +312,9 @@ class XFluxPipeline:
312
  neg_ip_scale=1.0,
313
  is_generation=True,
314
  ):
315
- # Disable TF32 to avoid CUBLAS errors with fp16 on some GPUs
316
- torch.backends.cuda.matmul.allow_tf32 = False
317
- torch.backends.cudnn.allow_tf32 = False
318
 
319
  # Determine inference dtype from model
320
  if hasattr(self.model, '_is_quantized') and self.model._is_quantized:
 
312
  neg_ip_scale=1.0,
313
  is_generation=True,
314
  ):
315
+ # Allow TF32 for much faster inference
316
+ # torch.backends.cuda.matmul.allow_tf32 = False
317
+ # torch.backends.cudnn.allow_tf32 = False
318
 
319
  # Determine inference dtype from model
320
  if hasattr(self.model, '_is_quantized') and self.model._is_quantized: