bmarci committed on
Commit
18907bb
·
1 Parent(s): 4d5f907

Revert "adjustable cfg, better examples"

Browse files
Files changed (1) hide show
  1. app.py +42 -130
app.py CHANGED
@@ -25,8 +25,6 @@ pipeline = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device,
25
  MAX_SEED = np.iinfo(np.int16).max
26
  DEFAULT_POSITIVE_PROMPT = None
27
  DEFAULT_NEGATIVE_PROMPT = None
28
- DEFAULT_CFG = 7.5
29
-
30
 
31
  def _ensure_pil(x):
32
  """Ensure returned image is a PIL.Image.Image."""
@@ -38,79 +36,46 @@ def _ensure_pil(x):
38
  if isinstance(x, np.ndarray):
39
  if x.dtype != np.uint8:
40
  x = (x * 255.0).clip(0, 255).astype(np.uint8)
41
- if x.ndim == 3 and x.shape[0] in (1, 3, 4): # CHW -> HWC
42
  x = np.moveaxis(x, 0, -1)
43
  return Image.fromarray(x)
44
  raise TypeError("Unsupported image type returned by pipeline.")
45
 
46
-
47
- def calculate_gpu_duration(width, height, num_inference_steps):
48
- """
49
- Calculate GPU duration based on image dimensions and inference steps.
50
-
51
- Base calculation:
52
- - Minimum: 60 seconds for smallest images
53
- - Scales with total pixels and number of steps
54
- - Maximum: 600 seconds for safety
55
- """
56
- # Total pixels (in millions)
57
- total_pixels = (width * height) / 1_000_000
58
-
59
- # Base duration: assume ~1 second per megapixel per step as baseline
60
- # Adjust the multiplier based on your model's actual performance
61
- base_duration = total_pixels * num_inference_steps * 0.5
62
-
63
- # Add overhead for model loading and post-processing
64
- overhead = 30
65
-
66
- # Calculate final duration with min/max bounds
67
- duration = int(base_duration + overhead)
68
- duration = max(60, min(duration, 600)) # Between 60 and 600 seconds
69
-
70
- return duration
71
-
72
-
73
  def infer(
74
- prompt=None,
75
- seed=0,
76
- width=512,
77
- height=512,
78
- num_inference_steps=28,
79
- cfg=DEFAULT_CFG,
80
- positive_prompt=DEFAULT_POSITIVE_PROMPT,
81
- negative_prompt=DEFAULT_NEGATIVE_PROMPT,
82
- progress=gr.Progress(track_tqdm=True),
83
  ):
84
- """Run inference at exactly (width, height) with dynamic GPU allocation."""
85
  if prompt in [None, ""]:
86
  gr.Warning("⚠️ Please enter a prompt!")
87
  return None
88
 
89
- # Calculate dynamic duration based on image size
90
- gpu_duration = calculate_gpu_duration(width, height, num_inference_steps)
91
-
92
- # Use context manager for dynamic GPU allocation
93
- with spaces.GPU(duration=gpu_duration):
94
- with autocast(device_type=("cuda" if device == "cuda" else "cpu"), dtype=torch.bfloat16):
95
- imgs = pipeline.generate_image(
96
- prompt,
97
- hw=(int(height), int(width)),
98
- num_images_per_caption=1,
99
- positive_prompt=positive_prompt,
100
- negative_prompt=negative_prompt,
101
- cfg=float(cfg),
102
- cfg_img=1.0,
103
- cfg_schedule="constant",
104
- use_norm=False,
105
- num_sampling_steps=int(num_inference_steps),
106
- timesteps_shift=1.0,
107
- seed=int(seed),
108
- progress=True,
109
- )
110
 
111
  return _ensure_pil(imgs[0]) # Return raw output exactly as generated
112
 
113
-
114
  css = """
115
  #col-container {
116
  margin: 0 auto;
@@ -120,7 +85,7 @@ css = """
120
 
121
  with gr.Blocks(css=css) as demo:
122
  with gr.Column(elem_id="col-container"):
123
- gr.Markdown("# NextStep-1-Large — Image generation")
124
 
125
  with gr.Row():
126
  prompt = gr.Text(
@@ -179,14 +144,6 @@ with gr.Blocks(css=css) as demo:
179
  step=64,
180
  value=512,
181
  )
182
- cfg = gr.Slider(
183
- label="CFG (guidance scale)",
184
- minimum=0.0,
185
- maximum=20.0,
186
- step=0.5,
187
- value=DEFAULT_CFG,
188
- info="Higher = closer to text, lower = more creative",
189
- )
190
 
191
  with gr.Row():
192
  result_1 = gr.Image(
@@ -197,66 +154,25 @@ with gr.Blocks(css=css) as demo:
197
  format="png",
198
  )
199
 
 
200
  examples = [
201
  [
202
- "Studio portrait of an elderly sailor with a weathered face, dramatic Rembrandt lighting, shallow depth of field",
203
- 101, 512, 512, 32, 7.5,
204
- "photorealistic, sharp eyes, detailed skin texture, soft rim light, 85mm lens",
205
- "over-smoothed skin, plastic look, extra limbs, watermark",
206
  ],
207
  [
208
- "Isometric cozy coffee shop interior with hanging plants and warm Edison bulbs",
209
- 202, 512, 384, 30, 8.5,
210
- "isometric view, clean lines, stylized, warm ambience, detailed furniture",
211
- "text, logo, watermark, perspective distortion",
212
  ],
213
  [
214
- "Ultra-wide desert canyon at golden hour with long shadows and dust in the air",
215
- 303, 512, 320, 28, 7.0,
216
- "cinematic, volumetric light, natural colors, high dynamic range",
217
- "over-saturated, haze artifacts, blown highlights",
218
- ],
219
- [
220
- "Cute red panda astronaut sticker, chibi style, white background",
221
- 404, 384, 384, 24, 9.0,
222
- "vector look, bold outlines, high contrast, die-cut silhouette",
223
- "background clutter, drop shadow, gradients, text",
224
- ],
225
- [
226
- "Product render of matte-black wireless headphones on reflective glass with soft studio lighting",
227
- 505, 512, 384, 28, 7.0,
228
- "clean backdrop, realistic reflections, subtle bloom, high detail",
229
- "noise, fingerprints, text, label",
230
- ],
231
- [
232
- "Graphic poster in Bauhaus style with geometric shapes and bold typography placeholders",
233
- 606, 512, 512, 22, 6.0,
234
- "flat colors, minimal palette, crisp edges, balanced composition",
235
- "photo realism, gradients, noisy texture",
236
- ],
237
- [
238
- "Oil painting of a stormy sea with a lighthouse, thick impasto brushwork",
239
- 707, 384, 512, 34, 7.0,
240
- "textured canvas, visible brush strokes, dramatic sky, moody lighting",
241
- "smooth digital look, airbrush, neon colors",
242
- ],
243
- [
244
- "Architectural concept art: glass pavilion in a pine forest at dawn, ground fog",
245
- 808, 512, 384, 30, 8.0,
246
- "physically-based rendering, soft fog, realistic materials, scale figures",
247
- "tilt, skew, warped geometry, chromatic aberration",
248
- ],
249
- [
250
- "Fantasy creature: bioluminescent jellyfish dragon swimming through a dark ocean trench",
251
- 909, 512, 512, 32, 8.5,
252
- "glowing tendrils, soft caustics, particles, high detail",
253
- "washed out, murky, low contrast, extra heads",
254
- ],
255
- [
256
- "Line art coloring page of a city skyline with hot air balloons",
257
- 111, 512, 512, 18, 5.5,
258
- "clean black outlines, uniform stroke weight, high contrast, no shading",
259
- "gray fill, gradients, cross-hatching, text",
260
  ],
261
  ]
262
 
@@ -268,18 +184,15 @@ with gr.Blocks(css=css) as demo:
268
  width,
269
  height,
270
  num_inference_steps,
271
- cfg,
272
  positive_prompt,
273
  negative_prompt,
274
  ],
275
  label="Click & Fill Examples (Exact Size)",
276
  )
277
 
278
-
279
  def show_result():
280
  return gr.update(visible=True)
281
 
282
-
283
  generation_event = gr.on(
284
  triggers=[run_button.click, prompt.submit],
285
  fn=infer,
@@ -289,7 +202,6 @@ with gr.Blocks(css=css) as demo:
289
  width,
290
  height,
291
  num_inference_steps,
292
- cfg,
293
  positive_prompt,
294
  negative_prompt,
295
  ],
@@ -299,4 +211,4 @@ with gr.Blocks(css=css) as demo:
299
  cancel_button.click(fn=None, inputs=None, outputs=None, cancels=[generation_event])
300
 
301
  if __name__ == "__main__":
302
- demo.launch()
 
25
  MAX_SEED = np.iinfo(np.int16).max
26
  DEFAULT_POSITIVE_PROMPT = None
27
  DEFAULT_NEGATIVE_PROMPT = None
 
 
28
 
29
  def _ensure_pil(x):
30
  """Ensure returned image is a PIL.Image.Image."""
 
36
  if isinstance(x, np.ndarray):
37
  if x.dtype != np.uint8:
38
  x = (x * 255.0).clip(0, 255).astype(np.uint8)
39
+ if x.ndim == 3 and x.shape[0] in (1,3,4): # CHW -> HWC
40
  x = np.moveaxis(x, 0, -1)
41
  return Image.fromarray(x)
42
  raise TypeError("Unsupported image type returned by pipeline.")
43
 
44
+ @spaces.GPU(duration=300)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def infer(
46
+ prompt=None,
47
+ seed=0,
48
+ width=512,
49
+ height=512,
50
+ num_inference_steps=28,
51
+ positive_prompt=DEFAULT_POSITIVE_PROMPT,
52
+ negative_prompt=DEFAULT_NEGATIVE_PROMPT,
53
+ progress=gr.Progress(track_tqdm=True),
 
54
  ):
55
+ """Run inference at exactly (width, height)."""
56
  if prompt in [None, ""]:
57
  gr.Warning("⚠️ Please enter a prompt!")
58
  return None
59
 
60
+ with autocast(device_type=("cuda" if device == "cuda" else "cpu"), dtype=torch.bfloat16):
61
+ imgs = pipeline.generate_image(
62
+ prompt,
63
+ hw=(int(height), int(width)),
64
+ num_images_per_caption=1,
65
+ positive_prompt=positive_prompt,
66
+ negative_prompt=negative_prompt,
67
+ cfg=7.5,
68
+ cfg_img=1.0,
69
+ cfg_schedule="constant",
70
+ use_norm=False,
71
+ num_sampling_steps=int(num_inference_steps),
72
+ timesteps_shift=1.0,
73
+ seed=int(seed),
74
+ progress=True,
75
+ )
 
 
 
 
 
76
 
77
  return _ensure_pil(imgs[0]) # Return raw output exactly as generated
78
 
 
79
  css = """
80
  #col-container {
81
  margin: 0 auto;
 
85
 
86
  with gr.Blocks(css=css) as demo:
87
  with gr.Column(elem_id="col-container"):
88
+ gr.Markdown("# NextStep-1-Large — Exact Output Size")
89
 
90
  with gr.Row():
91
  prompt = gr.Text(
 
144
  step=64,
145
  value=512,
146
  )
 
 
 
 
 
 
 
 
147
 
148
  with gr.Row():
149
  result_1 = gr.Image(
 
154
  format="png",
155
  )
156
 
157
+ # Click & Fill Examples (all <=512px)
158
  examples = [
159
  [
160
+ "A cozy wooden cabin by a frozen lake, northern lights in the sky",
161
+ 123, 512, 512, 28,
162
+ "photorealistic, cinematic lighting, starry night, glowing reflections",
163
+ "low-res, distorted, extra objects"
164
  ],
165
  [
166
+ "Futuristic city skyline at sunset, flying cars, neon reflections",
167
+ 456, 512, 384, 30,
168
+ "detailed, vibrant, cinematic, sharp edges",
169
+ "washed out, cartoon, blurry"
170
  ],
171
  [
172
+ "Close-up of a rare orchid in a greenhouse with soft morning light",
173
+ 789, 384, 512, 32,
174
+ "macro lens effect, ultra-detailed petals, dew drops",
175
+ "grainy, noisy, oversaturated"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  ],
177
  ]
178
 
 
184
  width,
185
  height,
186
  num_inference_steps,
 
187
  positive_prompt,
188
  negative_prompt,
189
  ],
190
  label="Click & Fill Examples (Exact Size)",
191
  )
192
 
 
193
  def show_result():
194
  return gr.update(visible=True)
195
 
 
196
  generation_event = gr.on(
197
  triggers=[run_button.click, prompt.submit],
198
  fn=infer,
 
202
  width,
203
  height,
204
  num_inference_steps,
 
205
  positive_prompt,
206
  negative_prompt,
207
  ],
 
211
  cancel_button.click(fn=None, inputs=None, outputs=None, cancels=[generation_event])
212
 
213
  if __name__ == "__main__":
214
+ demo.launch()