Commit 8572c72
Parent(s): 8d44598

fix data-format

Files changed:
- lcm_server.py  +2 -2
- rknnlcm.py     +107 -16
lcm_server.py CHANGED

@@ -106,7 +106,7 @@ class PipelineWorker:
 
         print("seed ", job.req.seed)
         print("rng", rng)
-
+
         result = self.pipe(
            prompt=job.req.prompt,
            height=h,
@@ -114,7 +114,7 @@ class PipelineWorker:
            num_inference_steps=job.req.num_inference_steps,
            guidance_scale=job.req.guidance_scale,
            generator=rng,
-        )
+        )
 
         pil_image = result["images"][0]
         buf = io.BytesIO()

(The changed lines at 109 and 117 render identically in this view, suggesting a whitespace-only change.)
rknnlcm.py CHANGED

@@ -43,9 +43,10 @@ class RKNN2Model:
         *,
         core_mask: Optional[Union[str, int]] = None,
         multi_context: bool = True,
-        data_format: str = "
+        data_format: str = "nhwc",
         verbose_shapes: bool = False,
         runtime_kwargs: Optional[dict] = None,
+        force_fp32=True,
         **_ignored: Any,
     ):
         """
@@ -61,6 +62,8 @@ class RKNN2Model:
         - runtime_kwargs: optional extra kwargs to pass into init_runtime(...)
         - **_ignored: allows you to pass context_name/worker_id etc without breaking
         """
+        self.data_format = data_format.lower()
+        self.force_fp32 = force_fp32
         self.model_dir = model_dir
         self.data_format = data_format
         self.verbose_shapes = verbose_shapes
@@ -123,23 +126,32 @@ class RKNN2Model:
             raise TypeError(f"core_mask must be None, int, or str; got {type(core_mask)}")
 
     def __call__(self, **kwargs) -> List[np.ndarray]:
-        #
-        input_list =
-
-        if self.verbose_shapes:
-            for i, arr in enumerate(input_list):
-                if isinstance(arr, np.ndarray):
-                    logger.info(f"[{self.modelname}] input[{i}] shape={arr.shape} dtype={arr.dtype}")
-
+        # TODO We need deterministic ordering
+        input_list = [self._prep(v) for v in kwargs.values()]
         results = self.rknnlite.inference(inputs=input_list, data_format=self.data_format)
 
-        if self.verbose_shapes:
-            for j, res in enumerate(results):
-                if isinstance(res, np.ndarray):
-                    logger.info(f"[{self.modelname}] output[{j}] shape={res.shape} dtype={res.dtype}")
+        logger.info("%s out[0] shape=%s dtype=%s",
+                    self.modelname, results[0].shape, results[0].dtype)
 
         return results
 
+    def _prep(self, x):
+        import numpy as np
+        if isinstance(x, np.ndarray):
+            # dtype safety
+            if self.force_fp32 and x.dtype in (np.float64, np.float16):
+                x = x.astype(np.float32, copy=False)
+
+            # layout safety for 4D tensors
+            if x.ndim == 4:
+                if self.data_format == "nhwc" and x.shape[1] in (1, 3, 4):    # likely NCHW
+                    x = x.transpose(0, 2, 3, 1)
+                elif self.data_format == "nchw" and x.shape[-1] in (1, 3, 4):  # likely NHWC
+                    x = x.transpose(0, 3, 1, 2)
+
+            x = np.ascontiguousarray(x)
+        return x
+
 class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
 
     def __init__(
@@ -379,6 +391,11 @@ class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
                     f" {negative_prompt_embeds.shape}."
                 )
 
+    # Keep latents in NCHW everywhere in Python, and only convert to NHWC right at
+    # the RKNN boundary for models that require it. That means:
+    #   • Before the UNet RKNN call: NCHW -> NHWC
+    #   • After the UNet RKNN call: NHWC -> NCHW (only if the raw output is NHWC)
+    #   • VAE decoder input: if it expects NHWC, convert right before it too.
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
@@ -540,6 +557,7 @@ class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
             timestep_dtype = np.int64
 
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
         inference_start = time.time()
         for i, t in enumerate(self.progress_bar(timesteps)):
             timestep = np.array([t], dtype=timestep_dtype)
@@ -654,9 +672,9 @@ def generate_png_bytes(args):
     user_specified_scheduler = LCMScheduler.from_config(scheduler_config)
 
     pipe = RKNN2LatentConsistencyPipeline(
-        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
-        unet=RKNN2Model(os.path.join(args.i, "unet")),
-        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
+        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder"), data_format="nchw"),  # probably irrelevant
+        unet=RKNN2Model(os.path.join(args.i, "unet"), data_format="nhwc"),  # important
+        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder"), data_format="nhwc"),  # important
         scheduler=user_specified_scheduler,
         tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
     )
@@ -680,3 +698,76 @@ def generate_png_bytes(args):
     buf.seek(0)
 
     return buf.getvalue()
+
+def main(args):
+    logger.info(f"Setting random seed to {args.seed}")
+
+    # load scheduler from scheduler/scheduler_config.json
+    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
+    with open(scheduler_config_path, "r") as f:
+        scheduler_config = json.load(f)
+    user_specified_scheduler = LCMScheduler.from_config(scheduler_config)
+
+    logger.info("Using scheduler: %s", user_specified_scheduler.__class__.__name__)
+
+    # Parse size as WIDTHxHEIGHT (common CLI convention)
+    w_str, h_str = args.size.lower().split("x")
+    width, height = int(w_str), int(h_str)
+
+    pipe = RKNN2LatentConsistencyPipeline(
+        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder"), data_format="nchw"),
+        unet=RKNN2Model(os.path.join(args.i, "unet"), data_format="nhwc"),
+        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder"), data_format="nhwc"),
+        scheduler=user_specified_scheduler,
+        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
+    )
+
+    logger.info("Beginning image generation.")
+    out = pipe(
+        prompt=args.prompt,
+        height=height,
+        width=width,
+        num_inference_steps=args.num_inference_steps,
+        guidance_scale=args.guidance_scale,
+        generator=np.random.RandomState(args.seed),
+    )
+
+    out_path = get_image_path(args)
+    logger.info("Saving generated image to %s", out_path)
+    out["images"][0].save(out_path)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--prompt",
+        required=True,
+        help="The text prompt to be used for text-to-image generation.")
+    parser.add_argument(
+        "-i",
+        required=True,
+        help="Path to model directory")
+    parser.add_argument("-o", required=True)
+    parser.add_argument("--seed",
+                        default=93,
+                        type=int,
+                        help="Random seed to be able to reproduce results")
+    parser.add_argument(
+        "-s",
+        "--size",
+        default="256x256",
+        type=str,
+        help="Image size")
+    parser.add_argument(
+        "--num-inference-steps",
+        default=4,
+        type=int,
+        help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
+    parser.add_argument(
+        "--guidance-scale",
+        default=7.5,
+        type=float,
+        help="Controls the influence of the text prompt on sampling process (0=random images)")
+
+    args = parser.parse_args()
+    main(args)
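Note on the new _prep input guard: it decides whether to transpose purely from the channel count of a 4D array, so the heuristic is worth seeing in isolation. Below is a minimal standalone mirror of the committed logic; the free function prep is illustrative only, not part of the commit:

    import numpy as np

    def prep(x, data_format="nhwc", force_fp32=True):
        # Mirrors RKNN2Model._prep from this commit: upcast half/double floats
        # to fp32, then transpose 4D tensors whose channel axis looks misplaced.
        if isinstance(x, np.ndarray):
            if force_fp32 and x.dtype in (np.float64, np.float16):
                x = x.astype(np.float32, copy=False)
            if x.ndim == 4:
                if data_format == "nhwc" and x.shape[1] in (1, 3, 4):    # looks like NCHW
                    x = x.transpose(0, 2, 3, 1)
                elif data_format == "nchw" and x.shape[-1] in (1, 3, 4):  # looks like NHWC
                    x = x.transpose(0, 3, 1, 2)
            x = np.ascontiguousarray(x)
        return x

    # A 4-channel SD latent supplied in NCHW is rewritten to NHWC for an "nhwc" model:
    latent = np.random.randn(1, 4, 32, 32).astype(np.float16)
    assert prep(latent).shape == (1, 32, 32, 4)
    assert prep(latent).dtype == np.float32
    # 3D tensors (e.g. text-encoder hidden states of shape (1, 77, 768)) pass through:
    assert prep(np.zeros((1, 77, 768), dtype=np.float32), "nchw").shape == (1, 77, 768)

One limit of the channel-count guess: an already-NHWC tensor whose height happens to be 1, 3, or 4 would be transposed a second time, so the guard is only safe while spatial sizes stay well above the channel count.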
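The comment block added before prepare_latents prescribes converting UNet output back to NCHW only when the raw result is actually NHWC, but the commit does not yet include that output-side converter. A sketch of what such a helper could look like, assuming 4-channel latents (to_nchw is a hypothetical name, not in the file):

    import numpy as np

    def to_nchw(x: np.ndarray, num_channels: int = 4) -> np.ndarray:
        # Transpose only when the last axis looks like the channel axis and the
        # second axis does not, i.e. the array is plausibly NHWC rather than NCHW.
        if x.ndim == 4 and x.shape[-1] == num_channels and x.shape[1] != num_channels:
            x = x.transpose(0, 3, 1, 2)
        return np.ascontiguousarray(x)

    # (1, 32, 32, 4) -> (1, 4, 32, 32); an already-NCHW (1, 4, 32, 32) is left alone.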
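The new __main__ block makes rknnlcm.py runnable on its own. A plausible invocation against a converted model directory (the path below is a placeholder, and the final output location comes from get_image_path, which is defined elsewhere in the file):

    python rknnlcm.py --prompt "an astronaut riding a horse" \
        -i ./models/lcm-rknn -o out.png \
        --size 256x256 --num-inference-steps 4 --guidance-scale 7.5 --seed 93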