darkbit1001 commited on
Commit
be0e7db
·
1 Parent(s): c40da80

refactor for lcm server using uvicorn

Browse files
Files changed (47) hide show
  1. .dockerignore +20 -0
  2. .gitattributes +15 -0
  3. .gitignore +19 -0
  4. Dockerfile +32 -0
  5. README.md +77 -0
  6. convert-onnx-to-rknn.py +120 -0
  7. lcm_server.py +316 -0
  8. model/.gitattributes +36 -0
  9. model/Assets/Icon.png +3 -0
  10. model/Assets/LCM-Dreamshaper-V7-ONNX.json +35 -0
  11. model/Assets/OnnxStack - 640x320.png +0 -0
  12. model/Assets/Preview1.png +3 -0
  13. model/Assets/Preview2.png +3 -0
  14. model/Assets/Preview3.png +3 -0
  15. model/Assets/Preview4.png +3 -0
  16. model/Assets/Preview5.png +3 -0
  17. model/Assets/Preview6.png +3 -0
  18. model/Assets/lcm_angel_30_7.5_2092464983.png +3 -0
  19. model/Assets/lcm_car_30_7.5_2092464983.png +3 -0
  20. model/Assets/lcm_demon_30_7.5_2092464983.png +3 -0
  21. model/Assets/lcm_ninja_30_7.5_2092464983.png +3 -0
  22. model/README.md +56 -0
  23. model/feature_extractor/preprocessor_config.json +28 -0
  24. model/model_index.json +34 -0
  25. model/scheduler/scheduler_config.json +20 -0
  26. model/text_encoder/config.json +25 -0
  27. model/text_encoder/model.onnx +3 -0
  28. model/text_encoder/model.rknn +3 -0
  29. model/tokenizer/merges.txt +0 -0
  30. model/tokenizer/model.onnx +3 -0
  31. model/tokenizer/special_tokens_map.json +30 -0
  32. model/tokenizer/tokenizer_config.json +31 -0
  33. model/tokenizer/vocab.json +0 -0
  34. model/unet/config.json +68 -0
  35. model/unet/model.onnx +3 -0
  36. model/unet/model.onnx_data +3 -0
  37. model/unet/model.rknn +3 -0
  38. model/vae_decoder/config.json +32 -0
  39. model/vae_decoder/model.onnx +3 -0
  40. model/vae_decoder/model.rknn +3 -0
  41. model/vae_encoder/config.json +32 -0
  42. model/vae_encoder/model.onnx +3 -0
  43. requirements.txt +8 -0
  44. rknnlcm.py +682 -0
  45. run_onnx-lcm.py +665 -0
  46. run_rknn-lcm.py +632 -0
  47. runner.sh +10 -0
.dockerignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .git/*
3
+ model
4
+ model/*
5
+ __pycache__
6
+ **/__pycache__
7
+
8
+ images
9
+ images/*
10
+
11
+ .aider*
12
+
13
+ *.pyc
14
+ *.pyo
15
+ *.pyd
16
+ .env
17
+ .venv
18
+ venv
19
+ dist
20
+ build
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model/text_encoder/model.rknn filter=lfs diff=lfs merge=lfs -text
37
+ model/unet/model.onnx_data filter=lfs diff=lfs merge=lfs -text
38
+ model/unet/model.rknn filter=lfs diff=lfs merge=lfs -text
39
+ model/vae_decoder/model.rknn filter=lfs diff=lfs merge=lfs -text
40
+ model/Assets/Icon.png filter=lfs diff=lfs merge=lfs -text
41
+ model/Assets/Preview1.png filter=lfs diff=lfs merge=lfs -text
42
+ model/Assets/Preview2.png filter=lfs diff=lfs merge=lfs -text
43
+ model/Assets/Preview3.png filter=lfs diff=lfs merge=lfs -text
44
+ model/Assets/Preview4.png filter=lfs diff=lfs merge=lfs -text
45
+ model/Assets/Preview5.png filter=lfs diff=lfs merge=lfs -text
46
+ model/Assets/Preview6.png filter=lfs diff=lfs merge=lfs -text
47
+ model/Assets/lcm_angel_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
48
+ model/Assets/lcm_car_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
49
+ model/Assets/lcm_demon_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
50
+ model/Assets/lcm_ninja_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @eaDir/
2
+ .DS_Store
3
+ __pycache__
4
+ **/__pycache__
5
+
6
+ images
7
+ images/*
8
+
9
+ .aider*
10
+
11
+ *.pyc
12
+ *.pyo
13
+ *.pyd
14
+ .env
15
+ .venv
16
+ venv
17
+ dist
18
+ build
19
+
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # System deps (Pillow, RKNN, etc.)
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ libglib2.0-0 \
6
+ libsm6 \
7
+ libxext6 \
8
+ libxrender1 \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ WORKDIR /app
12
+
13
+ # Python deps
14
+ COPY requirements.txt /app/requirements.txt
15
+ RUN python -m venv /venv && \
16
+ /venv/bin/pip install --no-cache-dir -U pip setuptools wheel && \
17
+ /venv/bin/pip install --no-cache-dir -r /app/requirements.txt
18
+
19
+ # 👇 THIS copies *everything except what .dockerignore excludes*
20
+ COPY . /app
21
+
22
+ ENV PATH="/venv/bin:$PATH"
23
+ ENV PYTHONUNBUFFERED=1
24
+
25
+ ENV PORT=4200
26
+ ENV NUM_WORKERS=1
27
+ ENV QUEUE_MAX=8
28
+ ENV MODEL_ROOT=/models
29
+
30
+ EXPOSE 4200
31
+
32
+ CMD ["uvicorn", "lcm_server:app", "--host", "0.0.0.0", "--port", "4200", "--no-access-log"]
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - TheyCallMeHex/LCM-Dreamshaper-V7-ONNX
4
+ tags:
5
+ - rknn
6
+ - LCM
7
+ - stable-diffusion
8
+ ---
9
+ # Stable Diffusion 1.5 Latent Consistency Model for RKNN2
10
+
11
+ Run the Stable Diffusion 1.5 LCM image generation model using RKNPU2!
12
+
13
+ - Inference speed (RK3588, single NPU core):
14
+ - 384x384: Text encoder 0.05s + U-Net 2.36s/it + VAE Decoder 5.48s
15
+ - 512x512: Text encoder 0.05s + U-Net 5.65s/it + VAE Decoder 11.13s
16
+ - Memory usage:
17
+ - 384x384: About 5.2GB
18
+ - 512x512: About 5.6GB
19
+
20
+ ## Usage
21
+
22
+ ### 1. Clone or download this repository to your local machine
23
+
24
+ ### 2. Install dependencies
25
+
26
+ ```bash
27
+ pip install diffusers pillow "numpy<2" rknn-toolkit-lite2
28
+ ```
29
+
30
+ ### 3. Run
31
+
32
+ ```bash
33
+ python ./run_rknn-lcm.py -i ./model -o ./images --num-inference-steps 4 -s 512x512 --prompt "Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style."
34
+ ```
35
+
36
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6319d0860d7478ae0069cd92/50jwBxv0Edf7x0WoHmpwi.png)
37
+
38
+
39
+ ## Model Conversion
40
+
41
+ ### Install dependencies
42
+
43
+ ```bash
44
+ pip install diffusers pillow "numpy<2" rknn-toolkit2
45
+ ```
46
+
47
+ ### 1. Download the model
48
+
49
+ Download a Stable Diffusion 1.5 LCM model in ONNX format and place it in the `./model` directory.
50
+
51
+ ```bash
52
+ huggingface-cli download TheyCallMeHex/LCM-Dreamshaper-V7-ONNX
53
+ cp -r -L ~/.cache/huggingface/hub/models--TheyCallMeHex--LCM-Dreamshaper-V7-ONNX/snapshots/4029a217f9cdc0437f395738d3ab686bb910ceea ./model
54
+ ```
55
+
56
+ In theory, you could also achieve LCM inference by merging the LCM Lora into a regular Stable Diffusion 1.5 model and then converting it to ONNX format. However, I'm not sure how to do this. If anyone knows, please feel free to submit a PR.
57
+
58
+ ### 2. Convert the model
59
+
60
+ ```bash
61
+ # Convert the model, 384x384 resolution
62
+ python ./convert-onnx-to-rknn.py -m ./model -r 384x384
63
+ ```
64
+
65
+ Note that the higher the resolution, the larger the model and the longer the conversion time. It's not recommended to use very high resolutions.
66
+
67
+ ## Known Issues
68
+
69
+ 1. ~~As of now, models converted using the latest version of rknn-toolkit2 (version 2.2.0) still suffer from severe precision loss, even when using fp16 data type. As shown in the image, the top is the result of inference using the ONNX model, and the bottom is the result using the RKNN model. All parameters are the same. Moreover, the higher the resolution, the more severe the precision loss. This is a bug in rknn-toolkit2.~~ (Fixed in v2.3.0)
70
+
71
+ 2. Actually, the model conversion script can select multiple resolutions (e.g., "384x384,256x256"), but this causes the model conversion to fail. This is a bug in rknn-toolkit2.
72
+
73
+ ## References
74
+
75
+ - [TheyCallMeHex/LCM-Dreamshaper-V7-ONNX](https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX)
76
+ - [Optimum's LatentConsistencyPipeline](https://github.com/huggingface/optimum/blob/main/optimum/pipelines/diffusers/pipeline_latent_consistency.py)
77
+ - [happyme531/RK3588-stable-diffusion-GPU](https://github.com/happyme531/RK3588-stable-diffusion-GPU)
convert-onnx-to-rknn.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from typing import List
5
+ from rknn.api import RKNN
6
+ from math import exp
7
+ from sys import exit
8
+ import argparse
9
+
10
+
11
+ def convert_pipeline_component(onnx_path: str, resolution_list: List[List[int]], target_platform: str = 'rk3588'):
12
+ print(f'Converting {onnx_path} to RKNN model')
13
+ print(f'with target platform {target_platform}')
14
+ print(f'with resolutions:')
15
+ for res in resolution_list:
16
+ print(f'- {res[0]}x{res[1]}')
17
+ use_dynamic_shape = False
18
+ if(len(resolution_list) > 1):
19
+ print("Warning: RKNN dynamic shape support is probably broken, may throw errors")
20
+ use_dynamic_shape = True
21
+
22
+ batch_size = 1
23
+ LATENT_RESIZE_FACTOR = 8
24
+ # build shape list
25
+ if "text_encoder" in onnx_path:
26
+ input_size_list = [[[1,77]]]
27
+ inputs=['input_ids']
28
+ use_dynamic_shape = False
29
+ elif "unet" in onnx_path:
30
+ # batch_size = 2 # for classifier free guidance # broken for rknn python api
31
+
32
+ input_size_list = []
33
+ for res in resolution_list:
34
+ input_size_list.append(
35
+ [[1,4, res[0]//LATENT_RESIZE_FACTOR, res[1]//LATENT_RESIZE_FACTOR],
36
+ [1],
37
+ [1, 77, 768],
38
+ [1, 256]]
39
+ )
40
+ inputs=['sample','timestep','encoder_hidden_states','timestep_cond']
41
+ elif "vae_decoder" in onnx_path:
42
+ input_size_list = []
43
+ for res in resolution_list:
44
+ input_size_list.append(
45
+ [[1,4, res[0]//LATENT_RESIZE_FACTOR, res[1]//LATENT_RESIZE_FACTOR]]
46
+ )
47
+ inputs=['latent_sample']
48
+ else:
49
+ print("Unknown component: ", onnx_path)
50
+ exit(1)
51
+
52
+ rknn = RKNN(verbose=True)
53
+
54
+ # pre-process config
55
+ print('--> Config model')
56
+ rknn.config(target_platform='rk3588', optimization_level=3, single_core_mode=True,
57
+ dynamic_input= input_size_list if use_dynamic_shape else None)
58
+ print('done')
59
+
60
+ # Load ONNX model
61
+ print('--> Loading model')
62
+ ret = rknn.load_onnx(model=onnx_path,
63
+ inputs=None if use_dynamic_shape else inputs,
64
+ input_size_list= None if use_dynamic_shape else input_size_list[0])
65
+ if ret != 0:
66
+ print('Load model failed!')
67
+ exit(ret)
68
+ print('done')
69
+
70
+ # Build model
71
+ print('--> Building model')
72
+ ret = rknn.build(do_quantization=False, rknn_batch_size=batch_size)
73
+ if ret != 0:
74
+ print('Build model failed!')
75
+ exit(ret)
76
+ print('done')
77
+
78
+ #export
79
+ print('--> Export RKNN model')
80
+ ret = rknn.export_rknn(onnx_path.replace('.onnx', '.rknn'))
81
+ if ret != 0:
82
+ print('Export RKNN model failed!')
83
+ exit(ret)
84
+ print('done')
85
+
86
+ rknn.release()
87
+ print('RKNN model is converted successfully!')
88
+
89
+
90
+ def parse_resolution_list(resolution: str) -> List[List[int]]:
91
+ resolution_pairs = resolution.split(',')
92
+ parsed_resolutions = []
93
+ for pair in resolution_pairs:
94
+ width, height = map(int, pair.split('x'))
95
+ parsed_resolutions.append([width, height])
96
+
97
+ return parsed_resolutions
98
+
99
+
100
+ if __name__ == '__main__':
101
+ parser = argparse.ArgumentParser(description='Convert Stable Diffusion ONNX models to RKNN models')
102
+ parser.add_argument('-m','--model-dir', type=str, help='Directory containing the Stable Diffusion ONNX models', required=True)
103
+ parser.add_argument('-c','--components', type=str, help='Name of the components to convert, e.g. "text_encoder,unet,vae_decoder"', default='text_encoder, unet, vae_decoder')
104
+ parser.add_argument('-r','--resolutions', type=str, help='Comma-separated list of resolutions for the model, e.g. "256x256,512x512"', default='256x256')
105
+ parser.add_argument('--target_platform', type=str, help='Target platform for the RKNN model, default is "rk3588"', default='rk3588')
106
+ args = parser.parse_args()
107
+
108
+ components = args.components.split(',')
109
+
110
+ for component in components:
111
+ onnx_path = f'{args.model_dir}/{component.strip()}/model.onnx'
112
+ resolution_list = parse_resolution_list(args.resolutions)
113
+ if(len(resolution_list) == 0):
114
+ print("Error: No resolutions specified")
115
+ exit(1)
116
+
117
+ convert_pipeline_component(onnx_path, resolution_list, args.target_platform)
118
+
119
+
120
+
lcm_server.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import json
4
+ import time
5
+ import queue
6
+ import threading
7
+ from dataclasses import dataclass
8
+ from concurrent.futures import Future
9
+ from typing import Optional, List, Tuple
10
+
11
+ import numpy as np
12
+ from fastapi import FastAPI, Response, HTTPException
13
+ from pydantic import BaseModel, Field
14
+
15
+ from diffusers import LCMScheduler
16
+ from transformers import CLIPTokenizer
17
+
18
+ from rknnlcm import RKNN2Model, RKNN2LatentConsistencyPipeline
19
+
20
+ # --- Your imports (as in your script) ---
21
+ # from your_pkg import RKNN2LatentConsistencyPipeline, RKNN2Model
22
+ # NOTE: keep these as-is in your project.
23
+
24
+
25
+ # -----------------------------
26
+ # Request schema (HTTP)
27
+ # -----------------------------
28
class GenerateRequest(BaseModel):
    """Payload for POST /generate."""
    # Text prompt forwarded to the LCM pipeline.
    prompt: str
    # Image size as "<int>x<int>" (enforced by the regex); parsed in
    # PipelineWorker.run_job. NOTE(review): the first number is used as
    # height and the second as width — confirm callers expect that ordering.
    size: str = Field(default="512x512", pattern=r"^\d+x\d+$")
    # LCM needs very few denoising steps; 4 is the repo's documented default.
    num_inference_steps: int = 4
    guidance_scale: float = 1.0
    # Seeds a per-request numpy RandomState, so output is reproducible.
    seed: int = 1234
34
+
35
+
36
@dataclass
class ModelPaths:
    """Resolves the on-disk layout of an exported LCM model directory."""

    root: str  # model root directory (the -i / MODEL_ROOT path)

    def _join(self, subpath: str) -> str:
        """Join a component subpath onto the model root."""
        return os.path.join(self.root, subpath)

    @property
    def scheduler_config(self) -> str:
        return self._join("scheduler/scheduler_config.json")

    @property
    def text_encoder(self) -> str:
        return self._join("text_encoder")

    @property
    def unet(self) -> str:
        return self._join("unet")

    @property
    def vae_decoder(self) -> str:
        return self._join("vae_decoder")
51
+
52
+
53
@dataclass
class Job:
    """One queued generation request plus the Future used to deliver its result."""
    req: GenerateRequest  # validated HTTP request payload
    fut: Future           # resolved with PNG bytes, or an exception, by a worker
    submitted_at: float   # time.time() when the job entered the queue
58
+
59
+
60
+ # -----------------------------
61
+ # Pipeline Worker
62
+ # -----------------------------
63
class PipelineWorker:
    """
    Owns ONE pipeline instance. Run this in a dedicated thread.

    Each worker builds its own text_encoder/unet/vae_decoder RKNN2Model
    instances in _init_pipeline(), so RKNN runtime contexts are not shared
    between threads.
    """
    def __init__(
        self,
        worker_id: int,
        paths: ModelPaths,
        scheduler: LCMScheduler,
        tokenizer: CLIPTokenizer,
        rknn_context_cfg: dict,
    ):
        # Index of this worker within the service (also embedded in
        # rknn_context_cfg by build_rknn_context_cfgs_for_rk3588).
        self.worker_id = worker_id
        self.paths = paths
        # Scheduler/tokenizer are shared across workers (loaded once by
        # PipelineService). Assumed safe to share — TODO confirm thread-safety.
        self.scheduler = scheduler
        self.tokenizer = tokenizer
        self.rknn_context_cfg = rknn_context_cfg

        self.pipe = None  # built in init()
        self._init_pipeline()

    def _init_pipeline(self):
        """Build this worker's private pipeline with its own RKNN runtime contexts."""
        # IMPORTANT: Each worker gets its *own* RKNN runtime context.
        # You must map rknn_context_cfg to however your RKNN2Model supports it.
        #
        # Examples you might support in RKNN2Model:
        #   RKNN2Model(path, core_mask=..., multi_context=True, device_id=..., ...)
        #   RKNN2Model(path, runtime_options={...})
        #
        # Here: we pass **rknn_context_cfg as a flexible hook.
        self.pipe = RKNN2LatentConsistencyPipeline(
            text_encoder=RKNN2Model(self.paths.text_encoder, **self.rknn_context_cfg),
            unet=RKNN2Model(self.paths.unet, **self.rknn_context_cfg),
            vae_decoder=RKNN2Model(self.paths.vae_decoder, **self.rknn_context_cfg),
            scheduler=self.scheduler,
            tokenizer=self.tokenizer,
        )

    def run_job(self, job: Job) -> bytes:
        """Run one generation request synchronously and return encoded PNG bytes."""
        # NOTE(review): the first component of "size" is used as *height* and
        # the second as *width*; a conventional "WxH" input would be transposed
        # for non-square sizes — confirm the intended ordering.
        h, w = (int(x) for x in job.req.size.split("x"))

        # Deterministic per-request random generator
        rng = np.random.RandomState(job.req.seed)

        result = self.pipe(
            prompt=job.req.prompt,
            height=h,
            width=w,
            num_inference_steps=job.req.num_inference_steps,
            guidance_scale=job.req.guidance_scale,
            generator=rng,
        )

        # Pipeline result is dict-like; encode the first returned image as PNG.
        pil_image = result["images"][0]
        buf = io.BytesIO()
        pil_image.save(buf, format="PNG")
        return buf.getvalue()
120
+
121
+
122
+ # -----------------------------
123
+ # Singleton Service
124
+ # -----------------------------
125
class PipelineService:
    """
    Singleton-ish service that:
    - loads scheduler/tokenizer once
    - starts N worker threads
    - provides a queued submit() API
    """
    _instance = None                   # set lazily by get_instance()
    _instance_lock = threading.Lock()  # guards singleton creation

    def __init__(
        self,
        paths: ModelPaths,
        num_workers: int = 3,
        queue_max: int = 64,
        rknn_context_cfgs: Optional[List[dict]] = None,
    ):
        """Load shared assets, build one pipeline per worker, start worker threads.

        Raises:
            ValueError: if rknn_context_cfgs is given but its length does not
                match num_workers.
        """
        self.paths = paths
        self.num_workers = num_workers
        # Bounded job queue; submit() fails fast with "Queue full" on overflow.
        self.q: queue.Queue[Job] = queue.Queue(maxsize=queue_max)

        # Load once (shared immutable objects)
        with open(self.paths.scheduler_config, "r") as f:
            scheduler_config = json.load(f)
        self.scheduler = LCMScheduler.from_config(scheduler_config)
        # NOTE(review): tokenizer comes from the HF hub, not the local model
        # dir — needs network access (or a warm cache) on first run; confirm.
        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

        # Build per-worker RKNN context configs
        # If not provided, create N identical configs with multi_context enabled.
        if rknn_context_cfgs is None:
            rknn_context_cfgs = [{"multi_context": True, "worker_id": i} for i in range(num_workers)]
        if len(rknn_context_cfgs) != num_workers:
            raise ValueError("rknn_context_cfgs must match num_workers length")

        self.workers: List[PipelineWorker] = []
        self.threads: List[threading.Thread] = []
        self._stop = threading.Event()  # signals worker loops to exit

        # Create worker-owned pipelines (done before any thread starts, so
        # no job can run against a half-built worker list)
        for i in range(num_workers):
            worker = PipelineWorker(
                worker_id=i,
                paths=self.paths,
                scheduler=self.scheduler,
                tokenizer=self.tokenizer,
                rknn_context_cfg=rknn_context_cfgs[i],
            )
            self.workers.append(worker)

        # Start threads (daemon: they must not block interpreter exit)
        for i in range(num_workers):
            t = threading.Thread(target=self._worker_loop, args=(i,), daemon=True)
            t.start()
            self.threads.append(t)

    @classmethod
    def get_instance(
        cls,
        paths: ModelPaths,
        num_workers: int = 3,
        queue_max: int = 64,
        rknn_context_cfgs: Optional[List[dict]] = None,
    ) -> "PipelineService":
        """Return the process-wide singleton, creating it on first call.

        NOTE: arguments are only honored on the first call; later calls
        return the existing instance regardless of the values passed.
        """
        with cls._instance_lock:
            if cls._instance is None:
                cls._instance = cls(
                    paths=paths,
                    num_workers=num_workers,
                    queue_max=queue_max,
                    rknn_context_cfgs=rknn_context_cfgs,
                )
            return cls._instance

    def shutdown(self):
        """Signal workers to stop and fail any jobs still waiting in the queue.

        Does not join the worker threads; a job currently being processed is
        allowed to finish (or fail) on its own.
        """
        self._stop.set()
        # Optionally drain queue with errors
        while True:
            try:
                job = self.q.get_nowait()
            except queue.Empty:
                break
            if not job.fut.done():
                job.fut.set_exception(RuntimeError("Service shutting down"))
            self.q.task_done()

    def submit(self, req: GenerateRequest, timeout_s: float = 0.5) -> Future:
        """Enqueue a request; return a Future that resolves to PNG bytes.

        If the queue stays full for timeout_s seconds, the returned Future is
        already failed with RuntimeError("Queue full") — callers must check.
        """
        fut: Future = Future()
        job = Job(req=req, fut=fut, submitted_at=time.time())

        try:
            self.q.put(job, timeout=timeout_s)
        except queue.Full:
            fut.set_exception(RuntimeError("Queue full"))
        return fut

    def _worker_loop(self, worker_idx: int):
        """Thread body: pull jobs off the shared queue and run them on this worker."""
        worker = self.workers[worker_idx]
        while not self._stop.is_set():
            try:
                # Short timeout so the loop re-checks _stop regularly.
                job = self.q.get(timeout=0.1)
            except queue.Empty:
                continue

            # Caller may have cancelled the Future before we picked the job up.
            if job.fut.cancelled():
                self.q.task_done()
                continue

            try:
                png = worker.run_job(job)
                if not job.fut.done():
                    job.fut.set_result(png)
            except Exception as e:
                if not job.fut.done():
                    job.fut.set_exception(e)
            finally:
                self.q.task_done()
241
+
242
+
243
+ # -----------------------------
244
+ # RKNN multi-context configuration
245
+ # -----------------------------
246
def build_rknn_context_cfgs_for_rk3588(num_workers: int) -> List[dict]:
    """
    Build one RKNN runtime-context config dict per worker.

    Plug this into your RKNN2Model wrapper.
    Typical approach on RK3588:
      - bind each worker to a different NPU core (0/1/2)
      - enable multi_context so each model instance has its own runtime context

    You must map these fields inside RKNN2Model.

    BUGFIX: the disabled per-core line was "commented out" with a triple-quoted
    string inside the dict literal. Adjacent string literals concatenate, so the
    dict ended up with a corrupted key (and no real "core_mask" key), which made
    `RKNN2Model(..., **cfg)` raise TypeError (keyword args must be identifiers).
    It is now a real comment.
    """
    core_masks = ["NPU_CORE_0", "NPU_CORE_1", "NPU_CORE_2"]
    cfgs = []
    for i in range(num_workers):
        cfgs.append({
            "multi_context": True,
            # Round-robin core pinning, if preferred over automatic placement:
            # "core_mask": core_masks[i % len(core_masks)],
            "core_mask": "NPU_CORE_AUTO",
            "context_name": f"w{i}",
            "worker_id": i,
        })
    return cfgs
266
+
267
+
268
+ # -----------------------------
269
+ # FastAPI server
270
+ # -----------------------------
271
app = FastAPI()

# Configure these for your deployment (all overridable via environment).
# NOTE(review): the Dockerfile in this repo sets NUM_WORKERS=1, QUEUE_MAX=8
# and MODEL_ROOT=/models, which override the fallbacks below — confirm which
# defaults are intended for production.
MODEL_ROOT = os.environ.get("MODEL_ROOT", "/models/lcm_rknn")
NUM_WORKERS = int(os.environ.get("NUM_WORKERS", "3"))
QUEUE_MAX = int(os.environ.get("QUEUE_MAX", "64"))

paths = ModelPaths(root=MODEL_ROOT)

# Create singleton service at import time (fastest first request).
# This means `uvicorn lcm_server:app` blocks on startup until every worker's
# RKNN models are fully initialized.
service = PipelineService.get_instance(
    paths=paths,
    num_workers=NUM_WORKERS,
    queue_max=QUEUE_MAX,
    rknn_context_cfgs=build_rknn_context_cfgs_for_rk3588(NUM_WORKERS),
)
287
+
288
+
289
@app.post("/generate", responses={200: {"content": {"image/png": {}}}})
def generate(req: GenerateRequest):
    """Generate a PNG image for the given prompt.

    Returns:
        200 with image/png bytes on success;
        429 when the job queue is full;
        504 when generation exceeds the deadline;
        500 on any other pipeline failure.
    """
    # Local import keeps this endpoint self-contained; on 3.11+ this is an
    # alias of the builtin TimeoutError.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    fut = service.submit(req, timeout_s=0.25)

    try:
        png_bytes = fut.result(timeout=120)  # generation deadline; tune per hardware
    except FutureTimeoutError:
        # BUGFIX: previously fell into the generic 500 branch with an empty
        # message. Best-effort cancel so an unstarted job is dropped by the
        # worker loop (it checks fut.cancelled()).
        fut.cancel()
        raise HTTPException(status_code=504, detail="Generation timed out. Try again.")
    except Exception as e:
        msg = str(e)
        if "Queue full" in msg:
            raise HTTPException(status_code=429, detail="Too many requests (queue full). Try again.")
        raise HTTPException(status_code=500, detail=f"Generation failed: {msg}")

    return Response(
        content=png_bytes,
        media_type="image/png",
        headers={
            # Generated per-request; never cache.
            "Cache-Control": "no-store",
        },
    )
308
+
309
if __name__ == "__main__":
    # Dev entry point; production uses the Dockerfile CMD
    # (`uvicorn lcm_server:app --host 0.0.0.0 --port 4200 --no-access-log`).
    import uvicorn
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.environ.get("PORT", "4200")),
        log_config=None,  # <-- key: disables uvicorn's default logging config
    )
model/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.onnx_data filter=lfs diff=lfs merge=lfs -text
model/Assets/Icon.png ADDED

Git LFS Details

  • SHA256: d3074fce55454e2bbbd65ecf5bafafefa31397f2e22dc4eb1139c6cbd614955e
  • Pointer size: 131 Bytes
  • Size of remote file: 233 kB
model/Assets/LCM-Dreamshaper-V7-ONNX.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Name": "Dreamshaper v7(LCM)",
3
+ "Description": "DreamShaper started as a model to have an alternative to MidJourney in the open source world.",
4
+ "Author": "TheyCallMeHex",
5
+ "Repository": "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX",
6
+ "ImageIcon": "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Icon.png",
7
+ "Status": "Active",
8
+ "PadTokenId": 49407,
9
+ "BlankTokenId": 49407,
10
+ "TokenizerLimit": 77,
11
+ "EmbeddingsLength": 768,
12
+ "ScaleFactor": 0.18215,
13
+ "PipelineType": "LatentConsistency",
14
+ "Diffusers": [
15
+ "TextToImage",
16
+ "ImageToImage",
17
+ "ImageInpaintLegacy"
18
+ ],
19
+ "ModelFiles": [
20
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/tokenizer/model.onnx",
21
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/unet/model.onnx",
22
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/unet/model.onnx_data",
23
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/text_encoder/model.onnx",
24
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/vae_decoder/model.onnx",
25
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/vae_encoder/model.onnx"
26
+ ],
27
+ "Images": [
28
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview1.png",
29
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview2.png",
30
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview3.png",
31
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview4.png",
32
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview5.png",
33
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview6.png"
34
+ ]
35
+ }
model/Assets/OnnxStack - 640x320.png ADDED
model/Assets/Preview1.png ADDED

Git LFS Details

  • SHA256: 43bf8ceccf682da0c3f60f21ef077b4a0602ba3aaaa1479ce81c80566697e0cc
  • Pointer size: 131 Bytes
  • Size of remote file: 447 kB
model/Assets/Preview2.png ADDED

Git LFS Details

  • SHA256: d5b535b6f0b99aa90da0e6fcc5175443a12ae4a4a4d2d8d315583a78ad727476
  • Pointer size: 131 Bytes
  • Size of remote file: 552 kB
model/Assets/Preview3.png ADDED

Git LFS Details

  • SHA256: c613380065f8d084d9fa4f7d575e77acec2fac1df5af69ebcd44d242dc2e55e1
  • Pointer size: 131 Bytes
  • Size of remote file: 478 kB
model/Assets/Preview4.png ADDED

Git LFS Details

  • SHA256: 08516dcfade0743d2d48cd999ec63640ebed690234e291f00d807cd14d077a57
  • Pointer size: 131 Bytes
  • Size of remote file: 454 kB
model/Assets/Preview5.png ADDED

Git LFS Details

  • SHA256: eb4cc72a470871800e6c62e23ad482684e5e7da8ba5c08f94f7f3e3c815e318e
  • Pointer size: 131 Bytes
  • Size of remote file: 473 kB
model/Assets/Preview6.png ADDED

Git LFS Details

  • SHA256: cb061284279fbac14d7a981d8220947dfc24724d1c7fa2cce193b41a2049b905
  • Pointer size: 131 Bytes
  • Size of remote file: 416 kB
model/Assets/lcm_angel_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 2e12085a17ab0b75f952b6c02e269c93b3e4821b1dd3188c19a6174adea87a1a
  • Pointer size: 131 Bytes
  • Size of remote file: 399 kB
model/Assets/lcm_car_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 578c327c2753351ebd4b53beceda66b84376701207896b930035d21874cb2d57
  • Pointer size: 131 Bytes
  • Size of remote file: 430 kB
model/Assets/lcm_demon_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 808dc78b49f2e7b5af169b97b733ef5507d2fa7e721716b950f7d612dc8ea739
  • Pointer size: 131 Bytes
  • Size of remote file: 424 kB
model/Assets/lcm_ninja_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 17cfea8ce05edc99bbc1b572b2c6e8089f2a8a3249eceb1c32520e4b0e9a5293
  • Pointer size: 131 Bytes
  • Size of remote file: 360 kB
model/README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ language:
4
+ - en
5
+ license: mit
6
+ tags:
7
+ - stable-diffusion
8
+ - stable-diffusion-diffusers
9
+ - text-to-image
10
+ - diffusers
11
+ inference: true
12
+ ---
13
+
14
+ <p align="center" width="100%">
15
+ <img width="80%" src="Assets/OnnxStack - 640x320.png">
16
+ </p>
17
+
18
+ ### OnnxStack
19
+ This model has been converted to ONNX and tested with OnnxStack
20
+
21
+ - [OnnxStack](https://github.com/saddam213/OnnxStack)
22
+
23
+ ### LCM Dreamshaper V7 Diffusion
24
+ This model was converted to ONNX from LCM Dreamshaper V7
25
+
26
+ - [LCM-Dreamshaper-V7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
27
+
28
+ ### Sample Images
29
+ *A demon*
30
+
31
+ <img src="Assets/lcm_demon_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
32
+
33
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
34
+
35
+ __________________________
36
+ *An angel*
37
+
38
+ <img src="Assets/lcm_angel_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
39
+
40
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
41
+
42
+ __________________________
43
+ *A ninja*
44
+
45
+ <img src="Assets/lcm_ninja_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
46
+
47
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
48
+
49
+ __________________________
50
+ *a japanese domestic market sports car sitting in a showroom*
51
+
52
+ <img src="Assets/lcm_car_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
53
+
54
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
55
+
56
+ __________________________
model/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ }
28
+ }
model/model_index.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableDiffusionPipeline",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7",
5
+ "feature_extractor": [
6
+ "transformers",
7
+ "CLIPImageProcessor"
8
+ ],
9
+ "requires_safety_checker": true,
10
+ "safety_checker": [
11
+ "stable_diffusion",
12
+ "StableDiffusionSafetyChecker"
13
+ ],
14
+ "scheduler": [
15
+ "diffusers",
16
+ "LCMScheduler"
17
+ ],
18
+ "text_encoder": [
19
+ "transformers",
20
+ "CLIPTextModel"
21
+ ],
22
+ "tokenizer": [
23
+ "transformers",
24
+ "CLIPTokenizer"
25
+ ],
26
+ "unet": [
27
+ "diffusers",
28
+ "UNet2DConditionModel"
29
+ ],
30
+ "vae": [
31
+ "diffusers",
32
+ "AutoencoderKL"
33
+ ]
34
+ }
model/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LCMScheduler",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "original_inference_steps": 50,
12
+ "prediction_type": "epsilon",
13
+ "rescale_betas_zero_snr": false,
14
+ "sample_max_value": 1.0,
15
+ "set_alpha_to_one": true,
16
+ "steps_offset": 1,
17
+ "thresholding": false,
18
+ "timestep_spacing": "leading",
19
+ "trained_betas": null
20
+ }
model/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LCM_Dreamshaper_v7\\text_encoder",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "quick_gelu",
11
+ "hidden_size": 768,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 768,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.34.1",
24
+ "vocab_size": 49408
25
+ }
model/text_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefe95eab6542e5fb7642b3f592489176836cc3fd49196b924a63760602c8c4a
3
+ size 492588002
model/text_encoder/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc35bda83249c243260c076efe73701c7aa278e6d693c67c5ec12c3019a0bd0
3
+ size 249820005
model/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52af50d264d702c351484aabf62c64abe61f59d6a6d2c508a3e797e23dc1e008
3
+ size 1683168
model/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
model/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "additional_special_tokens": [],
22
+ "bos_token": "<|startoftext|>",
23
+ "clean_up_tokenization_spaces": true,
24
+ "do_lower_case": true,
25
+ "eos_token": "<|endoftext|>",
26
+ "errors": "replace",
27
+ "model_max_length": 77,
28
+ "pad_token": "<|endoftext|>",
29
+ "tokenizer_class": "CLIPTokenizer",
30
+ "unk_token": "<|endoftext|>"
31
+ }
model/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model/unet/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\unet",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": 8,
10
+ "attention_type": "default",
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "class_embed_type": null,
19
+ "class_embeddings_concat": false,
20
+ "conv_in_kernel": 3,
21
+ "conv_out_kernel": 3,
22
+ "cross_attention_dim": 768,
23
+ "cross_attention_norm": null,
24
+ "down_block_types": [
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "CrossAttnDownBlock2D",
28
+ "DownBlock2D"
29
+ ],
30
+ "downsample_padding": 1,
31
+ "dropout": 0.0,
32
+ "dual_cross_attention": false,
33
+ "encoder_hid_dim": null,
34
+ "encoder_hid_dim_type": null,
35
+ "flip_sin_to_cos": true,
36
+ "freq_shift": 0,
37
+ "in_channels": 4,
38
+ "layers_per_block": 2,
39
+ "mid_block_only_cross_attention": null,
40
+ "mid_block_scale_factor": 1,
41
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
42
+ "norm_eps": 1e-05,
43
+ "norm_num_groups": 32,
44
+ "num_attention_heads": null,
45
+ "num_class_embeds": null,
46
+ "only_cross_attention": false,
47
+ "out_channels": 4,
48
+ "projection_class_embeddings_input_dim": null,
49
+ "resnet_out_scale_factor": 1.0,
50
+ "resnet_skip_time_act": false,
51
+ "resnet_time_scale_shift": "default",
52
+ "reverse_transformer_layers_per_block": null,
53
+ "sample_size": 96,
54
+ "time_cond_proj_dim": 256,
55
+ "time_embedding_act_fn": null,
56
+ "time_embedding_dim": null,
57
+ "time_embedding_type": "positional",
58
+ "timestep_post_act": null,
59
+ "transformer_layers_per_block": 1,
60
+ "up_block_types": [
61
+ "UpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D",
64
+ "CrossAttnUpBlock2D"
65
+ ],
66
+ "upcast_attention": null,
67
+ "use_linear_projection": false
68
+ }
model/unet/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e9a08a3e5b943046bf90a513c492cf4c6e31e26229062af8eb4ad2ddf172b5
3
+ size 1948508
model/unet/model.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef99ccc336de0e79f247fcb3d1398b3f3d1a02796916b88a351d7a83f570a31a
3
+ size 3438411520
model/unet/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f0272a526f993b04b888eaa0b8bd76875e6ccca302de0f8ae9587bd48de18a
3
+ size 1809543921
model/vae_decoder/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\vae",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
model/vae_decoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec5298d7bfa592d492d36b42d17f794fcdb9175e2aac366956d40f3f38d13ca1
3
+ size 198078038
model/vae_decoder/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d8f23bef086b9aac6393d2b56da5582b0ab4307ea6e1376c7aa7052288d6cd
3
+ size 295036220
model/vae_encoder/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\vae",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
model/vae_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:268d4398021d7bc91e91c94e4835cc5ffa471155db1b722d0a43f6d1a4f822fd
3
+ size 136760154
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.30.6
3
+ pydantic==2.8.2
4
+ numpy
5
+ pillow
6
+ transformers
7
+ diffusers
8
+ rknn-toolkit-lite2==2.3.2
rknnlcm.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import time
4
+
5
+ import PIL
6
+ from diffusers import StableDiffusionPipeline
7
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
8
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
9
+ from diffusers.schedulers import (
10
+ LCMScheduler
11
+ )
12
+
13
+ import logging
14
+
15
+ logging.basicConfig()
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.INFO)
18
+
19
+ import numpy as np
20
+ import os
21
+
22
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
23
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
24
+ from typing import Callable, List, Optional, Union, Tuple
25
+ from PIL import Image
26
+
27
+ from rknnlite.api import RKNNLite
28
+
29
+ import os
30
+ import json
31
+ import time
32
+ from typing import List, Any, Optional, Union
33
+
34
+ import numpy as np
35
+ from rknnlite.api import RKNNLite
36
+
37
class RKNN2Model:
    """Wrapper for running RKNPU2 (RKNNLite) models.

    Loads ``model.rknn`` plus ``config.json`` from *model_dir* and exposes the
    network as a callable that runs inference on keyword numpy inputs.
    """

    def __init__(
        self,
        model_dir: str,
        *,
        core_mask: Optional[Union[str, int]] = None,
        multi_context: bool = True,
        data_format: str = "nchw",
        verbose_shapes: bool = False,
        runtime_kwargs: Optional[dict] = None,
        **_ignored: Any,
    ):
        """
        Params are designed to match the pipeline service pattern:
            RKNN2Model(path, **rknn_context_cfg)

        Args:
            core_mask: can be None (defaults), a string
                ("NPU_CORE_0"/"NPU_CORE_1"/"NPU_CORE_2"/"NPU_CORE_AUTO"),
                or an RKNNLite constant/int passed through directly.
                NOTE: multi-core has been observed to crash the kernel, so the
                default stays AUTO.
            multi_context: kept for compatibility with pool patterns; this
                class is already per-instance.
            data_format: layout passed to inference (default "nchw").
            verbose_shapes: log input/output shapes (disable for server use).
            runtime_kwargs: optional extra kwargs forwarded to init_runtime(...).
            **_ignored: allows passing context_name/worker_id etc. without
                breaking the constructor.

        Raises:
            FileNotFoundError: if the model directory or model.rknn is missing.
            RuntimeError: if the RKNN runtime fails to load or initialize.
        """
        self.model_dir = model_dir
        self.data_format = data_format
        self.verbose_shapes = verbose_shapes
        self.multi_context = multi_context
        self.runtime_kwargs = runtime_kwargs or {}

        logger.info("Loading %s", model_dir)
        start = time.time()

        cfg_path = os.path.join(model_dir, "config.json")
        rknn_path = os.path.join(model_dir, "model.rknn")

        if not (os.path.exists(model_dir) and os.path.exists(rknn_path)):
            raise FileNotFoundError(f"Missing model dir or model.rknn: {model_dir}")

        # Use a context manager so the config file handle is closed
        # deterministically (the previous code leaked it).
        with open(cfg_path, "r") as cfg_file:
            self.config = json.load(cfg_file)

        self.rknnlite = RKNNLite()
        # load_rknn/init_runtime signal failure via a non-zero return code
        # (they do not raise) — fail loudly instead of continuing with a
        # half-initialized runtime.
        ret = self.rknnlite.load_rknn(rknn_path)
        if ret != 0:
            raise RuntimeError(f"load_rknn failed for {rknn_path} (ret={ret})")

        # Resolve core mask.
        # IMPORTANT: AUTO by default because multi-core can crash the kernel.
        # If stability is later confirmed, pass core_mask="NPU_CORE_0"/"NPU_CORE_1"/
        # "NPU_CORE_2" per worker.
        resolved_core_mask = self._resolve_core_mask(core_mask)
        ret = self.rknnlite.init_runtime(core_mask=resolved_core_mask, **self.runtime_kwargs)
        if ret != 0:
            raise RuntimeError(f"init_runtime failed for {rknn_path} (ret={ret})")

        load_time = time.time() - start
        logger.info("Done loading %s. Took %.1f seconds.", model_dir, load_time)

        self.modelname = os.path.basename(model_dir.rstrip("/"))
        # Accumulated inference time; kept for backward compatibility.
        self.inference_time = 0

    def _resolve_core_mask(self, core_mask: Optional[Union[str, int]]) -> int:
        """Map a user-supplied core mask (None / int / name) to an RKNNLite constant."""
        if core_mask is None:
            return RKNNLite.NPU_CORE_AUTO

        # Allow passing an RKNNLite constant directly
        if isinstance(core_mask, int):
            return core_mask

        # Allow passing names
        if isinstance(core_mask, str):
            key = core_mask.strip().upper()
            mapping = {
                "NPU_CORE_AUTO": RKNNLite.NPU_CORE_AUTO,
                "NPU_CORE_0": RKNNLite.NPU_CORE_0,
                "NPU_CORE_1": RKNNLite.NPU_CORE_1,
                "NPU_CORE_2": RKNNLite.NPU_CORE_2,
                # Common shorthand spellings:
                "AUTO": RKNNLite.NPU_CORE_AUTO,
                "0": RKNNLite.NPU_CORE_0,
                "1": RKNNLite.NPU_CORE_1,
                "2": RKNNLite.NPU_CORE_2,
            }
            if key not in mapping:
                raise ValueError(f"Unknown core_mask string: {core_mask!r}")
            return mapping[key]

        raise TypeError(f"core_mask must be None, int, or str; got {type(core_mask)}")

    def __call__(self, **kwargs) -> List[np.ndarray]:
        """Run inference; the kwargs insertion order defines the input tensor order."""
        # Preserve order of kwargs values as given by caller
        input_list = list(kwargs.values())

        if self.verbose_shapes:
            for i, arr in enumerate(input_list):
                if isinstance(arr, np.ndarray):
                    logger.info("[%s] input[%d] shape=%s dtype=%s", self.modelname, i, arr.shape, arr.dtype)

        results = self.rknnlite.inference(inputs=input_list, data_format=self.data_format)
        # RKNNLite reports inference failure by returning None rather than
        # raising — surface it as an error instead of handing None to callers.
        if results is None:
            raise RuntimeError(f"RKNN inference failed for {self.modelname}")

        if self.verbose_shapes:
            for j, res in enumerate(results):
                if isinstance(res, np.ndarray):
                    logger.info("[%s] output[%d] shape=%s dtype=%s", self.modelname, j, res.shape, res.dtype)

        return results
142
+
143
+ class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
144
+
145
+ def __init__(
146
+ self,
147
+ text_encoder: RKNN2Model,
148
+ unet: RKNN2Model,
149
+ vae_decoder: RKNN2Model,
150
+ scheduler: LCMScheduler,
151
+ tokenizer: CLIPTokenizer,
152
+ force_zeros_for_empty_prompt: Optional[bool] = True,
153
+ feature_extractor: Optional[CLIPFeatureExtractor] = None,
154
+ text_encoder_2: Optional[RKNN2Model] = None,
155
+ tokenizer_2: Optional[CLIPTokenizer] = None
156
+ ):
157
+ super().__init__()
158
+
159
+ self.register_modules(
160
+ tokenizer=tokenizer,
161
+ scheduler=scheduler,
162
+ feature_extractor=feature_extractor,
163
+ )
164
+ self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
165
+ self.safety_checker = None
166
+
167
+ self.text_encoder = text_encoder
168
+ self.text_encoder_2 = text_encoder_2
169
+ self.tokenizer_2 = tokenizer_2
170
+ self.unet = unet
171
+ self.vae_decoder = vae_decoder
172
+
173
+ VAE_DECODER_UPSAMPLE_FACTOR = 8
174
+ self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
175
+
176
+ @staticmethod
177
+ def postprocess(
178
+ image: np.ndarray,
179
+ output_type: str = "pil",
180
+ do_denormalize: Optional[List[bool]] = None,
181
+ ):
182
+ def numpy_to_pil(images: np.ndarray):
183
+ """
184
+ Convert a numpy image or a batch of images to a PIL image.
185
+ """
186
+ if images.ndim == 3:
187
+ images = images[None, ...]
188
+ images = (images * 255).round().astype("uint8")
189
+ if images.shape[-1] == 1:
190
+ # special case for grayscale (single channel) images
191
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
192
+ else:
193
+ pil_images = [Image.fromarray(image) for image in images]
194
+
195
+ return pil_images
196
+
197
+ def denormalize(images: np.ndarray):
198
+ """
199
+ Denormalize an image array to [0,1].
200
+ """
201
+ return np.clip(images / 2 + 0.5, 0, 1)
202
+
203
+ if not isinstance(image, np.ndarray):
204
+ raise ValueError(
205
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
206
+ )
207
+ if output_type not in ["latent", "np", "pil"]:
208
+ deprecation_message = (
209
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
210
+ "`pil`, `np`, `pt`, `latent`"
211
+ )
212
+ logger.warning(deprecation_message)
213
+ output_type = "np"
214
+
215
+ if output_type == "latent":
216
+ return image
217
+
218
+ if do_denormalize is None:
219
+ raise ValueError("do_denormalize is required for postprocessing")
220
+
221
+ image = np.stack(
222
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
223
+ )
224
+ image = image.transpose((0, 2, 3, 1))
225
+
226
+ if output_type == "pil":
227
+ image = numpy_to_pil(image)
228
+
229
+ return image
230
+
231
    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int,
        do_classifier_free_guidance: bool,
        negative_prompt: Optional[Union[str, list]],
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`Union[str, List[str]]`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`Optional[Union[str, list]]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # NOTE(review): this branch assumes `prompt_embeds` was supplied
            # whenever `prompt` is neither str nor list — check_inputs enforces
            # that one of the two is present.
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            text_input_ids = text_inputs.input_ids
            # Re-tokenize without truncation to detect (and warn about) any
            # part of the prompt that was cut off at model_max_length.
            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

            if not np.array_equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            # The RKNN text encoder expects int32 token ids.
            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

        # Duplicate embeddings for each requested image per prompt.
        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the unconditional input to the same sequence length as the
            # conditional embeddings so both batches line up.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )
            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

        if do_classifier_free_guidance:
            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds
333
+
334
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
335
+ def check_inputs(
336
+ self,
337
+ prompt: Union[str, List[str]],
338
+ height: Optional[int],
339
+ width: Optional[int],
340
+ callback_steps: int,
341
+ negative_prompt: Optional[str] = None,
342
+ prompt_embeds: Optional[np.ndarray] = None,
343
+ negative_prompt_embeds: Optional[np.ndarray] = None,
344
+ ):
345
+ if height % 8 != 0 or width % 8 != 0:
346
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
347
+
348
+ if (callback_steps is None) or (
349
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
350
+ ):
351
+ raise ValueError(
352
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
353
+ f" {type(callback_steps)}."
354
+ )
355
+
356
+ if prompt is not None and prompt_embeds is not None:
357
+ raise ValueError(
358
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
359
+ " only forward one of the two."
360
+ )
361
+ elif prompt is None and prompt_embeds is None:
362
+ raise ValueError(
363
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
364
+ )
365
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
366
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
367
+
368
+ if negative_prompt is not None and negative_prompt_embeds is not None:
369
+ raise ValueError(
370
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
371
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
372
+ )
373
+
374
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
375
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
376
+ raise ValueError(
377
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
378
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
379
+ f" {negative_prompt_embeds.shape}."
380
+ )
381
+
382
+ # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
383
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
384
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
385
+ if isinstance(generator, list) and len(generator) != batch_size:
386
+ raise ValueError(
387
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
388
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
389
+ )
390
+
391
+ if latents is None:
392
+ if isinstance(generator, np.random.RandomState):
393
+ latents = generator.randn(*shape).astype(dtype)
394
+ elif isinstance(generator, torch.Generator):
395
+ latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
396
+ else:
397
+ raise ValueError(
398
+ f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
399
+ f" {type(generator)}."
400
+ )
401
+ elif latents.shape != shape:
402
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
403
+
404
+ # scale the initial noise by the standard deviation required by the scheduler
405
+ latents = latents * np.float64(self.scheduler.init_noise_sigma)
406
+
407
+ return latents
408
+
409
+ # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
410
+ def __call__(
411
+ self,
412
+ prompt: Union[str, List[str]] = "",
413
+ height: Optional[int] = None,
414
+ width: Optional[int] = None,
415
+ num_inference_steps: int = 4,
416
+ original_inference_steps: int = None,
417
+ guidance_scale: float = 8.5,
418
+ num_images_per_prompt: int = 1,
419
+ generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
420
+ latents: Optional[np.ndarray] = None,
421
+ prompt_embeds: Optional[np.ndarray] = None,
422
+ output_type: str = "pil",
423
+ return_dict: bool = True,
424
+ callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
425
+ callback_steps: int = 1,
426
+ ):
427
+ r"""
428
+ Function invoked when calling the pipeline for generation.
429
+
430
+ Args:
431
+ prompt (`Optional[Union[str, List[str]]]`, defaults to None):
432
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
433
+ instead.
434
+ height (`Optional[int]`, defaults to None):
435
+ The height in pixels of the generated image.
436
+ width (`Optional[int]`, defaults to None):
437
+ The width in pixels of the generated image.
438
+ num_inference_steps (`int`, defaults to 50):
439
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
440
+ expense of slower inference.
441
+ guidance_scale (`float`, defaults to 7.5):
442
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
443
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
444
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
445
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
446
+ usually at the expense of lower image quality.
447
+ num_images_per_prompt (`int`, defaults to 1):
448
+ The number of images to generate per prompt.
449
+ generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
450
+ A np.random.RandomState to make generation deterministic.
451
+ latents (`Optional[np.ndarray]`, defaults to `None`):
452
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
453
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
454
+ tensor will ge generated by sampling using the supplied random `generator`.
455
+ prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
456
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
457
+ provided, text embeddings will be generated from `prompt` input argument.
458
+ output_type (`str`, defaults to `"pil"`):
459
+ The output format of the generate image. Choose between
460
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
461
+ return_dict (`bool`, defaults to `True`):
462
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
463
+ plain tuple.
464
+ callback (Optional[Callable], defaults to `None`):
465
+ A function that will be called every `callback_steps` steps during inference. The function will be
466
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
467
+ callback_steps (`int`, defaults to 1):
468
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
469
+ called at every step.
470
+ guidance_rescale (`float`, defaults to 0.0):
471
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
472
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
473
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
474
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
475
+
476
+ Returns:
477
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
478
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
479
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
480
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
481
+ (nsfw) content, according to the `safety_checker`.
482
+ """
483
+ height = height or self.unet.config["sample_size"] * self.vae_scale_factor
484
+ width = width or self.unet.config["sample_size"] * self.vae_scale_factor
485
+
486
+ # Don't need to get negative prompts due to LCM guided distillation
487
+ negative_prompt = None
488
+ negative_prompt_embeds = None
489
+
490
+ # check inputs. Raise error if not correct
491
+ self.check_inputs(
492
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
493
+ )
494
+
495
+ # define call parameters
496
+ if isinstance(prompt, str):
497
+ batch_size = 1
498
+ elif isinstance(prompt, list):
499
+ batch_size = len(prompt)
500
+ else:
501
+ batch_size = prompt_embeds.shape[0]
502
+
503
+ if generator is None:
504
+ generator = np.random.RandomState()
505
+
506
+ start_time = time.time()
507
+ prompt_embeds = self._encode_prompt(
508
+ prompt,
509
+ num_images_per_prompt,
510
+ False,
511
+ negative_prompt,
512
+ prompt_embeds=prompt_embeds,
513
+ negative_prompt_embeds=negative_prompt_embeds,
514
+ )
515
+ encode_prompt_time = time.time() - start_time
516
+ print(f"Prompt encoding time: {encode_prompt_time:.2f}s")
517
+
518
+ # set timesteps
519
+ self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
520
+ timesteps = self.scheduler.timesteps
521
+
522
+ latents = self.prepare_latents(
523
+ batch_size * num_images_per_prompt,
524
+ self.unet.config["in_channels"],
525
+ height,
526
+ width,
527
+ prompt_embeds.dtype,
528
+ generator,
529
+ latents,
530
+ )
531
+
532
+ bs = batch_size * num_images_per_prompt
533
+ # get Guidance Scale Embedding
534
+ w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
535
+ w_embedding = self.get_guidance_scale_embedding(
536
+ w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
537
+ )
538
+
539
+ # Adapted from diffusers to extend it for other runtimes than ORT
540
+ timestep_dtype = np.int64
541
+
542
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
543
+ inference_start = time.time()
544
+ for i, t in enumerate(self.progress_bar(timesteps)):
545
+ timestep = np.array([t], dtype=timestep_dtype)
546
+ noise_pred = self.unet(
547
+ sample=latents,
548
+ timestep=timestep,
549
+ encoder_hidden_states=prompt_embeds,
550
+ timestep_cond=w_embedding,
551
+ )[0]
552
+
553
+ # compute the previous noisy sample x_t -> x_t-1
554
+ latents, denoised = self.scheduler.step(
555
+ torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
556
+ )
557
+ latents, denoised = latents.numpy(), denoised.numpy()
558
+
559
+ # call the callback, if provided
560
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
561
+ if callback is not None and i % callback_steps == 0:
562
+ callback(i, t, latents)
563
+ inference_time = time.time() - inference_start
564
+ print(f"Inference time: {inference_time:.2f}s")
565
+
566
+ decode_start = time.time()
567
+ if output_type == "latent":
568
+ image = denoised
569
+ has_nsfw_concept = None
570
+ else:
571
+ denoised /= self.vae_decoder.config["scaling_factor"]
572
+ # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
573
+ image = np.concatenate(
574
+ [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
575
+ )
576
+ # image, has_nsfw_concept = self.run_safety_checker(image)
577
+ has_nsfw_concept = None # skip safety checker
578
+
579
+ if has_nsfw_concept is None:
580
+ do_denormalize = [True] * image.shape[0]
581
+ else:
582
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
583
+
584
+ image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
585
+ decode_time = time.time() - decode_start
586
+ print(f"Decode time: {decode_time:.2f}s")
587
+
588
+ total_time = encode_prompt_time + inference_time + decode_time
589
+ print(f"Total time: {total_time:.2f}s")
590
+
591
+ if not return_dict:
592
+ return (image, has_nsfw_concept)
593
+
594
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
595
+
596
+
597
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
    """Build sinusoidal embeddings for the LCM guidance-scale conditioning.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`np.ndarray`): guidance-scale values, one per batch element.
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate.
        dtype: data type of the generated embeddings.

    Returns:
        `np.ndarray` of shape `(len(w), embedding_dim)`.
    """
    scaled_w = w * 1000
    half_dim = embedding_dim // 2
    # Geometric frequency ladder, identical to the VDM reference implementation.
    log_scale = np.log(10000.0) / (half_dim - 1)
    frequencies = np.exp(np.arange(half_dim, dtype=dtype) * -log_scale)
    angles = scaled_w[:, None] * frequencies[None, :]
    embedding = np.concatenate([np.sin(angles), np.cos(angles)], axis=1)

    if embedding_dim % 2 == 1:
        # Odd target dimension: zero-pad the final column.
        embedding = np.pad(embedding, [(0, 0), (0, 1)])

    assert embedding.shape == (w.shape[0], embedding_dim)
    return embedding
625
+
626
def get_image_path(args, **override_kwargs):
    """Create the output folder and return a PNG path encoding run metadata.

    Args:
        args: CLI namespace; reads `args.o` (output root), `args.prompt`,
            `args.seed` and `args.num_inference_steps`.
        **override_kwargs: optional `seed` / `num_inference_steps` overrides.

    Returns:
        Path of the output PNG inside a folder named after the prompt.
    """
    prompt_folder = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_folder)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get("seed", None) or args.seed
    steps = override_kwargs.get("num_inference_steps", None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}"

    return os.path.join(out_folder, out_fname + ".png")
638
+
639
+
640
def prepare_controlnet_cond(image_path, height, width):
    """Load an RGB image and return a CHW float array scaled to [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the original passed
    # (height, width), which swapped the output axes for non-square targets.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
645
+
646
#args.prompt seed=4234924 i=model_path o=output_path size=256x256 num_inference_steps guidance_scale
def generate_png_bytes(args):
    """Run the RKNN latent-consistency pipeline once and return the image as PNG bytes.

    Expects `args` to provide: `prompt`, `seed`, `i` (model directory),
    `size` (a "HxW" string), `num_inference_steps` and `guidance_scale`.
    """
    logger.info(f"Setting random seed to {args.seed}")

    # The scheduler is rebuilt from the JSON config shipped alongside the model.
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        scheduler_config = json.load(f)

    user_specified_scheduler = LCMScheduler.from_config(scheduler_config)

    # NOTE(review): the pipeline (and every RKNN2Model) is re-created on each
    # call; if this runs per-request, model loading dominates latency — confirm
    # whether the caller caches the pipeline.
    pipe = RKNN2LatentConsistencyPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")

    # `size` is parsed as "<height>x<width>".
    result = pipe(
        prompt=args.prompt,
        height=int(args.size.split("x")[0]),
        width=int(args.size.split("x")[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    # The pipeline returns a StableDiffusionPipelineOutput; take the first image.
    pil_image = result["images"][0]

    # Convert to PNG bytes
    buf = io.BytesIO()
    pil_image.save(buf, format="PNG")
    buf.seek(0)

    return buf.getvalue()
run_onnx-lcm.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ import PIL
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
9
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
10
+ from diffusers.schedulers import (
11
+ LCMScheduler
12
+ )
13
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
14
+
15
+ import gc
16
+ import inspect
17
+
18
+ import logging
19
+
20
+ logging.basicConfig()
21
+ logger = logging.getLogger(__name__)
22
+ logger.setLevel(logging.INFO)
23
+
24
+ import numpy as np
25
+ import os
26
+
27
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
28
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
29
+ from typing import Callable, List, Optional, Union, Tuple
30
+ from PIL import Image
31
+
32
+ # from rknnlite.api import RKNNLite
33
+
34
+ # class RKNN2Model:
35
+ # """ Wrapper for running RKNPU2 models """
36
+
37
+ # def __init__(self, model_path):
38
+
39
+ # logger.info(f"Loading {model_path}")
40
+
41
+ # start = time.time()
42
+ # assert os.path.exists(model_path) and model_path.endswith(".rknn")
43
+ # self.rknnlite = RKNNLite()
44
+ # self.rknnlite.load_rknn(model_path)
45
+ # self.rknnlite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO) # Multi-core will cause kernel crash
46
+ # load_time = time.time() - start
47
+ # logger.info(f"Done. Took {load_time:.1f} seconds.")
48
+ # self.modelname = model_path.split("/")[-1]
49
+ # self.inference_time = 0
50
+
51
+ # def __call__(self, **kwargs) -> List[np.ndarray]:
52
+ # np.savez(f"{self.modelname}_input_{self.inference_time}.npz", **kwargs)
53
+ # #print(kwargs)
54
+ # input_list = [value for key, value in kwargs.items()]
55
+ # for i, input in enumerate(input_list):
56
+ # if isinstance(input, np.ndarray):
57
+ # print(f"input {i} shape: {input.shape}")
58
+ # results = self.rknnlite.inference(inputs=input_list)
59
+ # for res in results:
60
+ # print(f"output shape: {res.shape}")
61
+ # return results
62
+
63
+ import onnxruntime as ort
64
+
65
class RKNN2Model:
    """Wrapper for running ONNX models with a uniform call interface.

    Despite the RKNN2 name, this variant runs models via ONNX Runtime so the
    same pipeline code works off-device.
    """

    def __init__(self, model_dir):
        """Load `model_dir/model.onnx` and its sibling `config.json`.

        Args:
            model_dir: directory containing `model.onnx` and `config.json`.
        """
        logger.info(f"Loading {model_dir}")
        start = time.time()
        assert os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "model.onnx"))
        # Use a context manager so the config file handle is closed
        # deterministically (the original `json.load(open(...))` leaked it).
        with open(os.path.join(model_dir, "config.json")) as config_file:
            self.config = json.load(config_file)
        self.session = ort.InferenceSession(os.path.join(model_dir, "model.onnx"))
        load_time = time.time() - start
        logger.info(f"Done. Took {load_time:.1f} seconds.")
        self.modelname = model_dir.split("/")[-1]
        # Counts how many times __call__ has run (used for debug dump naming).
        self.inference_time = 0

    def __call__(self, **kwargs) -> List[np.ndarray]:
        """Run one inference; kwargs map ONNX input names to numpy arrays.

        Returns:
            List of numpy arrays, one per model output.
        """
        self.inference_time += 1
        # ONNX Runtime already returns a list of numpy arrays; the original
        # built an identical copy in `results_list` and then discarded it.
        return self.session.run(None, kwargs)
87
+
88
+ class RKNN2StableDiffusionPipeline(DiffusionPipeline):
89
+ """ RKNN2 version of
90
+ `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline`
91
+ """
92
+
93
def __init__(
    self,
    text_encoder: RKNN2Model,
    unet: RKNN2Model,
    vae_decoder: RKNN2Model,
    scheduler: LCMScheduler,
    tokenizer: CLIPTokenizer,
    force_zeros_for_empty_prompt: Optional[bool] = True,
    feature_extractor: Optional[CLIPFeatureExtractor] = None,
    text_encoder_2: Optional[RKNN2Model] = None,
    tokenizer_2: Optional[CLIPTokenizer] = None

):
    """Assemble the ONNX-backed LCM Stable Diffusion pipeline.

    Only `tokenizer`, `scheduler` and `feature_extractor` go through the
    diffusers module registry; the model wrappers are held as plain
    attributes. Output resolution is fixed by the hardcoded 32x32 latent size.
    """
    super().__init__()

    # Register non-Core ML components of the pipeline similar to the original pipeline
    self.register_modules(
        tokenizer=tokenizer,
        scheduler=scheduler,
        feature_extractor=feature_extractor,
    )
    self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
    self.safety_checker = None  # safety checking is intentionally disabled

    # Register Core ML components of the pipeline
    self.text_encoder = text_encoder
    self.text_encoder_2 = text_encoder_2
    self.tokenizer_2 = tokenizer_2
    self.unet = unet
    self.vae_decoder = vae_decoder

    VAE_DECODER_UPSAMPLE_FACTOR = 8

    # In PyTorch, users can determine the tensor shapes dynamically by default
    # In CoreML, tensors have static shapes unless flexible shapes were used during export
    # See https://coremltools.readme.io/docs/flexible-inputs
    latent_h, latent_w = 32, 32  # hallo1: FIXME: hardcoded value
    self.height = latent_h * VAE_DECODER_UPSAMPLE_FACTOR
    self.width = latent_w * VAE_DECODER_UPSAMPLE_FACTOR
    self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
    logger.info(
        f"Stable Diffusion configured to generate {self.height}x{self.width} images"
    )
136
+
137
+ @staticmethod
138
+ def postprocess(
139
+ image: np.ndarray,
140
+ output_type: str = "pil",
141
+ do_denormalize: Optional[List[bool]] = None,
142
+ ):
143
+ def numpy_to_pil(images: np.ndarray):
144
+ """
145
+ Convert a numpy image or a batch of images to a PIL image.
146
+ """
147
+ if images.ndim == 3:
148
+ images = images[None, ...]
149
+ images = (images * 255).round().astype("uint8")
150
+ if images.shape[-1] == 1:
151
+ # special case for grayscale (single channel) images
152
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
153
+ else:
154
+ pil_images = [Image.fromarray(image) for image in images]
155
+
156
+ return pil_images
157
+
158
+ def denormalize(images: np.ndarray):
159
+ """
160
+ Denormalize an image array to [0,1].
161
+ """
162
+ return np.clip(images / 2 + 0.5, 0, 1)
163
+
164
+ if not isinstance(image, np.ndarray):
165
+ raise ValueError(
166
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
167
+ )
168
+ if output_type not in ["latent", "np", "pil"]:
169
+ deprecation_message = (
170
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
171
+ "`pil`, `np`, `pt`, `latent`"
172
+ )
173
+ logger.warning(deprecation_message)
174
+ output_type = "np"
175
+
176
+ if output_type == "latent":
177
+ return image
178
+
179
+ if do_denormalize is None:
180
+ raise ValueError("do_denormalize is required for postprocessing")
181
+
182
+ image = np.stack(
183
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
184
+ )
185
+ image = image.transpose((0, 2, 3, 1))
186
+
187
+ if output_type == "pil":
188
+ image = numpy_to_pil(image)
189
+
190
+ return image
191
+
192
def _encode_prompt(
    self,
    prompt: Union[str, List[str]],
    num_images_per_prompt: int,
    do_classifier_free_guidance: bool,
    negative_prompt: Optional[Union[str, list]],
    prompt_embeds: Optional[np.ndarray] = None,
    negative_prompt_embeds: Optional[np.ndarray] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`Union[str, List[str]]`):
            prompt to be encoded
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not (the LCM
            `__call__` always passes False here)
        negative_prompt (`Optional[Union[str, list]]`):
            The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
            if `guidance_scale` is less than `1`).
        prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated negative text embeddings. If not provided,
            negative_prompt_embeds will be generated from `negative_prompt`.
    """
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # get prompt text embeddings; pad/truncate to the CLIP context length
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

        # Warn the user about any prompt text dropped by truncation.
        if not np.array_equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        # int32 cast matches the exported ONNX text encoder's input dtype.
        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

    prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="np",
        )
        negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

    if do_classifier_free_guidance:
        negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
294
+
295
# Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
def check_inputs(
    self,
    prompt: Union[str, List[str]],
    height: Optional[int],
    width: Optional[int],
    callback_steps: int,
    negative_prompt: Optional[str] = None,
    prompt_embeds: Optional[np.ndarray] = None,
    negative_prompt_embeds: Optional[np.ndarray] = None,
):
    """Validate generation arguments; raises ValueError/TypeError on misuse."""
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `callback_steps` must be a strictly positive integer (None is rejected too).
    valid_callback_steps = isinstance(callback_steps, int) and callback_steps > 0
    if not valid_callback_steps:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    # Exactly one of `prompt` / `prompt_embeds` must be supplied.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
342
+
343
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
    """Sample (or validate) initial latents and scale by the scheduler's init sigma."""
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        # Caller supplied latents: only the shape needs validating.
        if latents.shape != shape:
            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
    elif isinstance(generator, np.random.RandomState):
        latents = generator.randn(*shape).astype(dtype)
    elif isinstance(generator, torch.Generator):
        latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
    else:
        raise ValueError(
            f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
            f" {type(generator)}."
        )

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * np.float64(self.scheduler.init_noise_sigma)
369
+
370
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def __call__(
    self,
    prompt: Union[str, List[str]] = "",
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 4,
    original_inference_steps: int = None,
    guidance_scale: float = 8.5,
    num_images_per_prompt: int = 1,
    generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
    latents: Optional[np.ndarray] = None,
    prompt_embeds: Optional[np.ndarray] = None,
    output_type: str = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
    callback_steps: int = 1,
):
    r"""Generate images for `prompt` with the latent-consistency sampling loop.

    Args:
        prompt (`Optional[Union[str, List[str]]]`, defaults to ""):
            The prompt or prompts to guide generation. If not defined, pass
            `prompt_embeds` instead.
        height (`Optional[int]`, defaults to None):
            The height in pixels of the generated image; defaults to the UNet
            sample size times the VAE scale factor.
        width (`Optional[int]`, defaults to None):
            The width in pixels of the generated image; same default rule.
        num_inference_steps (`int`, defaults to 4):
            Number of LCM denoising steps; more steps trade speed for quality.
        original_inference_steps (`int`, defaults to None):
            Length of the original schedule the LCM timesteps are sub-sampled
            from; the scheduler default is used when None.
        guidance_scale (`float`, defaults to 8.5):
            Classifier-free guidance weight. LCM embeds `w = guidance_scale - 1`
            via `get_guidance_scale_embedding` instead of a second UNet pass,
            so no negative prompt is used.
        num_images_per_prompt (`int`, defaults to 1):
            The number of images to generate per prompt.
        generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
            Random source that makes generation deterministic.
        latents (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated noisy latents; sampled from `generator` if omitted.
        prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated text embeddings, used instead of encoding `prompt`.
        output_type (`str`, defaults to `"pil"`):
            "pil", "np" or "latent".
        return_dict (`bool`, defaults to `True`):
            Return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`]
            when True, otherwise a plain tuple.
        callback (Optional[Callable], defaults to `None`):
            Called as `callback(step, timestep, latents)` during inference.
        callback_steps (`int`, defaults to 1):
            Frequency (in steps) at which `callback` is invoked.

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
        the generated images plus nsfw flags; the flags are always None here
        because the safety checker is disabled.
    """
    height = height or self.unet.config["sample_size"] * self.vae_scale_factor
    width = width or self.unet.config["sample_size"] * self.vae_scale_factor

    # Don't need to get negative prompts due to LCM guided distillation
    negative_prompt = None
    negative_prompt_embeds = None

    # check inputs. Raise error if not correct
    self.check_inputs(
        prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
    )

    # define call parameters
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if generator is None:
        generator = np.random.RandomState()

    # Classifier-free guidance is always off (False) for LCM.
    prompt_embeds = self._encode_prompt(
        prompt,
        num_images_per_prompt,
        False,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # set timesteps
    self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
    timesteps = self.scheduler.timesteps

    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        self.unet.config["in_channels"],
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    bs = batch_size * num_images_per_prompt
    # get Guidance Scale Embedding
    w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
    w_embedding = self.get_guidance_scale_embedding(
        w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
    )

    # Adapted from diffusers to extend it for other runtimes than ORT
    timestep_dtype = np.int64

    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    for i, t in enumerate(self.progress_bar(timesteps)):
        timestep = np.array([t], dtype=timestep_dtype)
        noise_pred = self.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=prompt_embeds,
            timestep_cond=w_embedding,
        )[0]

        # compute the previous noisy sample x_t -> x_t-1
        # (the scheduler step runs in torch; arrays are round-tripped per step)
        latents, denoised = self.scheduler.step(
            torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
        )
        latents, denoised = latents.numpy(), denoised.numpy()

        # call the callback, if provided
        if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

    if output_type == "latent":
        image = denoised
        has_nsfw_concept = None
    else:
        denoised /= self.vae_decoder.config["scaling_factor"]
        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
        image = np.concatenate(
            [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
        )
        # image, has_nsfw_concept = self.run_safety_checker(image)
        has_nsfw_concept = None  # skip safety checker

    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
544
+
545
+
546
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
    """Sinusoidal embedding of the LCM guidance-scale conditioning vector.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`np.ndarray`): guidance-scale values, one per batch element.
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate.
        dtype: data type of the generated embeddings.

    Returns:
        `np.ndarray` of shape `(len(w), embedding_dim)`.
    """
    amplified = w * 1000
    half = embedding_dim // 2
    # Same geometric frequency spacing as the VDM reference code.
    decay = np.log(10000.0) / (half - 1)
    freq = np.exp(np.arange(half, dtype=dtype) * -decay)
    phase = amplified[:, None] * freq[None, :]
    out = np.concatenate([np.sin(phase), np.cos(phase)], axis=1)

    if embedding_dim % 2 == 1:
        # zero-pad the trailing column for odd dimensions
        out = np.pad(out, [(0, 0), (0, 1)])

    assert out.shape == (w.shape[0], embedding_dim)
    return out
574
+
575
def get_image_path(args, **override_kwargs):
    """Create the output folder (named after the prompt) and return a PNG path
    whose filename encodes the generation metadata (seed, steps, backend)."""
    prompt_slug = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_slug)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get('seed', None) or args.seed
    steps = override_kwargs.get('num_inference_steps', None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}" + "_onnx_"

    return os.path.join(out_folder, out_fname + ".png")
588
+
589
+
590
def prepare_controlnet_cond(image_path, height, width):
    """Load a conditioning image and convert it to a CHW float array in [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the previous code passed
    # (height, width), which transposed the target size for non-square images.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
595
+
596
+
597
def main(args):
    """Assemble the RKNN Stable Diffusion pipeline from the model directory
    and generate a single image for the CLI arguments."""
    logger.info(f"Setting random seed to {args.seed}")

    # load scheduler from /scheduler/scheduler_config.json
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        user_specified_scheduler = LCMScheduler.from_config(json.load(f))

    print("user_specified_scheduler", user_specified_scheduler)

    pipe = RKNN2StableDiffusionPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")
    # args.size is "HxW"; extra components beyond the first two are ignored.
    dims = args.size.split("x")
    result = pipe(
        prompt=args.prompt,
        height=int(dims[0]),
        width=int(dims[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    out_path = get_image_path(args)
    logger.info(f"Saving generated image to {out_path}")
    result["images"][0].save(out_path)
629
+
630
+
631
if __name__ == "__main__":
    # CLI entry point: parse generation options and run the pipeline once.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True,
                        help="The text prompt to be used for text-to-image generation.")
    parser.add_argument("-i", required=True, help=("Path to model directory"))
    parser.add_argument("-o", required=True)
    parser.add_argument("--seed", default=93, type=int,
                        help="Random seed to be able to reproduce results")
    parser.add_argument("-s", "--size", default="256x256", type=str, help="Image size")
    parser.add_argument("--num-inference-steps", default=4, type=int,
                        help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
    parser.add_argument("--guidance-scale", default=7.5, type=float,
                        help="Controls the influence of the text prompt on sampling process (0=random images)")

    main(parser.parse_args())
run_rknn-lcm.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ import PIL
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
9
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
10
+ from diffusers.schedulers import (
11
+ LCMScheduler
12
+ )
13
+
14
+ import logging
15
+
16
+ logging.basicConfig()
17
+ logger = logging.getLogger(__name__)
18
+ logger.setLevel(logging.INFO)
19
+
20
+ import numpy as np
21
+ import os
22
+
23
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
24
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
25
+ from typing import Callable, List, Optional, Union, Tuple
26
+ from PIL import Image
27
+
28
+ from rknnlite.api import RKNNLite
29
+
30
class RKNN2Model:
    """ Wrapper for running RKNPU2 models """

    def __init__(self, model_dir):
        # Loads `model_dir/model.rknn` onto the NPU once and keeps the runtime
        # alive for all subsequent `__call__` invocations.
        logger.info(f"Loading {model_dir}")
        start = time.time()
        self.config = json.load(open(os.path.join(model_dir, "config.json")))
        assert os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "model.rknn"))
        self.rknnlite = RKNNLite()
        self.rknnlite.load_rknn(os.path.join(model_dir, "model.rknn"))
        self.rknnlite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO) # Multi-core will cause kernel crash
        load_time = time.time() - start
        logger.info(f"Done. Took {load_time:.1f} seconds.")
        # Last path component is used as a label (e.g. "unet", "vae_decoder").
        self.modelname = model_dir.split("/")[-1]
        # Counter kept for the (currently commented-out) input-dump debugging.
        self.inference_time = 0

    def __call__(self, **kwargs) -> List[np.ndarray]:
        # Runs one inference. Inputs are passed to the RKNN runtime positionally
        # in keyword-argument order (dicts preserve insertion order), so callers
        # must supply kwargs in the order the compiled model expects.
        # np.savez(f"rknn_out/{self.modelname}_input_{self.inference_time}.npz", **kwargs)
        # self.inference_time += 1
        #print(kwargs)
        input_list = [value for key, value in kwargs.items()]
        # NOTE(review): loop variable shadows the builtin `input`; debug prints
        # below go to stdout on every call.
        for i, input in enumerate(input_list):
            if isinstance(input, np.ndarray):
                print(f"input {i} shape: {input.shape}")

        results = self.rknnlite.inference(inputs=input_list, data_format='nchw')
        for res in results:
            print(f"output shape: {res.shape}")
        return results
59
+
60
class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
    # Latent Consistency Model text-to-image pipeline whose text encoder, UNet,
    # and VAE decoder run on the Rockchip NPU via RKNN2Model wrappers.

    def __init__(
        self,
        text_encoder: RKNN2Model,
        unet: RKNN2Model,
        vae_decoder: RKNN2Model,
        scheduler: LCMScheduler,
        tokenizer: CLIPTokenizer,
        force_zeros_for_empty_prompt: Optional[bool] = True,
        feature_extractor: Optional[CLIPFeatureExtractor] = None,
        text_encoder_2: Optional[RKNN2Model] = None,
        tokenizer_2: Optional[CLIPTokenizer] = None
    ):
        super().__init__()

        # Only the torch-free components are registered with diffusers;
        # NOTE(review): the RKNN models are kept as plain attributes below,
        # presumably so DiffusionPipeline does not treat them as torch modules
        # — confirm against diffusers' register_modules contract.
        self.register_modules(
            tokenizer=tokenizer,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )
        self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
        # Safety checking is intentionally disabled in this pipeline.
        self.safety_checker = None

        self.text_encoder = text_encoder
        self.text_encoder_2 = text_encoder_2
        self.tokenizer_2 = tokenizer_2
        self.unet = unet
        self.vae_decoder = vae_decoder

        # Latent-to-pixel upsampling factor of the VAE decoder (SD VAE: 8x).
        VAE_DECODER_UPSAMPLE_FACTOR = 8
        self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
92
+
93
+ @staticmethod
94
+ def postprocess(
95
+ image: np.ndarray,
96
+ output_type: str = "pil",
97
+ do_denormalize: Optional[List[bool]] = None,
98
+ ):
99
+ def numpy_to_pil(images: np.ndarray):
100
+ """
101
+ Convert a numpy image or a batch of images to a PIL image.
102
+ """
103
+ if images.ndim == 3:
104
+ images = images[None, ...]
105
+ images = (images * 255).round().astype("uint8")
106
+ if images.shape[-1] == 1:
107
+ # special case for grayscale (single channel) images
108
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
109
+ else:
110
+ pil_images = [Image.fromarray(image) for image in images]
111
+
112
+ return pil_images
113
+
114
+ def denormalize(images: np.ndarray):
115
+ """
116
+ Denormalize an image array to [0,1].
117
+ """
118
+ return np.clip(images / 2 + 0.5, 0, 1)
119
+
120
+ if not isinstance(image, np.ndarray):
121
+ raise ValueError(
122
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
123
+ )
124
+ if output_type not in ["latent", "np", "pil"]:
125
+ deprecation_message = (
126
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
127
+ "`pil`, `np`, `pt`, `latent`"
128
+ )
129
+ logger.warning(deprecation_message)
130
+ output_type = "np"
131
+
132
+ if output_type == "latent":
133
+ return image
134
+
135
+ if do_denormalize is None:
136
+ raise ValueError("do_denormalize is required for postprocessing")
137
+
138
+ image = np.stack(
139
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
140
+ )
141
+ image = image.transpose((0, 2, 3, 1))
142
+
143
+ if output_type == "pil":
144
+ image = numpy_to_pil(image)
145
+
146
+ return image
147
+
148
    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int,
        do_classifier_free_guidance: bool,
        negative_prompt: Optional[Union[str, list]],
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`Union[str, List[str]]`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`Optional[Union[str, list]]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.

        Returns:
            `np.ndarray`: the prompt embeddings, repeated `num_images_per_prompt`
            times; with classifier-free guidance the unconditional embeddings are
            concatenated in front.
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # prompt may be None here only when prompt_embeds was supplied.
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            text_input_ids = text_inputs.input_ids
            # Re-tokenize without truncation to detect (and warn about) any part
            # of the prompt that was cut off at model_max_length.
            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

            if not np.array_equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            # Cast ids to int32 before handing them to the RKNN text encoder.
            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the unconditional input to the same sequence length as the
            # conditional embeddings so the two batches can be concatenated.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )
            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

        if do_classifier_free_guidance:
            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds
250
+
251
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
252
+ def check_inputs(
253
+ self,
254
+ prompt: Union[str, List[str]],
255
+ height: Optional[int],
256
+ width: Optional[int],
257
+ callback_steps: int,
258
+ negative_prompt: Optional[str] = None,
259
+ prompt_embeds: Optional[np.ndarray] = None,
260
+ negative_prompt_embeds: Optional[np.ndarray] = None,
261
+ ):
262
+ if height % 8 != 0 or width % 8 != 0:
263
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
264
+
265
+ if (callback_steps is None) or (
266
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
267
+ ):
268
+ raise ValueError(
269
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
270
+ f" {type(callback_steps)}."
271
+ )
272
+
273
+ if prompt is not None and prompt_embeds is not None:
274
+ raise ValueError(
275
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
276
+ " only forward one of the two."
277
+ )
278
+ elif prompt is None and prompt_embeds is None:
279
+ raise ValueError(
280
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
281
+ )
282
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
283
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
284
+
285
+ if negative_prompt is not None and negative_prompt_embeds is not None:
286
+ raise ValueError(
287
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
288
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
289
+ )
290
+
291
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
292
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
293
+ raise ValueError(
294
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
295
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
296
+ f" {negative_prompt_embeds.shape}."
297
+ )
298
+
299
+ # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
300
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
301
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
302
+ if isinstance(generator, list) and len(generator) != batch_size:
303
+ raise ValueError(
304
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
305
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
306
+ )
307
+
308
+ if latents is None:
309
+ if isinstance(generator, np.random.RandomState):
310
+ latents = generator.randn(*shape).astype(dtype)
311
+ elif isinstance(generator, torch.Generator):
312
+ latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
313
+ else:
314
+ raise ValueError(
315
+ f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
316
+ f" {type(generator)}."
317
+ )
318
+ elif latents.shape != shape:
319
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
320
+
321
+ # scale the initial noise by the standard deviation required by the scheduler
322
+ latents = latents * np.float64(self.scheduler.init_noise_sigma)
323
+
324
+ return latents
325
+
326
    # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
    def __call__(
        self,
        prompt: Union[str, List[str]] = "",
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 4,
        original_inference_steps: int = None,
        guidance_scale: float = 8.5,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
        latents: Optional[np.ndarray] = None,
        prompt_embeds: Optional[np.ndarray] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            height (`Optional[int]`, defaults to None):
                The height in pixels of the generated image.
            width (`Optional[int]`, defaults to None):
                The width in pixels of the generated image.
            num_inference_steps (`int`, defaults to 4):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            original_inference_steps (`int`, defaults to None):
                Passed through to `LCMScheduler.set_timesteps`.
            guidance_scale (`float`, defaults to 8.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality. In LCM it enters via a
                guidance-scale embedding rather than a second forward pass.
            num_images_per_prompt (`int`, defaults to 1):
                The number of images to generate per prompt.
            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
                A np.random.RandomState to make generation deterministic.
            latents (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            output_type (`str`, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (Optional[Callable], defaults to `None`):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content (always `None` here — the safety checker is skipped).
        """
        # Default to the UNet's native resolution when height/width are omitted.
        height = height or self.unet.config["sample_size"] * self.vae_scale_factor
        width = width or self.unet.config["sample_size"] * self.vae_scale_factor

        # Don't need to get negative prompts due to LCM guided distillation
        negative_prompt = None
        negative_prompt_embeds = None

        # check inputs. Raise error if not correct
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # define call parameters
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if generator is None:
            generator = np.random.RandomState()

        start_time = time.time()
        # Classifier-free guidance is disabled (False): guidance is injected via
        # the w_embedding below instead of a second unconditional forward pass.
        prompt_embeds = self._encode_prompt(
            prompt,
            num_images_per_prompt,
            False,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        encode_prompt_time = time.time() - start_time
        print(f"Prompt encoding time: {encode_prompt_time:.2f}s")

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
        timesteps = self.scheduler.timesteps

        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            self.unet.config["in_channels"],
            height,
            width,
            prompt_embeds.dtype,
            generator,
            latents,
        )

        bs = batch_size * num_images_per_prompt
        # get Guidance Scale Embedding
        w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
        w_embedding = self.get_guidance_scale_embedding(
            w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
        )

        # Adapted from diffusers to extend it for other runtimes than ORT
        timestep_dtype = np.int64

        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        inference_start = time.time()
        for i, t in enumerate(self.progress_bar(timesteps)):
            timestep = np.array([t], dtype=timestep_dtype)
            noise_pred = self.unet(
                sample=latents,
                timestep=timestep,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=w_embedding,
            )[0]

            # compute the previous noisy sample x_t -> x_t-1
            # (scheduler.step is torch-based, so round-trip through torch tensors)
            latents, denoised = self.scheduler.step(
                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
            )
            latents, denoised = latents.numpy(), denoised.numpy()

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)
        inference_time = time.time() - inference_start
        print(f"Inference time: {inference_time:.2f}s")

        decode_start = time.time()
        if output_type == "latent":
            image = denoised
            has_nsfw_concept = None
        else:
            # Undo the VAE scaling before decoding.
            denoised /= self.vae_decoder.config["scaling_factor"]
            # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
            image = np.concatenate(
                [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
            )
            # image, has_nsfw_concept = self.run_safety_checker(image)
            has_nsfw_concept = None # skip safety checker

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
        decode_time = time.time() - decode_start
        print(f"Decode time: {decode_time:.2f}s")

        total_time = encode_prompt_time + inference_time + decode_time
        print(f"Total time: {total_time:.2f}s")

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
512
+
513
+
514
    # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
        """
        Build sinusoidal guidance-scale embeddings for the LCM UNet.

        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w (`np.ndarray`):
                guidance-scale values, one per batch element
            embedding_dim (`int`, *optional*, defaults to 512):
                dimension of the embeddings to generate
            dtype:
                data type of the generated embeddings

        Returns:
            `np.ndarray`: Embedding vectors with shape `(len(w), embedding_dim)`
        """
        w = w * 1000
        half_dim = embedding_dim // 2
        # Geometric frequency ladder from 1 down to 1/10000.
        emb = np.log(10000.0) / (half_dim - 1)
        emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb)
        emb = w[:, None] * emb[None, :]
        emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1)

        if embedding_dim % 2 == 1:  # zero pad
            emb = np.pad(emb, [(0, 0), (0, 1)])

        assert emb.shape == (w.shape[0], embedding_dim)
        return emb
542
+
543
def get_image_path(args, **override_kwargs):
    """Create the output folder (named after the prompt) and return a PNG path
    whose filename encodes the generation metadata (seed, inference steps)."""
    prompt_slug = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_slug)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get('seed', None) or args.seed
    steps = override_kwargs.get('num_inference_steps', None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}"

    return os.path.join(out_folder, out_fname + ".png")
555
+
556
+
557
def prepare_controlnet_cond(image_path, height, width):
    """Load a conditioning image and convert it to a CHW float array in [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the previous code passed
    # (height, width), which transposed the target size for non-square images.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
562
+
563
+
564
def main(args):
    """Assemble the RKNN Latent Consistency pipeline from the model directory
    and generate a single image for the CLI arguments."""
    logger.info(f"Setting random seed to {args.seed}")

    # load scheduler from /scheduler/scheduler_config.json
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        user_specified_scheduler = LCMScheduler.from_config(json.load(f))

    print("user_specified_scheduler", user_specified_scheduler)

    pipe = RKNN2LatentConsistencyPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")
    # args.size is "HxW"; extra components beyond the first two are ignored.
    dims = args.size.split("x")
    result = pipe(
        prompt=args.prompt,
        height=int(dims[0]),
        width=int(dims[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    out_path = get_image_path(args)
    logger.info(f"Saving generated image to {out_path}")
    result["images"][0].save(out_path)
596
+
597
+
598
if __name__ == "__main__":
    # CLI entry point: parse generation options and run the pipeline once.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True,
                        help="The text prompt to be used for text-to-image generation.")
    parser.add_argument("-i", required=True, help=("Path to model directory"))
    parser.add_argument("-o", required=True)
    parser.add_argument("--seed", default=93, type=int,
                        help="Random seed to be able to reproduce results")
    parser.add_argument("-s", "--size", default="256x256", type=str, help="Image size")
    parser.add_argument("--num-inference-steps", default=4, type=int,
                        help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
    parser.add_argument("--guidance-scale", default=7.5, type=float,
                        help="Controls the influence of the text prompt on sampling process (0=random images)")

    main(parser.parse_args())
runner.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Launch the LCM Stable Diffusion server container.
# --network host + --privileged: NOTE(review) presumably required for direct
#   access to the Rockchip NPU device nodes on the host — confirm.
# The model directory is bind-mounted read-only at /models and the server is
# configured via environment variables (PORT, NUM_WORKERS, QUEUE_MAX, MODEL_ROOT).
docker run --rm -it \
  --name lcm-sd \
  --network host \
  --privileged \
  -e PORT=4200 \
  -e NUM_WORKERS=1 \
  -e QUEUE_MAX=8 \
  -e MODEL_ROOT=/models \
  -v "$PWD/model:/models:ro" \
  lcm-sd