darkbit1001 commited on
Commit
be0e7db
·
1 Parent(s): c40da80

refactor for lcm server using uvicorn

Browse files
Files changed (47) hide show
  1. .dockerignore +20 -0
  2. .gitattributes +15 -0
  3. .gitignore +19 -0
  4. Dockerfile +32 -0
  5. README.md +77 -0
  6. convert-onnx-to-rknn.py +120 -0
  7. lcm_server.py +316 -0
  8. model/.gitattributes +36 -0
  9. model/Assets/Icon.png +3 -0
  10. model/Assets/LCM-Dreamshaper-V7-ONNX.json +35 -0
  11. model/Assets/OnnxStack - 640x320.png +0 -0
  12. model/Assets/Preview1.png +3 -0
  13. model/Assets/Preview2.png +3 -0
  14. model/Assets/Preview3.png +3 -0
  15. model/Assets/Preview4.png +3 -0
  16. model/Assets/Preview5.png +3 -0
  17. model/Assets/Preview6.png +3 -0
  18. model/Assets/lcm_angel_30_7.5_2092464983.png +3 -0
  19. model/Assets/lcm_car_30_7.5_2092464983.png +3 -0
  20. model/Assets/lcm_demon_30_7.5_2092464983.png +3 -0
  21. model/Assets/lcm_ninja_30_7.5_2092464983.png +3 -0
  22. model/README.md +56 -0
  23. model/feature_extractor/preprocessor_config.json +28 -0
  24. model/model_index.json +34 -0
  25. model/scheduler/scheduler_config.json +20 -0
  26. model/text_encoder/config.json +25 -0
  27. model/text_encoder/model.onnx +3 -0
  28. model/text_encoder/model.rknn +3 -0
  29. model/tokenizer/merges.txt +0 -0
  30. model/tokenizer/model.onnx +3 -0
  31. model/tokenizer/special_tokens_map.json +30 -0
  32. model/tokenizer/tokenizer_config.json +31 -0
  33. model/tokenizer/vocab.json +0 -0
  34. model/unet/config.json +68 -0
  35. model/unet/model.onnx +3 -0
  36. model/unet/model.onnx_data +3 -0
  37. model/unet/model.rknn +3 -0
  38. model/vae_decoder/config.json +32 -0
  39. model/vae_decoder/model.onnx +3 -0
  40. model/vae_decoder/model.rknn +3 -0
  41. model/vae_encoder/config.json +32 -0
  42. model/vae_encoder/model.onnx +3 -0
  43. requirements.txt +8 -0
  44. rknnlcm.py +682 -0
  45. run_onnx-lcm.py +665 -0
  46. run_rknn-lcm.py +632 -0
  47. runner.sh +10 -0
.dockerignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .git/*
3
+ model
4
+ model/*
5
+ __pycache__
6
+ **/__pycache__
7
+
8
+ images
9
+ images/*
10
+
11
+ .aider*
12
+
13
+ *.pyc
14
+ *.pyo
15
+ *.pyd
16
+ .env
17
+ .venv
18
+ venv
19
+ dist
20
+ build
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model/text_encoder/model.rknn filter=lfs diff=lfs merge=lfs -text
37
+ model/unet/model.onnx_data filter=lfs diff=lfs merge=lfs -text
38
+ model/unet/model.rknn filter=lfs diff=lfs merge=lfs -text
39
+ model/vae_decoder/model.rknn filter=lfs diff=lfs merge=lfs -text
40
+ model/Assets/Icon.png filter=lfs diff=lfs merge=lfs -text
41
+ model/Assets/Preview1.png filter=lfs diff=lfs merge=lfs -text
42
+ model/Assets/Preview2.png filter=lfs diff=lfs merge=lfs -text
43
+ model/Assets/Preview3.png filter=lfs diff=lfs merge=lfs -text
44
+ model/Assets/Preview4.png filter=lfs diff=lfs merge=lfs -text
45
+ model/Assets/Preview5.png filter=lfs diff=lfs merge=lfs -text
46
+ model/Assets/Preview6.png filter=lfs diff=lfs merge=lfs -text
47
+ model/Assets/lcm_angel_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
48
+ model/Assets/lcm_car_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
49
+ model/Assets/lcm_demon_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
50
+ model/Assets/lcm_ninja_30_7.5_2092464983.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @eaDir/
2
+ .DS_Store
3
+ __pycache__
4
+ **/__pycache__
5
+
6
+ images
7
+ images/*
8
+
9
+ .aider*
10
+
11
+ *.pyc
12
+ *.pyo
13
+ *.pyd
14
+ .env
15
+ .venv
16
+ venv
17
+ dist
18
+ build
19
+
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # System deps (Pillow, RKNN, etc.)
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ libglib2.0-0 \
6
+ libsm6 \
7
+ libxext6 \
8
+ libxrender1 \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ WORKDIR /app
12
+
13
+ # Python deps
14
+ COPY requirements.txt /app/requirements.txt
15
+ RUN python -m venv /venv && \
16
+ /venv/bin/pip install --no-cache-dir -U pip setuptools wheel && \
17
+ /venv/bin/pip install --no-cache-dir -r /app/requirements.txt
18
+
19
+ # 👇 THIS copies *everything except what .dockerignore excludes*
20
+ COPY . /app
21
+
22
+ ENV PATH="/venv/bin:$PATH"
23
+ ENV PYTHONUNBUFFERED=1
24
+
25
+ ENV PORT=4200
26
+ ENV NUM_WORKERS=1
27
+ ENV QUEUE_MAX=8
28
+ ENV MODEL_ROOT=/models
29
+
30
+ EXPOSE 4200
31
+
32
+ CMD ["uvicorn", "lcm_server:app", "--host", "0.0.0.0", "--port", "4200", "--no-access-log"]
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - TheyCallMeHex/LCM-Dreamshaper-V7-ONNX
4
+ tags:
5
+ - rknn
6
+ - LCM
7
+ - stable-diffusion
8
+ ---
9
+ # Stable Diffusion 1.5 Latent Consistency Model for RKNN2
10
+
11
+ Run the Stable Diffusion 1.5 LCM image generation model using RKNPU2!
12
+
13
+ - Inference speed (RK3588, single NPU core):
14
+ - 384x384: Text encoder 0.05s + U-Net 2.36s/it + VAE Decoder 5.48s
15
+ - 512x512: Text encoder 0.05s + U-Net 5.65s/it + VAE Decoder 11.13s
16
+ - Memory usage:
17
+ - 384x384: About 5.2GB
18
+ - 512x512: About 5.6GB
19
+
20
+ ## Usage
21
+
22
+ ### 1. Clone or download this repository to your local machine
23
+
24
+ ### 2. Install dependencies
25
+
26
+ ```bash
27
+ pip install diffusers pillow "numpy<2" rknn-toolkit-lite2
28
+ ```
29
+
30
+ ### 3. Run
31
+
32
+ ```bash
33
+ python ./run_rknn-lcm.py -i ./model -o ./images --num-inference-steps 4 -s 512x512 --prompt "Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style."
34
+ ```
35
+
36
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/6319d0860d7478ae0069cd92/50jwBxv0Edf7x0WoHmpwi.png)
37
+
38
+
39
+ ## Model Conversion
40
+
41
+ ### Install dependencies
42
+
43
+ ```bash
44
+ pip install diffusers pillow "numpy<2" rknn-toolkit2
45
+ ```
46
+
47
+ ### 1. Download the model
48
+
49
+ Download a Stable Diffusion 1.5 LCM model in ONNX format and place it in the `./model` directory.
50
+
51
+ ```bash
52
+ huggingface-cli download TheyCallMeHex/LCM-Dreamshaper-V7-ONNX
53
+ cp -r -L ~/.cache/huggingface/hub/models--TheyCallMeHex--LCM-Dreamshaper-V7-ONNX/snapshots/4029a217f9cdc0437f395738d3ab686bb910ceea ./model
54
+ ```
55
+
56
+ In theory, you could also achieve LCM inference by merging the LCM Lora into a regular Stable Diffusion 1.5 model and then converting it to ONNX format. However, I'm not sure how to do this. If anyone knows, please feel free to submit a PR.
57
+
58
+ ### 2. Convert the model
59
+
60
+ ```bash
61
+ # Convert the model, 384x384 resolution
62
+ python ./convert-onnx-to-rknn.py -m ./model -r 384x384
63
+ ```
64
+
65
+ Note that the higher the resolution, the larger the model and the longer the conversion time. It's not recommended to use very high resolutions.
66
+
67
+ ## Known Issues
68
+
69
+ 1. ~~As of now, models converted using the latest version of rknn-toolkit2 (version 2.2.0) still suffer from severe precision loss, even when using fp16 data type. As shown in the image, the top is the result of inference using the ONNX model, and the bottom is the result using the RKNN model. All parameters are the same. Moreover, the higher the resolution, the more severe the precision loss. This is a bug in rknn-toolkit2.~~ (Fixed in v2.3.0)
70
+
71
+ 2. Actually, the model conversion script can select multiple resolutions (e.g., "384x384,256x256"), but this causes the model conversion to fail. This is a bug in rknn-toolkit2.
72
+
73
+ ## References
74
+
75
+ - [TheyCallMeHex/LCM-Dreamshaper-V7-ONNX](https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX)
76
+ - [Optimum's LatentConsistencyPipeline](https://github.com/huggingface/optimum/blob/main/optimum/pipelines/diffusers/pipeline_latent_consistency.py)
77
+ - [happyme531/RK3588-stable-diffusion-GPU](https://github.com/happyme531/RK3588-stable-diffusion-GPU)
convert-onnx-to-rknn.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from typing import List
5
+ from rknn.api import RKNN
6
+ from math import exp
7
+ from sys import exit
8
+ import argparse
9
+
10
+
11
+ def convert_pipeline_component(onnx_path: str, resolution_list: List[List[int]], target_platform: str = 'rk3588'):
12
+ print(f'Converting {onnx_path} to RKNN model')
13
+ print(f'with target platform {target_platform}')
14
+ print(f'with resolutions:')
15
+ for res in resolution_list:
16
+ print(f'- {res[0]}x{res[1]}')
17
+ use_dynamic_shape = False
18
+ if(len(resolution_list) > 1):
19
+ print("Warning: RKNN dynamic shape support is probably broken, may throw errors")
20
+ use_dynamic_shape = True
21
+
22
+ batch_size = 1
23
+ LATENT_RESIZE_FACTOR = 8
24
+ # build shape list
25
+ if "text_encoder" in onnx_path:
26
+ input_size_list = [[[1,77]]]
27
+ inputs=['input_ids']
28
+ use_dynamic_shape = False
29
+ elif "unet" in onnx_path:
30
+ # batch_size = 2 # for classifier free guidance # broken for rknn python api
31
+
32
+ input_size_list = []
33
+ for res in resolution_list:
34
+ input_size_list.append(
35
+ [[1,4, res[0]//LATENT_RESIZE_FACTOR, res[1]//LATENT_RESIZE_FACTOR],
36
+ [1],
37
+ [1, 77, 768],
38
+ [1, 256]]
39
+ )
40
+ inputs=['sample','timestep','encoder_hidden_states','timestep_cond']
41
+ elif "vae_decoder" in onnx_path:
42
+ input_size_list = []
43
+ for res in resolution_list:
44
+ input_size_list.append(
45
+ [[1,4, res[0]//LATENT_RESIZE_FACTOR, res[1]//LATENT_RESIZE_FACTOR]]
46
+ )
47
+ inputs=['latent_sample']
48
+ else:
49
+ print("Unknown component: ", onnx_path)
50
+ exit(1)
51
+
52
+ rknn = RKNN(verbose=True)
53
+
54
+ # pre-process config
55
+ print('--> Config model')
56
+ rknn.config(target_platform='rk3588', optimization_level=3, single_core_mode=True,
57
+ dynamic_input= input_size_list if use_dynamic_shape else None)
58
+ print('done')
59
+
60
+ # Load ONNX model
61
+ print('--> Loading model')
62
+ ret = rknn.load_onnx(model=onnx_path,
63
+ inputs=None if use_dynamic_shape else inputs,
64
+ input_size_list= None if use_dynamic_shape else input_size_list[0])
65
+ if ret != 0:
66
+ print('Load model failed!')
67
+ exit(ret)
68
+ print('done')
69
+
70
+ # Build model
71
+ print('--> Building model')
72
+ ret = rknn.build(do_quantization=False, rknn_batch_size=batch_size)
73
+ if ret != 0:
74
+ print('Build model failed!')
75
+ exit(ret)
76
+ print('done')
77
+
78
+ #export
79
+ print('--> Export RKNN model')
80
+ ret = rknn.export_rknn(onnx_path.replace('.onnx', '.rknn'))
81
+ if ret != 0:
82
+ print('Export RKNN model failed!')
83
+ exit(ret)
84
+ print('done')
85
+
86
+ rknn.release()
87
+ print('RKNN model is converted successfully!')
88
+
89
+
90
+ def parse_resolution_list(resolution: str) -> List[List[int]]:
91
+ resolution_pairs = resolution.split(',')
92
+ parsed_resolutions = []
93
+ for pair in resolution_pairs:
94
+ width, height = map(int, pair.split('x'))
95
+ parsed_resolutions.append([width, height])
96
+
97
+ return parsed_resolutions
98
+
99
+
100
+ if __name__ == '__main__':
101
+ parser = argparse.ArgumentParser(description='Convert Stable Diffusion ONNX models to RKNN models')
102
+ parser.add_argument('-m','--model-dir', type=str, help='Directory containing the Stable Diffusion ONNX models', required=True)
103
+ parser.add_argument('-c','--components', type=str, help='Name of the components to convert, e.g. "text_encoder,unet,vae_decoder"', default='text_encoder, unet, vae_decoder')
104
+ parser.add_argument('-r','--resolutions', type=str, help='Comma-separated list of resolutions for the model, e.g. "256x256,512x512"', default='256x256')
105
+ parser.add_argument('--target_platform', type=str, help='Target platform for the RKNN model, default is "rk3588"', default='rk3588')
106
+ args = parser.parse_args()
107
+
108
+ components = args.components.split(',')
109
+
110
+ for component in components:
111
+ onnx_path = f'{args.model_dir}/{component.strip()}/model.onnx'
112
+ resolution_list = parse_resolution_list(args.resolutions)
113
+ if(len(resolution_list) == 0):
114
+ print("Error: No resolutions specified")
115
+ exit(1)
116
+
117
+ convert_pipeline_component(onnx_path, resolution_list, args.target_platform)
118
+
119
+
120
+
lcm_server.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import json
4
+ import time
5
+ import queue
6
+ import threading
7
+ from dataclasses import dataclass
8
+ from concurrent.futures import Future
9
+ from typing import Optional, List, Tuple
10
+
11
+ import numpy as np
12
+ from fastapi import FastAPI, Response, HTTPException
13
+ from pydantic import BaseModel, Field
14
+
15
+ from diffusers import LCMScheduler
16
+ from transformers import CLIPTokenizer
17
+
18
+ from rknnlcm import RKNN2Model, RKNN2LatentConsistencyPipeline
19
+
20
+ # --- Your imports (as in your script) ---
21
+ # from your_pkg import RKNN2LatentConsistencyPipeline, RKNN2Model
22
+ # NOTE: keep these as-is in your project.
23
+
24
+
25
+ # -----------------------------
26
+ # Request schema (HTTP)
27
+ # -----------------------------
28
class GenerateRequest(BaseModel):
    """Payload for POST /generate."""
    # Text prompt forwarded to the LCM pipeline.
    prompt: str
    # Image size as "<int>x<int>" (enforced by the regex); parsed in
    # PipelineWorker.run_job. NOTE(review): the first number is used as
    # height and the second as width — confirm callers expect that ordering.
    size: str = Field(default="512x512", pattern=r"^\d+x\d+$")
    # LCM needs very few denoising steps; 4 is the repo's documented default.
    num_inference_steps: int = 4
    guidance_scale: float = 1.0
    # Seeds a per-request numpy RandomState, so output is reproducible.
    seed: int = 1234
34
+
35
+
36
@dataclass
class ModelPaths:
    """Resolves the on-disk layout of an exported LCM model directory."""

    root: str  # model root directory (the -i / MODEL_ROOT path)

    def _join(self, subpath: str) -> str:
        """Join a component subpath onto the model root."""
        return os.path.join(self.root, subpath)

    @property
    def scheduler_config(self) -> str:
        return self._join("scheduler/scheduler_config.json")

    @property
    def text_encoder(self) -> str:
        return self._join("text_encoder")

    @property
    def unet(self) -> str:
        return self._join("unet")

    @property
    def vae_decoder(self) -> str:
        return self._join("vae_decoder")
51
+
52
+
53
@dataclass
class Job:
    """One queued generation request plus the Future used to deliver its result."""
    req: GenerateRequest  # validated HTTP request payload
    fut: Future           # resolved with PNG bytes, or an exception, by a worker
    submitted_at: float   # time.time() when the job entered the queue
58
+
59
+
60
+ # -----------------------------
61
+ # Pipeline Worker
62
+ # -----------------------------
63
class PipelineWorker:
    """
    Owns ONE pipeline instance. Run this in a dedicated thread.

    Each worker builds its own text_encoder/unet/vae_decoder RKNN2Model
    instances in _init_pipeline(), so RKNN runtime contexts are not shared
    between threads.
    """
    def __init__(
        self,
        worker_id: int,
        paths: ModelPaths,
        scheduler: LCMScheduler,
        tokenizer: CLIPTokenizer,
        rknn_context_cfg: dict,
    ):
        # Index of this worker within the service (also embedded in
        # rknn_context_cfg by build_rknn_context_cfgs_for_rk3588).
        self.worker_id = worker_id
        self.paths = paths
        # Scheduler/tokenizer are shared across workers (loaded once by
        # PipelineService). Assumed safe to share — TODO confirm thread-safety.
        self.scheduler = scheduler
        self.tokenizer = tokenizer
        self.rknn_context_cfg = rknn_context_cfg

        self.pipe = None  # built in init()
        self._init_pipeline()

    def _init_pipeline(self):
        """Build this worker's private pipeline with its own RKNN runtime contexts."""
        # IMPORTANT: Each worker gets its *own* RKNN runtime context.
        # You must map rknn_context_cfg to however your RKNN2Model supports it.
        #
        # Examples you might support in RKNN2Model:
        #   RKNN2Model(path, core_mask=..., multi_context=True, device_id=..., ...)
        #   RKNN2Model(path, runtime_options={...})
        #
        # Here: we pass **rknn_context_cfg as a flexible hook.
        self.pipe = RKNN2LatentConsistencyPipeline(
            text_encoder=RKNN2Model(self.paths.text_encoder, **self.rknn_context_cfg),
            unet=RKNN2Model(self.paths.unet, **self.rknn_context_cfg),
            vae_decoder=RKNN2Model(self.paths.vae_decoder, **self.rknn_context_cfg),
            scheduler=self.scheduler,
            tokenizer=self.tokenizer,
        )

    def run_job(self, job: Job) -> bytes:
        """Run one generation request synchronously and return encoded PNG bytes."""
        # NOTE(review): the first component of "size" is used as *height* and
        # the second as *width*; a conventional "WxH" input would be transposed
        # for non-square sizes — confirm the intended ordering.
        h, w = (int(x) for x in job.req.size.split("x"))

        # Deterministic per-request random generator
        rng = np.random.RandomState(job.req.seed)

        result = self.pipe(
            prompt=job.req.prompt,
            height=h,
            width=w,
            num_inference_steps=job.req.num_inference_steps,
            guidance_scale=job.req.guidance_scale,
            generator=rng,
        )

        # Pipeline result is dict-like; encode the first returned image as PNG.
        pil_image = result["images"][0]
        buf = io.BytesIO()
        pil_image.save(buf, format="PNG")
        return buf.getvalue()
120
+
121
+
122
+ # -----------------------------
123
+ # Singleton Service
124
+ # -----------------------------
125
class PipelineService:
    """
    Singleton-ish service that:
    - loads scheduler/tokenizer once
    - starts N worker threads
    - provides a queued submit() API
    """
    _instance = None                   # set lazily by get_instance()
    _instance_lock = threading.Lock()  # guards singleton creation

    def __init__(
        self,
        paths: ModelPaths,
        num_workers: int = 3,
        queue_max: int = 64,
        rknn_context_cfgs: Optional[List[dict]] = None,
    ):
        """Load shared assets, build one pipeline per worker, start worker threads.

        Raises:
            ValueError: if rknn_context_cfgs is given but its length does not
                match num_workers.
        """
        self.paths = paths
        self.num_workers = num_workers
        # Bounded job queue; submit() fails fast with "Queue full" on overflow.
        self.q: queue.Queue[Job] = queue.Queue(maxsize=queue_max)

        # Load once (shared immutable objects)
        with open(self.paths.scheduler_config, "r") as f:
            scheduler_config = json.load(f)
        self.scheduler = LCMScheduler.from_config(scheduler_config)
        # NOTE(review): tokenizer comes from the HF hub, not the local model
        # dir — needs network access (or a warm cache) on first run; confirm.
        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

        # Build per-worker RKNN context configs
        # If not provided, create N identical configs with multi_context enabled.
        if rknn_context_cfgs is None:
            rknn_context_cfgs = [{"multi_context": True, "worker_id": i} for i in range(num_workers)]
        if len(rknn_context_cfgs) != num_workers:
            raise ValueError("rknn_context_cfgs must match num_workers length")

        self.workers: List[PipelineWorker] = []
        self.threads: List[threading.Thread] = []
        self._stop = threading.Event()  # signals worker loops to exit

        # Create worker-owned pipelines (done before any thread starts, so
        # no job can run against a half-built worker list)
        for i in range(num_workers):
            worker = PipelineWorker(
                worker_id=i,
                paths=self.paths,
                scheduler=self.scheduler,
                tokenizer=self.tokenizer,
                rknn_context_cfg=rknn_context_cfgs[i],
            )
            self.workers.append(worker)

        # Start threads (daemon: they must not block interpreter exit)
        for i in range(num_workers):
            t = threading.Thread(target=self._worker_loop, args=(i,), daemon=True)
            t.start()
            self.threads.append(t)

    @classmethod
    def get_instance(
        cls,
        paths: ModelPaths,
        num_workers: int = 3,
        queue_max: int = 64,
        rknn_context_cfgs: Optional[List[dict]] = None,
    ) -> "PipelineService":
        """Return the process-wide singleton, creating it on first call.

        NOTE: arguments are only honored on the first call; later calls
        return the existing instance regardless of the values passed.
        """
        with cls._instance_lock:
            if cls._instance is None:
                cls._instance = cls(
                    paths=paths,
                    num_workers=num_workers,
                    queue_max=queue_max,
                    rknn_context_cfgs=rknn_context_cfgs,
                )
            return cls._instance

    def shutdown(self):
        """Signal workers to stop and fail any jobs still waiting in the queue.

        Does not join the worker threads; a job currently being processed is
        allowed to finish (or fail) on its own.
        """
        self._stop.set()
        # Optionally drain queue with errors
        while True:
            try:
                job = self.q.get_nowait()
            except queue.Empty:
                break
            if not job.fut.done():
                job.fut.set_exception(RuntimeError("Service shutting down"))
            self.q.task_done()

    def submit(self, req: GenerateRequest, timeout_s: float = 0.5) -> Future:
        """Enqueue a request; return a Future that resolves to PNG bytes.

        If the queue stays full for timeout_s seconds, the returned Future is
        already failed with RuntimeError("Queue full") — callers must check.
        """
        fut: Future = Future()
        job = Job(req=req, fut=fut, submitted_at=time.time())

        try:
            self.q.put(job, timeout=timeout_s)
        except queue.Full:
            fut.set_exception(RuntimeError("Queue full"))
        return fut

    def _worker_loop(self, worker_idx: int):
        """Thread body: pull jobs off the shared queue and run them on this worker."""
        worker = self.workers[worker_idx]
        while not self._stop.is_set():
            try:
                # Short timeout so the loop re-checks _stop regularly.
                job = self.q.get(timeout=0.1)
            except queue.Empty:
                continue

            # Caller may have cancelled the Future before we picked the job up.
            if job.fut.cancelled():
                self.q.task_done()
                continue

            try:
                png = worker.run_job(job)
                if not job.fut.done():
                    job.fut.set_result(png)
            except Exception as e:
                if not job.fut.done():
                    job.fut.set_exception(e)
            finally:
                self.q.task_done()
241
+
242
+
243
+ # -----------------------------
244
+ # RKNN multi-context configuration
245
+ # -----------------------------
246
def build_rknn_context_cfgs_for_rk3588(num_workers: int) -> List[dict]:
    """
    Build one RKNN runtime-context config dict per worker.

    Plug this into your RKNN2Model wrapper.
    Typical approach on RK3588:
      - bind each worker to a different NPU core (0/1/2)
      - enable multi_context so each model instance has its own runtime context

    You must map these fields inside RKNN2Model.

    BUGFIX: the disabled per-core line was "commented out" with a triple-quoted
    string inside the dict literal. Adjacent string literals concatenate, so the
    dict ended up with a corrupted key (and no real "core_mask" key), which made
    `RKNN2Model(..., **cfg)` raise TypeError (keyword args must be identifiers).
    It is now a real comment.
    """
    core_masks = ["NPU_CORE_0", "NPU_CORE_1", "NPU_CORE_2"]
    cfgs = []
    for i in range(num_workers):
        cfgs.append({
            "multi_context": True,
            # Round-robin core pinning, if preferred over automatic placement:
            # "core_mask": core_masks[i % len(core_masks)],
            "core_mask": "NPU_CORE_AUTO",
            "context_name": f"w{i}",
            "worker_id": i,
        })
    return cfgs
266
+
267
+
268
+ # -----------------------------
269
+ # FastAPI server
270
+ # -----------------------------
271
app = FastAPI()

# Configure these for your deployment (all overridable via environment).
# NOTE(review): the Dockerfile in this repo sets NUM_WORKERS=1, QUEUE_MAX=8
# and MODEL_ROOT=/models, which override the fallbacks below — confirm which
# defaults are intended for production.
MODEL_ROOT = os.environ.get("MODEL_ROOT", "/models/lcm_rknn")
NUM_WORKERS = int(os.environ.get("NUM_WORKERS", "3"))
QUEUE_MAX = int(os.environ.get("QUEUE_MAX", "64"))

paths = ModelPaths(root=MODEL_ROOT)

# Create singleton service at import time (fastest first request).
# This means `uvicorn lcm_server:app` blocks on startup until every worker's
# RKNN models are fully initialized.
service = PipelineService.get_instance(
    paths=paths,
    num_workers=NUM_WORKERS,
    queue_max=QUEUE_MAX,
    rknn_context_cfgs=build_rknn_context_cfgs_for_rk3588(NUM_WORKERS),
)
287
+
288
+
289
@app.post("/generate", responses={200: {"content": {"image/png": {}}}})
def generate(req: GenerateRequest):
    """Generate a PNG image for the given prompt.

    Returns:
        200 with image/png bytes on success;
        429 when the job queue is full;
        504 when generation exceeds the deadline;
        500 on any other pipeline failure.
    """
    # Local import keeps this endpoint self-contained; on 3.11+ this is an
    # alias of the builtin TimeoutError.
    from concurrent.futures import TimeoutError as FutureTimeoutError

    fut = service.submit(req, timeout_s=0.25)

    try:
        png_bytes = fut.result(timeout=120)  # generation deadline; tune per hardware
    except FutureTimeoutError:
        # BUGFIX: previously fell into the generic 500 branch with an empty
        # message. Best-effort cancel so an unstarted job is dropped by the
        # worker loop (it checks fut.cancelled()).
        fut.cancel()
        raise HTTPException(status_code=504, detail="Generation timed out. Try again.")
    except Exception as e:
        msg = str(e)
        if "Queue full" in msg:
            raise HTTPException(status_code=429, detail="Too many requests (queue full). Try again.")
        raise HTTPException(status_code=500, detail=f"Generation failed: {msg}")

    return Response(
        content=png_bytes,
        media_type="image/png",
        headers={
            # Generated per-request; never cache.
            "Cache-Control": "no-store",
        },
    )
308
+
309
if __name__ == "__main__":
    # Dev entry point; production uses the Dockerfile CMD
    # (`uvicorn lcm_server:app --host 0.0.0.0 --port 4200 --no-access-log`).
    import uvicorn
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.environ.get("PORT", "4200")),
        log_config=None,  # <-- key: disables uvicorn's default logging config
    )
model/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.onnx_data filter=lfs diff=lfs merge=lfs -text
model/Assets/Icon.png ADDED

Git LFS Details

  • SHA256: d3074fce55454e2bbbd65ecf5bafafefa31397f2e22dc4eb1139c6cbd614955e
  • Pointer size: 131 Bytes
  • Size of remote file: 233 kB
model/Assets/LCM-Dreamshaper-V7-ONNX.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Name": "Dreamshaper v7(LCM)",
3
+ "Description": "DreamShaper started as a model to have an alternative to MidJourney in the open source world.",
4
+ "Author": "TheyCallMeHex",
5
+ "Repository": "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX",
6
+ "ImageIcon": "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Icon.png",
7
+ "Status": "Active",
8
+ "PadTokenId": 49407,
9
+ "BlankTokenId": 49407,
10
+ "TokenizerLimit": 77,
11
+ "EmbeddingsLength": 768,
12
+ "ScaleFactor": 0.18215,
13
+ "PipelineType": "LatentConsistency",
14
+ "Diffusers": [
15
+ "TextToImage",
16
+ "ImageToImage",
17
+ "ImageInpaintLegacy"
18
+ ],
19
+ "ModelFiles": [
20
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/tokenizer/model.onnx",
21
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/unet/model.onnx",
22
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/unet/model.onnx_data",
23
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/text_encoder/model.onnx",
24
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/vae_decoder/model.onnx",
25
+ "https://huggingface.co/TheyCallMeHex/LCM-Dreamshaper-V7-ONNX/resolve/main/vae_encoder/model.onnx"
26
+ ],
27
+ "Images": [
28
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview1.png",
29
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview2.png",
30
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview3.png",
31
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview4.png",
32
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview5.png",
33
+ "https://raw.githubusercontent.com/saddam213/OnnxStack/master/Assets/Templates/LCM-Dreamshaper-V7/Preview6.png"
34
+ ]
35
+ }
model/Assets/OnnxStack - 640x320.png ADDED
model/Assets/Preview1.png ADDED

Git LFS Details

  • SHA256: 43bf8ceccf682da0c3f60f21ef077b4a0602ba3aaaa1479ce81c80566697e0cc
  • Pointer size: 131 Bytes
  • Size of remote file: 447 kB
model/Assets/Preview2.png ADDED

Git LFS Details

  • SHA256: d5b535b6f0b99aa90da0e6fcc5175443a12ae4a4a4d2d8d315583a78ad727476
  • Pointer size: 131 Bytes
  • Size of remote file: 552 kB
model/Assets/Preview3.png ADDED

Git LFS Details

  • SHA256: c613380065f8d084d9fa4f7d575e77acec2fac1df5af69ebcd44d242dc2e55e1
  • Pointer size: 131 Bytes
  • Size of remote file: 478 kB
model/Assets/Preview4.png ADDED

Git LFS Details

  • SHA256: 08516dcfade0743d2d48cd999ec63640ebed690234e291f00d807cd14d077a57
  • Pointer size: 131 Bytes
  • Size of remote file: 454 kB
model/Assets/Preview5.png ADDED

Git LFS Details

  • SHA256: eb4cc72a470871800e6c62e23ad482684e5e7da8ba5c08f94f7f3e3c815e318e
  • Pointer size: 131 Bytes
  • Size of remote file: 473 kB
model/Assets/Preview6.png ADDED

Git LFS Details

  • SHA256: cb061284279fbac14d7a981d8220947dfc24724d1c7fa2cce193b41a2049b905
  • Pointer size: 131 Bytes
  • Size of remote file: 416 kB
model/Assets/lcm_angel_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 2e12085a17ab0b75f952b6c02e269c93b3e4821b1dd3188c19a6174adea87a1a
  • Pointer size: 131 Bytes
  • Size of remote file: 399 kB
model/Assets/lcm_car_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 578c327c2753351ebd4b53beceda66b84376701207896b930035d21874cb2d57
  • Pointer size: 131 Bytes
  • Size of remote file: 430 kB
model/Assets/lcm_demon_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 808dc78b49f2e7b5af169b97b733ef5507d2fa7e721716b950f7d612dc8ea739
  • Pointer size: 131 Bytes
  • Size of remote file: 424 kB
model/Assets/lcm_ninja_30_7.5_2092464983.png ADDED

Git LFS Details

  • SHA256: 17cfea8ce05edc99bbc1b572b2c6e8089f2a8a3249eceb1c32520e4b0e9a5293
  • Pointer size: 131 Bytes
  • Size of remote file: 360 kB
model/README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ language:
4
+ - en
5
+ license: mit
6
+ tags:
7
+ - stable-diffusion
8
+ - stable-diffusion-diffusers
9
+ - text-to-image
10
+ - diffusers
11
+ inference: true
12
+ ---
13
+
14
+ <p align="center" width="100%">
15
+ <img width="80%" src="Assets/OnnxStack - 640x320.png">
16
+ </p>
17
+
18
+ ### OnnxStack
19
+ This model has been converted to ONNX and tested with OnnxStack
20
+
21
+ - [OnnxStack](https://github.com/saddam213/OnnxStack)
22
+
23
+ ### LCM Dreamshaper V7 Diffusion
24
+ This model was converted to ONNX from LCM Dreamshaper V7
25
+
26
+ - [LCM-Dreamshaper-V7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
27
+
28
+ ### Sample Images
29
+ *A demon*
30
+
31
+ <img src="Assets/lcm_demon_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
32
+
33
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
34
+
35
+ __________________________
36
+ *An angel*
37
+
38
+ <img src="Assets/lcm_angel_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
39
+
40
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
41
+
42
+ __________________________
43
+ *A ninja*
44
+
45
+ <img src="Assets/lcm_ninja_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
46
+
47
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
48
+
49
+ __________________________
50
+ *a japanese domestic market sports car sitting in a showroom*
51
+
52
+ <img src="Assets/lcm_car_30_7.5_2092464983.png" width="256" alt="Image of browser inferencing on sample images."/>
53
+
54
+ Seed: 207582124 GuidanceScale: 7.5 NumInferenceSteps: 30
55
+
56
+ __________________________
model/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ }
28
+ }
model/model_index.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableDiffusionPipeline",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7",
5
+ "feature_extractor": [
6
+ "transformers",
7
+ "CLIPImageProcessor"
8
+ ],
9
+ "requires_safety_checker": true,
10
+ "safety_checker": [
11
+ "stable_diffusion",
12
+ "StableDiffusionSafetyChecker"
13
+ ],
14
+ "scheduler": [
15
+ "diffusers",
16
+ "LCMScheduler"
17
+ ],
18
+ "text_encoder": [
19
+ "transformers",
20
+ "CLIPTextModel"
21
+ ],
22
+ "tokenizer": [
23
+ "transformers",
24
+ "CLIPTokenizer"
25
+ ],
26
+ "unet": [
27
+ "diffusers",
28
+ "UNet2DConditionModel"
29
+ ],
30
+ "vae": [
31
+ "diffusers",
32
+ "AutoencoderKL"
33
+ ]
34
+ }
model/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LCMScheduler",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "original_inference_steps": 50,
12
+ "prediction_type": "epsilon",
13
+ "rescale_betas_zero_snr": false,
14
+ "sample_max_value": 1.0,
15
+ "set_alpha_to_one": true,
16
+ "steps_offset": 1,
17
+ "thresholding": false,
18
+ "timestep_spacing": "leading",
19
+ "trained_betas": null
20
+ }
model/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LCM_Dreamshaper_v7\\text_encoder",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "quick_gelu",
11
+ "hidden_size": 768,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 768,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.34.1",
24
+ "vocab_size": 49408
25
+ }
model/text_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefe95eab6542e5fb7642b3f592489176836cc3fd49196b924a63760602c8c4a
3
+ size 492588002
model/text_encoder/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc35bda83249c243260c076efe73701c7aa278e6d693c67c5ec12c3019a0bd0
3
+ size 249820005
model/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52af50d264d702c351484aabf62c64abe61f59d6a6d2c508a3e797e23dc1e008
3
+ size 1683168
model/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
model/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "49406": {
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49407": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "additional_special_tokens": [],
22
+ "bos_token": "<|startoftext|>",
23
+ "clean_up_tokenization_spaces": true,
24
+ "do_lower_case": true,
25
+ "eos_token": "<|endoftext|>",
26
+ "errors": "replace",
27
+ "model_max_length": 77,
28
+ "pad_token": "<|endoftext|>",
29
+ "tokenizer_class": "CLIPTokenizer",
30
+ "unk_token": "<|endoftext|>"
31
+ }
model/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model/unet/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\unet",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": 8,
10
+ "attention_type": "default",
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "class_embed_type": null,
19
+ "class_embeddings_concat": false,
20
+ "conv_in_kernel": 3,
21
+ "conv_out_kernel": 3,
22
+ "cross_attention_dim": 768,
23
+ "cross_attention_norm": null,
24
+ "down_block_types": [
25
+ "CrossAttnDownBlock2D",
26
+ "CrossAttnDownBlock2D",
27
+ "CrossAttnDownBlock2D",
28
+ "DownBlock2D"
29
+ ],
30
+ "downsample_padding": 1,
31
+ "dropout": 0.0,
32
+ "dual_cross_attention": false,
33
+ "encoder_hid_dim": null,
34
+ "encoder_hid_dim_type": null,
35
+ "flip_sin_to_cos": true,
36
+ "freq_shift": 0,
37
+ "in_channels": 4,
38
+ "layers_per_block": 2,
39
+ "mid_block_only_cross_attention": null,
40
+ "mid_block_scale_factor": 1,
41
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
42
+ "norm_eps": 1e-05,
43
+ "norm_num_groups": 32,
44
+ "num_attention_heads": null,
45
+ "num_class_embeds": null,
46
+ "only_cross_attention": false,
47
+ "out_channels": 4,
48
+ "projection_class_embeddings_input_dim": null,
49
+ "resnet_out_scale_factor": 1.0,
50
+ "resnet_skip_time_act": false,
51
+ "resnet_time_scale_shift": "default",
52
+ "reverse_transformer_layers_per_block": null,
53
+ "sample_size": 96,
54
+ "time_cond_proj_dim": 256,
55
+ "time_embedding_act_fn": null,
56
+ "time_embedding_dim": null,
57
+ "time_embedding_type": "positional",
58
+ "timestep_post_act": null,
59
+ "transformer_layers_per_block": 1,
60
+ "up_block_types": [
61
+ "UpBlock2D",
62
+ "CrossAttnUpBlock2D",
63
+ "CrossAttnUpBlock2D",
64
+ "CrossAttnUpBlock2D"
65
+ ],
66
+ "upcast_attention": null,
67
+ "use_linear_projection": false
68
+ }
model/unet/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e9a08a3e5b943046bf90a513c492cf4c6e31e26229062af8eb4ad2ddf172b5
3
+ size 1948508
model/unet/model.onnx_data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef99ccc336de0e79f247fcb3d1398b3f3d1a02796916b88a351d7a83f570a31a
3
+ size 3438411520
model/unet/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f0272a526f993b04b888eaa0b8bd76875e6ccca302de0f8ae9587bd48de18a
3
+ size 1809543921
model/vae_decoder/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\vae",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
model/vae_decoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec5298d7bfa592d492d36b42d17f794fcdb9175e2aac366956d40f3f38d13ca1
3
+ size 198078038
model/vae_decoder/model.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d8f23bef086b9aac6393d2b56da5582b0ab4307ea6e1376c7aa7052288d6cd
3
+ size 295036220
model/vae_encoder/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.22.0.dev0",
4
+ "_name_or_path": "LCM_Dreamshaper_v7\\vae",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
model/vae_encoder/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:268d4398021d7bc91e91c94e4835cc5ffa471155db1b722d0a43f6d1a4f822fd
3
+ size 136760154
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.30.6
3
+ pydantic==2.8.2
4
+ numpy
5
+ pillow
6
+ transformers
7
+ diffusers
8
+ rknn-toolkit-lite2==2.3.2
rknnlcm.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import time
4
+
5
+ import PIL
6
+ from diffusers import StableDiffusionPipeline
7
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
8
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
9
+ from diffusers.schedulers import (
10
+ LCMScheduler
11
+ )
12
+
13
+ import logging
14
+
15
+ logging.basicConfig()
16
+ logger = logging.getLogger(__name__)
17
+ logger.setLevel(logging.INFO)
18
+
19
+ import numpy as np
20
+ import os
21
+
22
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
23
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
24
+ from typing import Callable, List, Optional, Union, Tuple
25
+ from PIL import Image
26
+
27
+ from rknnlite.api import RKNNLite
28
+
29
+ import os
30
+ import json
31
+ import time
32
+ from typing import List, Any, Optional, Union
33
+
34
+ import numpy as np
35
+ from rknnlite.api import RKNNLite
36
+
37
class RKNN2Model:
    """Wrapper for running RKNPU2 (RKNNLite) models.

    Loads ``model.rknn`` plus ``config.json`` from *model_dir* and exposes the
    network as a callable that runs inference on keyword numpy inputs.
    """

    def __init__(
        self,
        model_dir: str,
        *,
        core_mask: Optional[Union[str, int]] = None,
        multi_context: bool = True,
        data_format: str = "nchw",
        verbose_shapes: bool = False,
        runtime_kwargs: Optional[dict] = None,
        **_ignored: Any,
    ):
        """
        Params are designed to match the pipeline service pattern:
            RKNN2Model(path, **rknn_context_cfg)

        Args:
            core_mask: can be None (defaults), a string
                ("NPU_CORE_0"/"NPU_CORE_1"/"NPU_CORE_2"/"NPU_CORE_AUTO"),
                or an RKNNLite constant/int passed through directly.
                NOTE: multi-core has been observed to crash the kernel, so the
                default stays AUTO.
            multi_context: kept for compatibility with pool patterns; this
                class is already per-instance.
            data_format: layout passed to inference (default "nchw").
            verbose_shapes: log input/output shapes (disable for server use).
            runtime_kwargs: optional extra kwargs forwarded to init_runtime(...).
            **_ignored: allows passing context_name/worker_id etc. without
                breaking the constructor.

        Raises:
            FileNotFoundError: if the model directory or model.rknn is missing.
            RuntimeError: if the RKNN runtime fails to load or initialize.
        """
        self.model_dir = model_dir
        self.data_format = data_format
        self.verbose_shapes = verbose_shapes
        self.multi_context = multi_context
        self.runtime_kwargs = runtime_kwargs or {}

        logger.info("Loading %s", model_dir)
        start = time.time()

        cfg_path = os.path.join(model_dir, "config.json")
        rknn_path = os.path.join(model_dir, "model.rknn")

        if not (os.path.exists(model_dir) and os.path.exists(rknn_path)):
            raise FileNotFoundError(f"Missing model dir or model.rknn: {model_dir}")

        # Use a context manager so the config file handle is closed
        # deterministically (the previous code leaked it).
        with open(cfg_path, "r") as cfg_file:
            self.config = json.load(cfg_file)

        self.rknnlite = RKNNLite()
        # load_rknn/init_runtime signal failure via a non-zero return code
        # (they do not raise) — fail loudly instead of continuing with a
        # half-initialized runtime.
        ret = self.rknnlite.load_rknn(rknn_path)
        if ret != 0:
            raise RuntimeError(f"load_rknn failed for {rknn_path} (ret={ret})")

        # Resolve core mask.
        # IMPORTANT: AUTO by default because multi-core can crash the kernel.
        # If stability is later confirmed, pass core_mask="NPU_CORE_0"/"NPU_CORE_1"/
        # "NPU_CORE_2" per worker.
        resolved_core_mask = self._resolve_core_mask(core_mask)
        ret = self.rknnlite.init_runtime(core_mask=resolved_core_mask, **self.runtime_kwargs)
        if ret != 0:
            raise RuntimeError(f"init_runtime failed for {rknn_path} (ret={ret})")

        load_time = time.time() - start
        logger.info("Done loading %s. Took %.1f seconds.", model_dir, load_time)

        self.modelname = os.path.basename(model_dir.rstrip("/"))
        # Accumulated inference time; kept for backward compatibility.
        self.inference_time = 0

    def _resolve_core_mask(self, core_mask: Optional[Union[str, int]]) -> int:
        """Map a user-supplied core mask (None / int / name) to an RKNNLite constant."""
        if core_mask is None:
            return RKNNLite.NPU_CORE_AUTO

        # Allow passing an RKNNLite constant directly
        if isinstance(core_mask, int):
            return core_mask

        # Allow passing names
        if isinstance(core_mask, str):
            key = core_mask.strip().upper()
            mapping = {
                "NPU_CORE_AUTO": RKNNLite.NPU_CORE_AUTO,
                "NPU_CORE_0": RKNNLite.NPU_CORE_0,
                "NPU_CORE_1": RKNNLite.NPU_CORE_1,
                "NPU_CORE_2": RKNNLite.NPU_CORE_2,
                # Common shorthand spellings:
                "AUTO": RKNNLite.NPU_CORE_AUTO,
                "0": RKNNLite.NPU_CORE_0,
                "1": RKNNLite.NPU_CORE_1,
                "2": RKNNLite.NPU_CORE_2,
            }
            if key not in mapping:
                raise ValueError(f"Unknown core_mask string: {core_mask!r}")
            return mapping[key]

        raise TypeError(f"core_mask must be None, int, or str; got {type(core_mask)}")

    def __call__(self, **kwargs) -> List[np.ndarray]:
        """Run inference; the kwargs insertion order defines the input tensor order."""
        # Preserve order of kwargs values as given by caller
        input_list = list(kwargs.values())

        if self.verbose_shapes:
            for i, arr in enumerate(input_list):
                if isinstance(arr, np.ndarray):
                    logger.info("[%s] input[%d] shape=%s dtype=%s", self.modelname, i, arr.shape, arr.dtype)

        results = self.rknnlite.inference(inputs=input_list, data_format=self.data_format)
        # RKNNLite reports inference failure by returning None rather than
        # raising — surface it as an error instead of handing None to callers.
        if results is None:
            raise RuntimeError(f"RKNN inference failed for {self.modelname}")

        if self.verbose_shapes:
            for j, res in enumerate(results):
                if isinstance(res, np.ndarray):
                    logger.info("[%s] output[%d] shape=%s dtype=%s", self.modelname, j, res.shape, res.dtype)

        return results
142
+
143
+ class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
144
+
145
+ def __init__(
146
+ self,
147
+ text_encoder: RKNN2Model,
148
+ unet: RKNN2Model,
149
+ vae_decoder: RKNN2Model,
150
+ scheduler: LCMScheduler,
151
+ tokenizer: CLIPTokenizer,
152
+ force_zeros_for_empty_prompt: Optional[bool] = True,
153
+ feature_extractor: Optional[CLIPFeatureExtractor] = None,
154
+ text_encoder_2: Optional[RKNN2Model] = None,
155
+ tokenizer_2: Optional[CLIPTokenizer] = None
156
+ ):
157
+ super().__init__()
158
+
159
+ self.register_modules(
160
+ tokenizer=tokenizer,
161
+ scheduler=scheduler,
162
+ feature_extractor=feature_extractor,
163
+ )
164
+ self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
165
+ self.safety_checker = None
166
+
167
+ self.text_encoder = text_encoder
168
+ self.text_encoder_2 = text_encoder_2
169
+ self.tokenizer_2 = tokenizer_2
170
+ self.unet = unet
171
+ self.vae_decoder = vae_decoder
172
+
173
+ VAE_DECODER_UPSAMPLE_FACTOR = 8
174
+ self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
175
+
176
+ @staticmethod
177
+ def postprocess(
178
+ image: np.ndarray,
179
+ output_type: str = "pil",
180
+ do_denormalize: Optional[List[bool]] = None,
181
+ ):
182
+ def numpy_to_pil(images: np.ndarray):
183
+ """
184
+ Convert a numpy image or a batch of images to a PIL image.
185
+ """
186
+ if images.ndim == 3:
187
+ images = images[None, ...]
188
+ images = (images * 255).round().astype("uint8")
189
+ if images.shape[-1] == 1:
190
+ # special case for grayscale (single channel) images
191
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
192
+ else:
193
+ pil_images = [Image.fromarray(image) for image in images]
194
+
195
+ return pil_images
196
+
197
+ def denormalize(images: np.ndarray):
198
+ """
199
+ Denormalize an image array to [0,1].
200
+ """
201
+ return np.clip(images / 2 + 0.5, 0, 1)
202
+
203
+ if not isinstance(image, np.ndarray):
204
+ raise ValueError(
205
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
206
+ )
207
+ if output_type not in ["latent", "np", "pil"]:
208
+ deprecation_message = (
209
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
210
+ "`pil`, `np`, `pt`, `latent`"
211
+ )
212
+ logger.warning(deprecation_message)
213
+ output_type = "np"
214
+
215
+ if output_type == "latent":
216
+ return image
217
+
218
+ if do_denormalize is None:
219
+ raise ValueError("do_denormalize is required for postprocessing")
220
+
221
+ image = np.stack(
222
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
223
+ )
224
+ image = image.transpose((0, 2, 3, 1))
225
+
226
+ if output_type == "pil":
227
+ image = numpy_to_pil(image)
228
+
229
+ return image
230
+
231
    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int,
        do_classifier_free_guidance: bool,
        negative_prompt: Optional[Union[str, list]],
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`Union[str, List[str]]`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`Optional[Union[str, list]]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # NOTE(review): this branch assumes `prompt_embeds` was supplied
            # whenever `prompt` is neither str nor list — check_inputs enforces
            # that one of the two is present.
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            text_input_ids = text_inputs.input_ids
            # Re-tokenize without truncation to detect (and warn about) any
            # part of the prompt that was cut off at model_max_length.
            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

            if not np.array_equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            # The RKNN text encoder expects int32 token ids.
            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

        # Duplicate embeddings for each requested image per prompt.
        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the unconditional input to the same sequence length as the
            # conditional embeddings so both batches line up.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )
            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

        if do_classifier_free_guidance:
            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds
333
+
334
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
335
+ def check_inputs(
336
+ self,
337
+ prompt: Union[str, List[str]],
338
+ height: Optional[int],
339
+ width: Optional[int],
340
+ callback_steps: int,
341
+ negative_prompt: Optional[str] = None,
342
+ prompt_embeds: Optional[np.ndarray] = None,
343
+ negative_prompt_embeds: Optional[np.ndarray] = None,
344
+ ):
345
+ if height % 8 != 0 or width % 8 != 0:
346
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
347
+
348
+ if (callback_steps is None) or (
349
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
350
+ ):
351
+ raise ValueError(
352
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
353
+ f" {type(callback_steps)}."
354
+ )
355
+
356
+ if prompt is not None and prompt_embeds is not None:
357
+ raise ValueError(
358
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
359
+ " only forward one of the two."
360
+ )
361
+ elif prompt is None and prompt_embeds is None:
362
+ raise ValueError(
363
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
364
+ )
365
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
366
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
367
+
368
+ if negative_prompt is not None and negative_prompt_embeds is not None:
369
+ raise ValueError(
370
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
371
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
372
+ )
373
+
374
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
375
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
376
+ raise ValueError(
377
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
378
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
379
+ f" {negative_prompt_embeds.shape}."
380
+ )
381
+
382
+ # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
383
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
384
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
385
+ if isinstance(generator, list) and len(generator) != batch_size:
386
+ raise ValueError(
387
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
388
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
389
+ )
390
+
391
+ if latents is None:
392
+ if isinstance(generator, np.random.RandomState):
393
+ latents = generator.randn(*shape).astype(dtype)
394
+ elif isinstance(generator, torch.Generator):
395
+ latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
396
+ else:
397
+ raise ValueError(
398
+ f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
399
+ f" {type(generator)}."
400
+ )
401
+ elif latents.shape != shape:
402
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
403
+
404
+ # scale the initial noise by the standard deviation required by the scheduler
405
+ latents = latents * np.float64(self.scheduler.init_noise_sigma)
406
+
407
+ return latents
408
+
409
+ # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
410
+ def __call__(
411
+ self,
412
+ prompt: Union[str, List[str]] = "",
413
+ height: Optional[int] = None,
414
+ width: Optional[int] = None,
415
+ num_inference_steps: int = 4,
416
+ original_inference_steps: int = None,
417
+ guidance_scale: float = 8.5,
418
+ num_images_per_prompt: int = 1,
419
+ generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
420
+ latents: Optional[np.ndarray] = None,
421
+ prompt_embeds: Optional[np.ndarray] = None,
422
+ output_type: str = "pil",
423
+ return_dict: bool = True,
424
+ callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
425
+ callback_steps: int = 1,
426
+ ):
427
+ r"""
428
+ Function invoked when calling the pipeline for generation.
429
+
430
+ Args:
431
+ prompt (`Optional[Union[str, List[str]]]`, defaults to None):
432
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
433
+ instead.
434
+ height (`Optional[int]`, defaults to None):
435
+ The height in pixels of the generated image.
436
+ width (`Optional[int]`, defaults to None):
437
+ The width in pixels of the generated image.
438
+ num_inference_steps (`int`, defaults to 50):
439
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
440
+ expense of slower inference.
441
+ guidance_scale (`float`, defaults to 7.5):
442
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
443
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
444
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
445
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
446
+ usually at the expense of lower image quality.
447
+ num_images_per_prompt (`int`, defaults to 1):
448
+ The number of images to generate per prompt.
449
+ generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
450
+ A np.random.RandomState to make generation deterministic.
451
+ latents (`Optional[np.ndarray]`, defaults to `None`):
452
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
453
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
454
+ tensor will ge generated by sampling using the supplied random `generator`.
455
+ prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
456
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
457
+ provided, text embeddings will be generated from `prompt` input argument.
458
+ output_type (`str`, defaults to `"pil"`):
459
+ The output format of the generate image. Choose between
460
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
461
+ return_dict (`bool`, defaults to `True`):
462
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
463
+ plain tuple.
464
+ callback (Optional[Callable], defaults to `None`):
465
+ A function that will be called every `callback_steps` steps during inference. The function will be
466
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
467
+ callback_steps (`int`, defaults to 1):
468
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
469
+ called at every step.
470
+ guidance_rescale (`float`, defaults to 0.0):
471
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
472
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
473
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
474
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
475
+
476
+ Returns:
477
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
478
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
479
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
480
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
481
+ (nsfw) content, according to the `safety_checker`.
482
+ """
483
+ height = height or self.unet.config["sample_size"] * self.vae_scale_factor
484
+ width = width or self.unet.config["sample_size"] * self.vae_scale_factor
485
+
486
+ # Don't need to get negative prompts due to LCM guided distillation
487
+ negative_prompt = None
488
+ negative_prompt_embeds = None
489
+
490
+ # check inputs. Raise error if not correct
491
+ self.check_inputs(
492
+ prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
493
+ )
494
+
495
+ # define call parameters
496
+ if isinstance(prompt, str):
497
+ batch_size = 1
498
+ elif isinstance(prompt, list):
499
+ batch_size = len(prompt)
500
+ else:
501
+ batch_size = prompt_embeds.shape[0]
502
+
503
+ if generator is None:
504
+ generator = np.random.RandomState()
505
+
506
+ start_time = time.time()
507
+ prompt_embeds = self._encode_prompt(
508
+ prompt,
509
+ num_images_per_prompt,
510
+ False,
511
+ negative_prompt,
512
+ prompt_embeds=prompt_embeds,
513
+ negative_prompt_embeds=negative_prompt_embeds,
514
+ )
515
+ encode_prompt_time = time.time() - start_time
516
+ print(f"Prompt encoding time: {encode_prompt_time:.2f}s")
517
+
518
+ # set timesteps
519
+ self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
520
+ timesteps = self.scheduler.timesteps
521
+
522
+ latents = self.prepare_latents(
523
+ batch_size * num_images_per_prompt,
524
+ self.unet.config["in_channels"],
525
+ height,
526
+ width,
527
+ prompt_embeds.dtype,
528
+ generator,
529
+ latents,
530
+ )
531
+
532
+ bs = batch_size * num_images_per_prompt
533
+ # get Guidance Scale Embedding
534
+ w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
535
+ w_embedding = self.get_guidance_scale_embedding(
536
+ w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
537
+ )
538
+
539
+ # Adapted from diffusers to extend it for other runtimes than ORT
540
+ timestep_dtype = np.int64
541
+
542
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
543
+ inference_start = time.time()
544
+ for i, t in enumerate(self.progress_bar(timesteps)):
545
+ timestep = np.array([t], dtype=timestep_dtype)
546
+ noise_pred = self.unet(
547
+ sample=latents,
548
+ timestep=timestep,
549
+ encoder_hidden_states=prompt_embeds,
550
+ timestep_cond=w_embedding,
551
+ )[0]
552
+
553
+ # compute the previous noisy sample x_t -> x_t-1
554
+ latents, denoised = self.scheduler.step(
555
+ torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
556
+ )
557
+ latents, denoised = latents.numpy(), denoised.numpy()
558
+
559
+ # call the callback, if provided
560
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
561
+ if callback is not None and i % callback_steps == 0:
562
+ callback(i, t, latents)
563
+ inference_time = time.time() - inference_start
564
+ print(f"Inference time: {inference_time:.2f}s")
565
+
566
+ decode_start = time.time()
567
+ if output_type == "latent":
568
+ image = denoised
569
+ has_nsfw_concept = None
570
+ else:
571
+ denoised /= self.vae_decoder.config["scaling_factor"]
572
+ # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
573
+ image = np.concatenate(
574
+ [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
575
+ )
576
+ # image, has_nsfw_concept = self.run_safety_checker(image)
577
+ has_nsfw_concept = None # skip safety checker
578
+
579
+ if has_nsfw_concept is None:
580
+ do_denormalize = [True] * image.shape[0]
581
+ else:
582
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
583
+
584
+ image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
585
+ decode_time = time.time() - decode_start
586
+ print(f"Decode time: {decode_time:.2f}s")
587
+
588
+ total_time = encode_prompt_time + inference_time + decode_time
589
+ print(f"Total time: {total_time:.2f}s")
590
+
591
+ if not return_dict:
592
+ return (image, has_nsfw_concept)
593
+
594
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
595
+
596
+
597
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
    """Build sinusoidal embeddings for the LCM guidance-scale conditioning.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`np.ndarray`): guidance-scale values, one per batch element.
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate.
        dtype: data type of the generated embeddings.

    Returns:
        `np.ndarray` of shape `(len(w), embedding_dim)`.
    """
    scaled_w = w * 1000
    half_dim = embedding_dim // 2
    # Geometric frequency ladder, identical to the VDM reference implementation.
    log_scale = np.log(10000.0) / (half_dim - 1)
    frequencies = np.exp(np.arange(half_dim, dtype=dtype) * -log_scale)
    angles = scaled_w[:, None] * frequencies[None, :]
    embedding = np.concatenate([np.sin(angles), np.cos(angles)], axis=1)

    if embedding_dim % 2 == 1:
        # Odd target dimension: zero-pad the final column.
        embedding = np.pad(embedding, [(0, 0), (0, 1)])

    assert embedding.shape == (w.shape[0], embedding_dim)
    return embedding
625
+
626
def get_image_path(args, **override_kwargs):
    """Create the output folder and return a PNG path encoding run metadata.

    Args:
        args: CLI namespace; reads `args.o` (output root), `args.prompt`,
            `args.seed` and `args.num_inference_steps`.
        **override_kwargs: optional `seed` / `num_inference_steps` overrides.

    Returns:
        Path of the output PNG inside a folder named after the prompt.
    """
    prompt_folder = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_folder)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get("seed", None) or args.seed
    steps = override_kwargs.get("num_inference_steps", None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}"

    return os.path.join(out_folder, out_fname + ".png")
638
+
639
+
640
def prepare_controlnet_cond(image_path, height, width):
    """Load an RGB image and return a CHW float array scaled to [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the original passed
    # (height, width), which swapped the output axes for non-square targets.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
645
+
646
#args.prompt seed=4234924 i=model_path o=output_path size=256x256 num_inference_steps guidance_scale
def generate_png_bytes(args):
    """Run the RKNN latent-consistency pipeline once and return the image as PNG bytes.

    Expects `args` to provide: `prompt`, `seed`, `i` (model directory),
    `size` (a "HxW" string), `num_inference_steps` and `guidance_scale`.
    """
    logger.info(f"Setting random seed to {args.seed}")

    # The scheduler is rebuilt from the JSON config shipped alongside the model.
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        scheduler_config = json.load(f)

    user_specified_scheduler = LCMScheduler.from_config(scheduler_config)

    # NOTE(review): the pipeline (and every RKNN2Model) is re-created on each
    # call; if this runs per-request, model loading dominates latency — confirm
    # whether the caller caches the pipeline.
    pipe = RKNN2LatentConsistencyPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")

    # `size` is parsed as "<height>x<width>".
    result = pipe(
        prompt=args.prompt,
        height=int(args.size.split("x")[0]),
        width=int(args.size.split("x")[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    # The pipeline returns a StableDiffusionPipelineOutput; take the first image.
    pil_image = result["images"][0]

    # Convert to PNG bytes
    buf = io.BytesIO()
    pil_image.save(buf, format="PNG")
    buf.seek(0)

    return buf.getvalue()
run_onnx-lcm.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ import PIL
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
9
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
10
+ from diffusers.schedulers import (
11
+ LCMScheduler
12
+ )
13
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
14
+
15
+ import gc
16
+ import inspect
17
+
18
+ import logging
19
+
20
+ logging.basicConfig()
21
+ logger = logging.getLogger(__name__)
22
+ logger.setLevel(logging.INFO)
23
+
24
+ import numpy as np
25
+ import os
26
+
27
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
28
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
29
+ from typing import Callable, List, Optional, Union, Tuple
30
+ from PIL import Image
31
+
32
+ # from rknnlite.api import RKNNLite
33
+
34
+ # class RKNN2Model:
35
+ # """ Wrapper for running RKNPU2 models """
36
+
37
+ # def __init__(self, model_path):
38
+
39
+ # logger.info(f"Loading {model_path}")
40
+
41
+ # start = time.time()
42
+ # assert os.path.exists(model_path) and model_path.endswith(".rknn")
43
+ # self.rknnlite = RKNNLite()
44
+ # self.rknnlite.load_rknn(model_path)
45
+ # self.rknnlite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO) # Multi-core will cause kernel crash
46
+ # load_time = time.time() - start
47
+ # logger.info(f"Done. Took {load_time:.1f} seconds.")
48
+ # self.modelname = model_path.split("/")[-1]
49
+ # self.inference_time = 0
50
+
51
+ # def __call__(self, **kwargs) -> List[np.ndarray]:
52
+ # np.savez(f"{self.modelname}_input_{self.inference_time}.npz", **kwargs)
53
+ # #print(kwargs)
54
+ # input_list = [value for key, value in kwargs.items()]
55
+ # for i, input in enumerate(input_list):
56
+ # if isinstance(input, np.ndarray):
57
+ # print(f"input {i} shape: {input.shape}")
58
+ # results = self.rknnlite.inference(inputs=input_list)
59
+ # for res in results:
60
+ # print(f"output shape: {res.shape}")
61
+ # return results
62
+
63
+ import onnxruntime as ort
64
+
65
class RKNN2Model:
    """Wrapper for running ONNX models with a uniform call interface.

    Despite the RKNN2 name, this variant runs models via ONNX Runtime so the
    same pipeline code works off-device.
    """

    def __init__(self, model_dir):
        """Load `model_dir/model.onnx` and its sibling `config.json`.

        Args:
            model_dir: directory containing `model.onnx` and `config.json`.
        """
        logger.info(f"Loading {model_dir}")
        start = time.time()
        assert os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "model.onnx"))
        # Use a context manager so the config file handle is closed
        # deterministically (the original `json.load(open(...))` leaked it).
        with open(os.path.join(model_dir, "config.json")) as config_file:
            self.config = json.load(config_file)
        self.session = ort.InferenceSession(os.path.join(model_dir, "model.onnx"))
        load_time = time.time() - start
        logger.info(f"Done. Took {load_time:.1f} seconds.")
        self.modelname = model_dir.split("/")[-1]
        # Counts how many times __call__ has run (used for debug dump naming).
        self.inference_time = 0

    def __call__(self, **kwargs) -> List[np.ndarray]:
        """Run one inference; kwargs map ONNX input names to numpy arrays.

        Returns:
            List of numpy arrays, one per model output.
        """
        self.inference_time += 1
        # ONNX Runtime already returns a list of numpy arrays; the original
        # built an identical copy in `results_list` and then discarded it.
        return self.session.run(None, kwargs)
87
+
88
+ class RKNN2StableDiffusionPipeline(DiffusionPipeline):
89
+ """ RKNN2 version of
90
+ `diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline`
91
+ """
92
+
93
def __init__(
    self,
    text_encoder: RKNN2Model,
    unet: RKNN2Model,
    vae_decoder: RKNN2Model,
    scheduler: LCMScheduler,
    tokenizer: CLIPTokenizer,
    force_zeros_for_empty_prompt: Optional[bool] = True,
    feature_extractor: Optional[CLIPFeatureExtractor] = None,
    text_encoder_2: Optional[RKNN2Model] = None,
    tokenizer_2: Optional[CLIPTokenizer] = None

):
    """Assemble the ONNX-backed LCM Stable Diffusion pipeline.

    Only `tokenizer`, `scheduler` and `feature_extractor` go through the
    diffusers module registry; the model wrappers are held as plain
    attributes. Output resolution is fixed by the hardcoded 32x32 latent size.
    """
    super().__init__()

    # Register non-Core ML components of the pipeline similar to the original pipeline
    self.register_modules(
        tokenizer=tokenizer,
        scheduler=scheduler,
        feature_extractor=feature_extractor,
    )
    self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
    self.safety_checker = None  # safety checking is intentionally disabled

    # Register Core ML components of the pipeline
    self.text_encoder = text_encoder
    self.text_encoder_2 = text_encoder_2
    self.tokenizer_2 = tokenizer_2
    self.unet = unet
    self.vae_decoder = vae_decoder

    VAE_DECODER_UPSAMPLE_FACTOR = 8

    # In PyTorch, users can determine the tensor shapes dynamically by default
    # In CoreML, tensors have static shapes unless flexible shapes were used during export
    # See https://coremltools.readme.io/docs/flexible-inputs
    latent_h, latent_w = 32, 32  # hallo1: FIXME: hardcoded value
    self.height = latent_h * VAE_DECODER_UPSAMPLE_FACTOR
    self.width = latent_w * VAE_DECODER_UPSAMPLE_FACTOR
    self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
    logger.info(
        f"Stable Diffusion configured to generate {self.height}x{self.width} images"
    )
136
+
137
+ @staticmethod
138
+ def postprocess(
139
+ image: np.ndarray,
140
+ output_type: str = "pil",
141
+ do_denormalize: Optional[List[bool]] = None,
142
+ ):
143
+ def numpy_to_pil(images: np.ndarray):
144
+ """
145
+ Convert a numpy image or a batch of images to a PIL image.
146
+ """
147
+ if images.ndim == 3:
148
+ images = images[None, ...]
149
+ images = (images * 255).round().astype("uint8")
150
+ if images.shape[-1] == 1:
151
+ # special case for grayscale (single channel) images
152
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
153
+ else:
154
+ pil_images = [Image.fromarray(image) for image in images]
155
+
156
+ return pil_images
157
+
158
+ def denormalize(images: np.ndarray):
159
+ """
160
+ Denormalize an image array to [0,1].
161
+ """
162
+ return np.clip(images / 2 + 0.5, 0, 1)
163
+
164
+ if not isinstance(image, np.ndarray):
165
+ raise ValueError(
166
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
167
+ )
168
+ if output_type not in ["latent", "np", "pil"]:
169
+ deprecation_message = (
170
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
171
+ "`pil`, `np`, `pt`, `latent`"
172
+ )
173
+ logger.warning(deprecation_message)
174
+ output_type = "np"
175
+
176
+ if output_type == "latent":
177
+ return image
178
+
179
+ if do_denormalize is None:
180
+ raise ValueError("do_denormalize is required for postprocessing")
181
+
182
+ image = np.stack(
183
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
184
+ )
185
+ image = image.transpose((0, 2, 3, 1))
186
+
187
+ if output_type == "pil":
188
+ image = numpy_to_pil(image)
189
+
190
+ return image
191
+
192
def _encode_prompt(
    self,
    prompt: Union[str, List[str]],
    num_images_per_prompt: int,
    do_classifier_free_guidance: bool,
    negative_prompt: Optional[Union[str, list]],
    prompt_embeds: Optional[np.ndarray] = None,
    negative_prompt_embeds: Optional[np.ndarray] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.

    Args:
        prompt (`Union[str, List[str]]`):
            prompt to be encoded
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not (the LCM
            `__call__` always passes False here)
        negative_prompt (`Optional[Union[str, list]]`):
            The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
            if `guidance_scale` is less than `1`).
        prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated negative text embeddings. If not provided,
            negative_prompt_embeds will be generated from `negative_prompt`.
    """
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if prompt_embeds is None:
        # get prompt text embeddings; pad/truncate to the CLIP context length
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="np",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

        # Warn the user about any prompt text dropped by truncation.
        if not np.array_equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        # int32 cast matches the exported ONNX text encoder's input dtype.
        prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

    prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance and negative_prompt_embeds is None:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt] * batch_size
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        max_length = prompt_embeds.shape[1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="np",
        )
        negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

    if do_classifier_free_guidance:
        negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

    return prompt_embeds
294
+
295
# Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
def check_inputs(
    self,
    prompt: Union[str, List[str]],
    height: Optional[int],
    width: Optional[int],
    callback_steps: int,
    negative_prompt: Optional[str] = None,
    prompt_embeds: Optional[np.ndarray] = None,
    negative_prompt_embeds: Optional[np.ndarray] = None,
):
    """Validate generation arguments; raises ValueError/TypeError on misuse."""
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    # `callback_steps` must be a strictly positive integer (None is rejected too).
    valid_callback_steps = isinstance(callback_steps, int) and callback_steps > 0
    if not valid_callback_steps:
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

    # Exactly one of `prompt` / `prompt_embeds` must be supplied.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
            " only forward one of the two."
        )
    if prompt is None and prompt_embeds is None:
        raise ValueError(
            "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
        )
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError(
            f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
            f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
        )

    if prompt_embeds is not None and negative_prompt_embeds is not None:
        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
342
+
343
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
    """Sample (or validate) initial latents and scale by the scheduler's init sigma."""
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        # Caller supplied latents: only the shape needs validating.
        if latents.shape != shape:
            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
    elif isinstance(generator, np.random.RandomState):
        latents = generator.randn(*shape).astype(dtype)
    elif isinstance(generator, torch.Generator):
        latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
    else:
        raise ValueError(
            f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
            f" {type(generator)}."
        )

    # scale the initial noise by the standard deviation required by the scheduler
    return latents * np.float64(self.scheduler.init_noise_sigma)
369
+
370
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def __call__(
    self,
    prompt: Union[str, List[str]] = "",
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 4,
    original_inference_steps: int = None,
    guidance_scale: float = 8.5,
    num_images_per_prompt: int = 1,
    generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
    latents: Optional[np.ndarray] = None,
    prompt_embeds: Optional[np.ndarray] = None,
    output_type: str = "pil",
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
    callback_steps: int = 1,
):
    r"""Generate images for `prompt` with the latent-consistency sampling loop.

    Args:
        prompt (`Optional[Union[str, List[str]]]`, defaults to ""):
            The prompt or prompts to guide generation. If not defined, pass
            `prompt_embeds` instead.
        height (`Optional[int]`, defaults to None):
            The height in pixels of the generated image; defaults to the UNet
            sample size times the VAE scale factor.
        width (`Optional[int]`, defaults to None):
            The width in pixels of the generated image; same default rule.
        num_inference_steps (`int`, defaults to 4):
            Number of LCM denoising steps; more steps trade speed for quality.
        original_inference_steps (`int`, defaults to None):
            Length of the original schedule the LCM timesteps are sub-sampled
            from; the scheduler default is used when None.
        guidance_scale (`float`, defaults to 8.5):
            Classifier-free guidance weight. LCM embeds `w = guidance_scale - 1`
            via `get_guidance_scale_embedding` instead of a second UNet pass,
            so no negative prompt is used.
        num_images_per_prompt (`int`, defaults to 1):
            The number of images to generate per prompt.
        generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
            Random source that makes generation deterministic.
        latents (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated noisy latents; sampled from `generator` if omitted.
        prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
            Pre-generated text embeddings, used instead of encoding `prompt`.
        output_type (`str`, defaults to `"pil"`):
            "pil", "np" or "latent".
        return_dict (`bool`, defaults to `True`):
            Return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`]
            when True, otherwise a plain tuple.
        callback (Optional[Callable], defaults to `None`):
            Called as `callback(step, timestep, latents)` during inference.
        callback_steps (`int`, defaults to 1):
            Frequency (in steps) at which `callback` is invoked.

    Returns:
        [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
        the generated images plus nsfw flags; the flags are always None here
        because the safety checker is disabled.
    """
    height = height or self.unet.config["sample_size"] * self.vae_scale_factor
    width = width or self.unet.config["sample_size"] * self.vae_scale_factor

    # Don't need to get negative prompts due to LCM guided distillation
    negative_prompt = None
    negative_prompt_embeds = None

    # check inputs. Raise error if not correct
    self.check_inputs(
        prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
    )

    # define call parameters
    if isinstance(prompt, str):
        batch_size = 1
    elif isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    if generator is None:
        generator = np.random.RandomState()

    # Classifier-free guidance is always off (False) for LCM.
    prompt_embeds = self._encode_prompt(
        prompt,
        num_images_per_prompt,
        False,
        negative_prompt,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
    )

    # set timesteps
    self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
    timesteps = self.scheduler.timesteps

    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        self.unet.config["in_channels"],
        height,
        width,
        prompt_embeds.dtype,
        generator,
        latents,
    )

    bs = batch_size * num_images_per_prompt
    # get Guidance Scale Embedding
    w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
    w_embedding = self.get_guidance_scale_embedding(
        w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
    )

    # Adapted from diffusers to extend it for other runtimes than ORT
    timestep_dtype = np.int64

    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    for i, t in enumerate(self.progress_bar(timesteps)):
        timestep = np.array([t], dtype=timestep_dtype)
        noise_pred = self.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=prompt_embeds,
            timestep_cond=w_embedding,
        )[0]

        # compute the previous noisy sample x_t -> x_t-1
        # (the scheduler step runs in torch; arrays are round-tripped per step)
        latents, denoised = self.scheduler.step(
            torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
        )
        latents, denoised = latents.numpy(), denoised.numpy()

        # call the callback, if provided
        if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

    if output_type == "latent":
        image = denoised
        has_nsfw_concept = None
    else:
        denoised /= self.vae_decoder.config["scaling_factor"]
        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
        image = np.concatenate(
            [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
        )
        # image, has_nsfw_concept = self.run_safety_checker(image)
        has_nsfw_concept = None  # skip safety checker

    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
544
+
545
+
546
# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
    """Sinusoidal embedding of the LCM guidance-scale conditioning vector.

    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

    Args:
        w (`np.ndarray`): guidance-scale values, one per batch element.
        embedding_dim (`int`, *optional*, defaults to 512):
            dimension of the embeddings to generate.
        dtype: data type of the generated embeddings.

    Returns:
        `np.ndarray` of shape `(len(w), embedding_dim)`.
    """
    amplified = w * 1000
    half = embedding_dim // 2
    # Same geometric frequency spacing as the VDM reference code.
    decay = np.log(10000.0) / (half - 1)
    freq = np.exp(np.arange(half, dtype=dtype) * -decay)
    phase = amplified[:, None] * freq[None, :]
    out = np.concatenate([np.sin(phase), np.cos(phase)], axis=1)

    if embedding_dim % 2 == 1:
        # zero-pad the trailing column for odd dimensions
        out = np.pad(out, [(0, 0), (0, 1)])

    assert out.shape == (w.shape[0], embedding_dim)
    return out
574
+
575
def get_image_path(args, **override_kwargs):
    """Create the output folder (named after the prompt) and return a PNG path
    whose filename encodes the generation metadata (seed, steps, backend)."""
    prompt_slug = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_slug)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get('seed', None) or args.seed
    steps = override_kwargs.get('num_inference_steps', None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}" + "_onnx_"

    return os.path.join(out_folder, out_fname + ".png")
588
+
589
+
590
def prepare_controlnet_cond(image_path, height, width):
    """Load a conditioning image and convert it to a CHW float array in [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the previous code passed
    # (height, width), which transposed the target size for non-square images.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
595
+
596
+
597
def main(args):
    """Assemble the RKNN Stable Diffusion pipeline from the model directory
    and generate a single image for the CLI arguments."""
    logger.info(f"Setting random seed to {args.seed}")

    # load scheduler from /scheduler/scheduler_config.json
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        user_specified_scheduler = LCMScheduler.from_config(json.load(f))

    print("user_specified_scheduler", user_specified_scheduler)

    pipe = RKNN2StableDiffusionPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")
    # args.size is "HxW"; extra components beyond the first two are ignored.
    dims = args.size.split("x")
    result = pipe(
        prompt=args.prompt,
        height=int(dims[0]),
        width=int(dims[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    out_path = get_image_path(args)
    logger.info(f"Saving generated image to {out_path}")
    result["images"][0].save(out_path)
629
+
630
+
631
if __name__ == "__main__":
    # CLI entry point: parse generation options and run the pipeline once.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True,
                        help="The text prompt to be used for text-to-image generation.")
    parser.add_argument("-i", required=True, help=("Path to model directory"))
    parser.add_argument("-o", required=True)
    parser.add_argument("--seed", default=93, type=int,
                        help="Random seed to be able to reproduce results")
    parser.add_argument("-s", "--size", default="256x256", type=str, help="Image size")
    parser.add_argument("--num-inference-steps", default=4, type=int,
                        help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
    parser.add_argument("--guidance-scale", default=7.5, type=float,
                        help="Controls the influence of the text prompt on sampling process (0=random images)")

    main(parser.parse_args())
run_rknn-lcm.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import json
4
+ import time
5
+
6
+ import PIL
7
+ from diffusers import StableDiffusionPipeline
8
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
9
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
10
+ from diffusers.schedulers import (
11
+ LCMScheduler
12
+ )
13
+
14
+ import logging
15
+
16
+ logging.basicConfig()
17
+ logger = logging.getLogger(__name__)
18
+ logger.setLevel(logging.INFO)
19
+
20
+ import numpy as np
21
+ import os
22
+
23
+ import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
24
+ from transformers import CLIPFeatureExtractor, CLIPTokenizer
25
+ from typing import Callable, List, Optional, Union, Tuple
26
+ from PIL import Image
27
+
28
+ from rknnlite.api import RKNNLite
29
+
30
class RKNN2Model:
    """ Wrapper for running RKNPU2 models """

    def __init__(self, model_dir):
        # Loads `model_dir/model.rknn` onto the NPU once and keeps the runtime
        # alive for all subsequent `__call__` invocations.
        logger.info(f"Loading {model_dir}")
        start = time.time()
        self.config = json.load(open(os.path.join(model_dir, "config.json")))
        assert os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "model.rknn"))
        self.rknnlite = RKNNLite()
        self.rknnlite.load_rknn(os.path.join(model_dir, "model.rknn"))
        self.rknnlite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO) # Multi-core will cause kernel crash
        load_time = time.time() - start
        logger.info(f"Done. Took {load_time:.1f} seconds.")
        # Last path component is used as a label (e.g. "unet", "vae_decoder").
        self.modelname = model_dir.split("/")[-1]
        # Counter kept for the (currently commented-out) input-dump debugging.
        self.inference_time = 0

    def __call__(self, **kwargs) -> List[np.ndarray]:
        # Runs one inference. Inputs are passed to the RKNN runtime positionally
        # in keyword-argument order (dicts preserve insertion order), so callers
        # must supply kwargs in the order the compiled model expects.
        # np.savez(f"rknn_out/{self.modelname}_input_{self.inference_time}.npz", **kwargs)
        # self.inference_time += 1
        #print(kwargs)
        input_list = [value for key, value in kwargs.items()]
        # NOTE(review): loop variable shadows the builtin `input`; debug prints
        # below go to stdout on every call.
        for i, input in enumerate(input_list):
            if isinstance(input, np.ndarray):
                print(f"input {i} shape: {input.shape}")

        results = self.rknnlite.inference(inputs=input_list, data_format='nchw')
        for res in results:
            print(f"output shape: {res.shape}")
        return results
59
+
60
class RKNN2LatentConsistencyPipeline(DiffusionPipeline):
    # Latent Consistency Model text-to-image pipeline whose text encoder, UNet,
    # and VAE decoder run on the Rockchip NPU via RKNN2Model wrappers.

    def __init__(
        self,
        text_encoder: RKNN2Model,
        unet: RKNN2Model,
        vae_decoder: RKNN2Model,
        scheduler: LCMScheduler,
        tokenizer: CLIPTokenizer,
        force_zeros_for_empty_prompt: Optional[bool] = True,
        feature_extractor: Optional[CLIPFeatureExtractor] = None,
        text_encoder_2: Optional[RKNN2Model] = None,
        tokenizer_2: Optional[CLIPTokenizer] = None
    ):
        super().__init__()

        # Only the torch-free components are registered with diffusers;
        # NOTE(review): the RKNN models are kept as plain attributes below,
        # presumably so DiffusionPipeline does not treat them as torch modules
        # — confirm against diffusers' register_modules contract.
        self.register_modules(
            tokenizer=tokenizer,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )
        self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
        # Safety checking is intentionally disabled in this pipeline.
        self.safety_checker = None

        self.text_encoder = text_encoder
        self.text_encoder_2 = text_encoder_2
        self.tokenizer_2 = tokenizer_2
        self.unet = unet
        self.vae_decoder = vae_decoder

        # Latent-to-pixel upsampling factor of the VAE decoder (SD VAE: 8x).
        VAE_DECODER_UPSAMPLE_FACTOR = 8
        self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
92
+
93
+ @staticmethod
94
+ def postprocess(
95
+ image: np.ndarray,
96
+ output_type: str = "pil",
97
+ do_denormalize: Optional[List[bool]] = None,
98
+ ):
99
+ def numpy_to_pil(images: np.ndarray):
100
+ """
101
+ Convert a numpy image or a batch of images to a PIL image.
102
+ """
103
+ if images.ndim == 3:
104
+ images = images[None, ...]
105
+ images = (images * 255).round().astype("uint8")
106
+ if images.shape[-1] == 1:
107
+ # special case for grayscale (single channel) images
108
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
109
+ else:
110
+ pil_images = [Image.fromarray(image) for image in images]
111
+
112
+ return pil_images
113
+
114
+ def denormalize(images: np.ndarray):
115
+ """
116
+ Denormalize an image array to [0,1].
117
+ """
118
+ return np.clip(images / 2 + 0.5, 0, 1)
119
+
120
+ if not isinstance(image, np.ndarray):
121
+ raise ValueError(
122
+ f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
123
+ )
124
+ if output_type not in ["latent", "np", "pil"]:
125
+ deprecation_message = (
126
+ f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
127
+ "`pil`, `np`, `pt`, `latent`"
128
+ )
129
+ logger.warning(deprecation_message)
130
+ output_type = "np"
131
+
132
+ if output_type == "latent":
133
+ return image
134
+
135
+ if do_denormalize is None:
136
+ raise ValueError("do_denormalize is required for postprocessing")
137
+
138
+ image = np.stack(
139
+ [denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
140
+ )
141
+ image = image.transpose((0, 2, 3, 1))
142
+
143
+ if output_type == "pil":
144
+ image = numpy_to_pil(image)
145
+
146
+ return image
147
+
148
    def _encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int,
        do_classifier_free_guidance: bool,
        negative_prompt: Optional[Union[str, list]],
        prompt_embeds: Optional[np.ndarray] = None,
        negative_prompt_embeds: Optional[np.ndarray] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`Union[str, List[str]]`):
                prompt to be encoded
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`Optional[Union[str, list]]`):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.

        Returns:
            `np.ndarray`: the prompt embeddings, repeated `num_images_per_prompt`
            times; with classifier-free guidance the unconditional embeddings are
            concatenated in front.
        """
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # prompt may be None here only when prompt_embeds was supplied.
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="np",
            )
            text_input_ids = text_inputs.input_ids
            # Re-tokenize without truncation to detect (and warn about) any part
            # of the prompt that was cut off at model_max_length.
            untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

            if not np.array_equal(text_input_ids, untruncated_ids):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            # Cast ids to int32 before handing them to the RKNN text encoder.
            prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

        prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt] * batch_size
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Pad the unconditional input to the same sequence length as the
            # conditional embeddings so the two batches can be concatenated.
            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="np",
            )
            negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

        if do_classifier_free_guidance:
            negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds
250
+
251
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
252
+ def check_inputs(
253
+ self,
254
+ prompt: Union[str, List[str]],
255
+ height: Optional[int],
256
+ width: Optional[int],
257
+ callback_steps: int,
258
+ negative_prompt: Optional[str] = None,
259
+ prompt_embeds: Optional[np.ndarray] = None,
260
+ negative_prompt_embeds: Optional[np.ndarray] = None,
261
+ ):
262
+ if height % 8 != 0 or width % 8 != 0:
263
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
264
+
265
+ if (callback_steps is None) or (
266
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
267
+ ):
268
+ raise ValueError(
269
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
270
+ f" {type(callback_steps)}."
271
+ )
272
+
273
+ if prompt is not None and prompt_embeds is not None:
274
+ raise ValueError(
275
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
276
+ " only forward one of the two."
277
+ )
278
+ elif prompt is None and prompt_embeds is None:
279
+ raise ValueError(
280
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
281
+ )
282
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
283
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
284
+
285
+ if negative_prompt is not None and negative_prompt_embeds is not None:
286
+ raise ValueError(
287
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
288
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
289
+ )
290
+
291
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
292
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
293
+ raise ValueError(
294
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
295
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
296
+ f" {negative_prompt_embeds.shape}."
297
+ )
298
+
299
+ # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
300
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
301
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
302
+ if isinstance(generator, list) and len(generator) != batch_size:
303
+ raise ValueError(
304
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
305
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
306
+ )
307
+
308
+ if latents is None:
309
+ if isinstance(generator, np.random.RandomState):
310
+ latents = generator.randn(*shape).astype(dtype)
311
+ elif isinstance(generator, torch.Generator):
312
+ latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
313
+ else:
314
+ raise ValueError(
315
+ f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
316
+ f" {type(generator)}."
317
+ )
318
+ elif latents.shape != shape:
319
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
320
+
321
+ # scale the initial noise by the standard deviation required by the scheduler
322
+ latents = latents * np.float64(self.scheduler.init_noise_sigma)
323
+
324
+ return latents
325
+
326
    # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
    def __call__(
        self,
        prompt: Union[str, List[str]] = "",
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 4,
        original_inference_steps: int = None,
        guidance_scale: float = 8.5,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
        latents: Optional[np.ndarray] = None,
        prompt_embeds: Optional[np.ndarray] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            height (`Optional[int]`, defaults to None):
                The height in pixels of the generated image.
            width (`Optional[int]`, defaults to None):
                The width in pixels of the generated image.
            num_inference_steps (`int`, defaults to 4):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            original_inference_steps (`int`, defaults to None):
                Passed through to `LCMScheduler.set_timesteps`.
            guidance_scale (`float`, defaults to 8.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality. In LCM it enters via a
                guidance-scale embedding rather than a second forward pass.
            num_images_per_prompt (`int`, defaults to 1):
                The number of images to generate per prompt.
            generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
                A np.random.RandomState to make generation deterministic.
            latents (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            output_type (`str`, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (Optional[Callable], defaults to `None`):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content (always `None` here — the safety checker is skipped).
        """
        # Default to the UNet's native resolution when height/width are omitted.
        height = height or self.unet.config["sample_size"] * self.vae_scale_factor
        width = width or self.unet.config["sample_size"] * self.vae_scale_factor

        # Don't need to get negative prompts due to LCM guided distillation
        negative_prompt = None
        negative_prompt_embeds = None

        # check inputs. Raise error if not correct
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # define call parameters
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if generator is None:
            generator = np.random.RandomState()

        start_time = time.time()
        # Classifier-free guidance is disabled (False): guidance is injected via
        # the w_embedding below instead of a second unconditional forward pass.
        prompt_embeds = self._encode_prompt(
            prompt,
            num_images_per_prompt,
            False,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        encode_prompt_time = time.time() - start_time
        print(f"Prompt encoding time: {encode_prompt_time:.2f}s")

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
        timesteps = self.scheduler.timesteps

        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            self.unet.config["in_channels"],
            height,
            width,
            prompt_embeds.dtype,
            generator,
            latents,
        )

        bs = batch_size * num_images_per_prompt
        # get Guidance Scale Embedding
        w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
        w_embedding = self.get_guidance_scale_embedding(
            w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
        )

        # Adapted from diffusers to extend it for other runtimes than ORT
        timestep_dtype = np.int64

        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        inference_start = time.time()
        for i, t in enumerate(self.progress_bar(timesteps)):
            timestep = np.array([t], dtype=timestep_dtype)
            noise_pred = self.unet(
                sample=latents,
                timestep=timestep,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=w_embedding,
            )[0]

            # compute the previous noisy sample x_t -> x_t-1
            # (scheduler.step is torch-based, so round-trip through torch tensors)
            latents, denoised = self.scheduler.step(
                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
            )
            latents, denoised = latents.numpy(), denoised.numpy()

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)
        inference_time = time.time() - inference_start
        print(f"Inference time: {inference_time:.2f}s")

        decode_start = time.time()
        if output_type == "latent":
            image = denoised
            has_nsfw_concept = None
        else:
            # Undo the VAE scaling before decoding.
            denoised /= self.vae_decoder.config["scaling_factor"]
            # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
            image = np.concatenate(
                [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
            )
            # image, has_nsfw_concept = self.run_safety_checker(image)
            has_nsfw_concept = None # skip safety checker

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
        decode_time = time.time() - decode_start
        print(f"Decode time: {decode_time:.2f}s")

        total_time = encode_prompt_time + inference_time + decode_time
        print(f"Total time: {total_time:.2f}s")

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
512
+
513
+
514
    # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
        """
        Build sinusoidal guidance-scale embeddings for the LCM UNet.

        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w (`np.ndarray`):
                guidance-scale values, one per batch element
            embedding_dim (`int`, *optional*, defaults to 512):
                dimension of the embeddings to generate
            dtype:
                data type of the generated embeddings

        Returns:
            `np.ndarray`: Embedding vectors with shape `(len(w), embedding_dim)`
        """
        w = w * 1000
        half_dim = embedding_dim // 2
        # Geometric frequency ladder from 1 down to 1/10000.
        emb = np.log(10000.0) / (half_dim - 1)
        emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb)
        emb = w[:, None] * emb[None, :]
        emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1)

        if embedding_dim % 2 == 1:  # zero pad
            emb = np.pad(emb, [(0, 0), (0, 1)])

        assert emb.shape == (w.shape[0], embedding_dim)
        return emb
542
+
543
def get_image_path(args, **override_kwargs):
    """Create the output folder (named after the prompt) and return a PNG path
    whose filename encodes the generation metadata (seed, inference steps)."""
    prompt_slug = "_".join(args.prompt.replace("/", "_").rsplit(" "))
    out_folder = os.path.join(args.o, prompt_slug)
    os.makedirs(out_folder, exist_ok=True)

    seed = override_kwargs.get('seed', None) or args.seed
    steps = override_kwargs.get('num_inference_steps', None) or args.num_inference_steps
    out_fname = f"randomSeed_{seed}" + "_LCM_" + f"_numInferenceSteps{steps}"

    return os.path.join(out_folder, out_fname + ".png")
555
+
556
+
557
def prepare_controlnet_cond(image_path, height, width):
    """Load a conditioning image and convert it to a CHW float array in [0, 1].

    Args:
        image_path: path to the conditioning image file.
        height: target height in pixels.
        width: target width in pixels.

    Returns:
        `np.ndarray` of shape (3, height, width) with values in [0, 1].
    """
    image = Image.open(image_path).convert("RGB")
    # PIL's Image.resize takes (width, height); the previous code passed
    # (height, width), which transposed the target size for non-square images.
    image = image.resize((width, height), resample=Image.LANCZOS)
    image = np.array(image).transpose(2, 0, 1) / 255.0
    return image
562
+
563
+
564
def main(args):
    """Assemble the RKNN Latent Consistency pipeline from the model directory
    and generate a single image for the CLI arguments."""
    logger.info(f"Setting random seed to {args.seed}")

    # load scheduler from /scheduler/scheduler_config.json
    scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
    with open(scheduler_config_path, "r") as f:
        user_specified_scheduler = LCMScheduler.from_config(json.load(f))

    print("user_specified_scheduler", user_specified_scheduler)

    pipe = RKNN2LatentConsistencyPipeline(
        text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
        unet=RKNN2Model(os.path.join(args.i, "unet")),
        vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
        scheduler=user_specified_scheduler,
        tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
    )

    logger.info("Beginning image generation.")
    # args.size is "HxW"; extra components beyond the first two are ignored.
    dims = args.size.split("x")
    result = pipe(
        prompt=args.prompt,
        height=int(dims[0]),
        width=int(dims[1]),
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=np.random.RandomState(args.seed),
    )

    out_path = get_image_path(args)
    logger.info(f"Saving generated image to {out_path}")
    result["images"][0].save(out_path)
596
+
597
+
598
if __name__ == "__main__":
    # CLI entry point: parse generation options and run the pipeline once.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", required=True,
                        help="The text prompt to be used for text-to-image generation.")
    parser.add_argument("-i", required=True, help=("Path to model directory"))
    parser.add_argument("-o", required=True)
    parser.add_argument("--seed", default=93, type=int,
                        help="Random seed to be able to reproduce results")
    parser.add_argument("-s", "--size", default="256x256", type=str, help="Image size")
    parser.add_argument("--num-inference-steps", default=4, type=int,
                        help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
    parser.add_argument("--guidance-scale", default=7.5, type=float,
                        help="Controls the influence of the text prompt on sampling process (0=random images)")

    main(parser.parse_args())
runner.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Launch the LCM Stable Diffusion server container.
# --network host + --privileged: NOTE(review) presumably required for direct
#   access to the Rockchip NPU device nodes on the host — confirm.
# The model directory is bind-mounted read-only at /models and the server is
# configured via environment variables (PORT, NUM_WORKERS, QUEUE_MAX, MODEL_ROOT).
docker run --rm -it \
  --name lcm-sd \
  --network host \
  --privileged \
  -e PORT=4200 \
  -e NUM_WORKERS=1 \
  -e QUEUE_MAX=8 \
  -e MODEL_ROOT=/models \
  -v "$PWD/model:/models:ro" \
  lcm-sd