yongqiang committed on
Commit ·
1e356cf
1
Parent(s): fb6a7b1
The text_encoder supports inference using onnx & axmodel
Browse files
- .gitattributes +2 -0
- models/text_encoder/sd15_text_encoder_sim.axmodel +3 -0
- run_img2img_axe_infer.py +11 -6
- run_img2img_onnx_infer.py +15 -8
- run_txt2img_axe_infer.py +11 -6
- run_txt2img_axe_infer_new.py +10 -5
- run_txt2img_onnx_infer.py +14 -8
.gitattributes
CHANGED
|
@@ -43,3 +43,5 @@ models/unet.axmodel filter=lfs diff=lfs merge=lfs -text
|
|
| 43 |
models/vae_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 44 |
models/vae_encoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 45 |
models/7ffcf62c-d292-11ef-bb2a-9d527016cd35 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 43 |
models/vae_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 44 |
models/vae_encoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 45 |
models/7ffcf62c-d292-11ef-bb2a-9d527016cd35 filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
models/text_encoder/sd15_text_encoder_sim.onnx filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
models/text_encoder/sd15_text_encoder_sim.axmodel filter=lfs diff=lfs merge=lfs -text
|
models/text_encoder/sd15_text_encoder_sim.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82cde36ef8130294b1e908a60ba75a266e5543b7f606ba73998f3f7ee5cac243
|
| 3 |
+
size 175200001
|
run_img2img_axe_infer.py
CHANGED
|
@@ -5,7 +5,7 @@ import axengine
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
-
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
from diffusers.utils import load_image
|
|
@@ -353,9 +353,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
|
|
| 353 |
|
| 354 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 355 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 356 |
-
|
| 357 |
-
torch_dtype=torch.float32,
|
| 358 |
-
variant="fp16")
|
| 359 |
text_inputs = tokenizer(
|
| 360 |
prompt,
|
| 361 |
padding="max_length",
|
|
@@ -364,9 +362,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
|
|
| 364 |
return_tensors="pt",
|
| 365 |
)
|
| 366 |
text_input_ids = text_inputs.input_ids
|
| 367 |
-
prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
|
| 368 |
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
return prompt_embeds_npy
|
| 371 |
|
| 372 |
|
|
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
+
import os
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
from diffusers.utils import load_image
|
|
|
|
| 353 |
|
| 354 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 355 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 356 |
+
|
|
|
|
|
|
|
| 357 |
text_inputs = tokenizer(
|
| 358 |
prompt,
|
| 359 |
padding="max_length",
|
|
|
|
| 362 |
return_tensors="pt",
|
| 363 |
)
|
| 364 |
text_input_ids = text_inputs.input_ids
|
|
|
|
| 365 |
|
| 366 |
+
text_encoder = axengine.InferenceSession(
|
| 367 |
+
os.path.join(
|
| 368 |
+
text_encoder_dir,
|
| 369 |
+
"sd15_text_encoder_sim.axmodel"
|
| 370 |
+
),
|
| 371 |
+
)
|
| 372 |
+
text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
|
| 373 |
+
|
| 374 |
+
prompt_embeds_npy = text_encoder_onnx_out
|
| 375 |
return prompt_embeds_npy
|
| 376 |
|
| 377 |
|
run_img2img_onnx_infer.py
CHANGED
|
@@ -7,6 +7,7 @@ from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIP
|
|
| 7 |
|
| 8 |
# import axengine as axe
|
| 9 |
import time
|
|
|
|
| 10 |
import argparse
|
| 11 |
from diffusers.utils import load_image
|
| 12 |
import PIL.Image
|
|
@@ -353,9 +354,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
|
|
| 353 |
|
| 354 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 355 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 356 |
-
|
| 357 |
-
torch_dtype=torch.float32,
|
| 358 |
-
variant="fp16")
|
| 359 |
text_inputs = tokenizer(
|
| 360 |
prompt,
|
| 361 |
padding="max_length",
|
|
@@ -364,9 +363,17 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
|
|
| 364 |
return_tensors="pt",
|
| 365 |
)
|
| 366 |
text_input_ids = text_inputs.input_ids
|
| 367 |
-
prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
|
| 368 |
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
return prompt_embeds_npy
|
| 371 |
|
| 372 |
|
|
@@ -431,9 +438,9 @@ if __name__ == '__main__':
|
|
| 431 |
|
| 432 |
# load unet model and vae model
|
| 433 |
start = time.time()
|
| 434 |
-
vae_encoder = onnxruntime.InferenceSession(vae_encoder_model)
|
| 435 |
-
unet_session_main = onnxruntime.InferenceSession(unet_model)
|
| 436 |
-
vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
|
| 437 |
print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
|
| 438 |
|
| 439 |
# load time input file
|
|
|
|
| 7 |
|
| 8 |
# import axengine as axe
|
| 9 |
import time
|
| 10 |
+
import os
|
| 11 |
import argparse
|
| 12 |
from diffusers.utils import load_image
|
| 13 |
import PIL.Image
|
|
|
|
| 354 |
|
| 355 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 356 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 357 |
+
|
|
|
|
|
|
|
| 358 |
text_inputs = tokenizer(
|
| 359 |
prompt,
|
| 360 |
padding="max_length",
|
|
|
|
| 363 |
return_tensors="pt",
|
| 364 |
)
|
| 365 |
text_input_ids = text_inputs.input_ids
|
|
|
|
| 366 |
|
| 367 |
+
text_encoder = onnxruntime.InferenceSession(
|
| 368 |
+
os.path.join(
|
| 369 |
+
text_encoder_dir,
|
| 370 |
+
"sd15_text_encoder_sim.onnx"
|
| 371 |
+
),
|
| 372 |
+
providers=["CPUExecutionProvider"]
|
| 373 |
+
)
|
| 374 |
+
text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy()})[0]
|
| 375 |
+
|
| 376 |
+
prompt_embeds_npy = text_encoder_onnx_out
|
| 377 |
return prompt_embeds_npy
|
| 378 |
|
| 379 |
|
|
|
|
| 438 |
|
| 439 |
# load unet model and vae model
|
| 440 |
start = time.time()
|
| 441 |
+
vae_encoder = onnxruntime.InferenceSession(vae_encoder_model, providers=["CPUExecutionProvider"])
|
| 442 |
+
unet_session_main = onnxruntime.InferenceSession(unet_model, providers=["CPUExecutionProvider"])
|
| 443 |
+
vae_decoder = onnxruntime.InferenceSession(vae_decoder_model, providers=["CPUExecutionProvider"])
|
| 444 |
print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
|
| 445 |
|
| 446 |
# load time input file
|
run_txt2img_axe_infer.py
CHANGED
|
@@ -5,7 +5,7 @@ import axengine
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
-
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
|
|
@@ -55,9 +55,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
|
|
| 55 |
|
| 56 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 57 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 58 |
-
|
| 59 |
-
torch_dtype=torch.float32,
|
| 60 |
-
variant="fp16")
|
| 61 |
text_inputs = tokenizer(
|
| 62 |
prompt,
|
| 63 |
padding="max_length",
|
|
@@ -66,9 +64,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
|
|
| 66 |
return_tensors="pt",
|
| 67 |
)
|
| 68 |
text_input_ids = text_inputs.input_ids
|
| 69 |
-
prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return prompt_embeds_npy
|
| 73 |
|
| 74 |
|
|
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
+
import os
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
|
|
|
|
| 55 |
|
| 56 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 57 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 58 |
+
|
|
|
|
|
|
|
| 59 |
text_inputs = tokenizer(
|
| 60 |
prompt,
|
| 61 |
padding="max_length",
|
|
|
|
| 64 |
return_tensors="pt",
|
| 65 |
)
|
| 66 |
text_input_ids = text_inputs.input_ids
|
|
|
|
| 67 |
|
| 68 |
+
text_encoder = axengine.InferenceSession(
|
| 69 |
+
os.path.join(
|
| 70 |
+
text_encoder_dir,
|
| 71 |
+
"sd15_text_encoder_sim.axmodel"
|
| 72 |
+
),
|
| 73 |
+
)
|
| 74 |
+
text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
|
| 75 |
+
|
| 76 |
+
prompt_embeds_npy = text_encoder_onnx_out
|
| 77 |
return prompt_embeds_npy
|
| 78 |
|
| 79 |
|
run_txt2img_axe_infer_new.py
CHANGED
|
@@ -57,9 +57,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
|
|
| 57 |
|
| 58 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 59 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 60 |
-
|
| 61 |
-
torch_dtype=torch.float32,
|
| 62 |
-
variant="fp16")
|
| 63 |
text_inputs = tokenizer(
|
| 64 |
prompt,
|
| 65 |
padding="max_length",
|
|
@@ -68,9 +66,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
|
|
| 68 |
return_tensors="pt",
|
| 69 |
)
|
| 70 |
text_input_ids = text_inputs.input_ids
|
| 71 |
-
prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
|
| 72 |
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
return prompt_embeds_npy
|
| 75 |
|
| 76 |
|
|
|
|
| 57 |
|
| 58 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 59 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 60 |
+
|
|
|
|
|
|
|
| 61 |
text_inputs = tokenizer(
|
| 62 |
prompt,
|
| 63 |
padding="max_length",
|
|
|
|
| 66 |
return_tensors="pt",
|
| 67 |
)
|
| 68 |
text_input_ids = text_inputs.input_ids
|
|
|
|
| 69 |
|
| 70 |
+
text_encoder = axengine.InferenceSession(
|
| 71 |
+
os.path.join(
|
| 72 |
+
text_encoder_dir,
|
| 73 |
+
"sd15_text_encoder_sim.axmodel"
|
| 74 |
+
),
|
| 75 |
+
)
|
| 76 |
+
text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
|
| 77 |
+
|
| 78 |
+
prompt_embeds_npy = text_encoder_onnx_out
|
| 79 |
return prompt_embeds_npy
|
| 80 |
|
| 81 |
|
run_txt2img_onnx_infer.py
CHANGED
|
@@ -5,7 +5,7 @@ import onnxruntime
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
-
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
|
|
@@ -55,9 +55,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
|
|
| 55 |
|
| 56 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 57 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 58 |
-
|
| 59 |
-
torch_dtype=torch.float32,
|
| 60 |
-
variant="fp16")
|
| 61 |
text_inputs = tokenizer(
|
| 62 |
prompt,
|
| 63 |
padding="max_length",
|
|
@@ -66,9 +64,17 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
|
|
| 66 |
return_tensors="pt",
|
| 67 |
)
|
| 68 |
text_input_ids = text_inputs.input_ids
|
| 69 |
-
prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
return prompt_embeds_npy
|
| 73 |
|
| 74 |
|
|
@@ -116,8 +122,8 @@ if __name__ == '__main__':
|
|
| 116 |
|
| 117 |
# load unet model and vae model
|
| 118 |
start = time.time()
|
| 119 |
-
unet_session_main = onnxruntime.InferenceSession(unet_model)
|
| 120 |
-
vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
|
| 121 |
print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
|
| 122 |
|
| 123 |
# load time input file
|
|
|
|
| 5 |
import torch
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
|
| 8 |
+
import os
|
| 9 |
import time
|
| 10 |
import argparse
|
| 11 |
|
|
|
|
| 55 |
|
| 56 |
def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
|
| 57 |
tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
|
| 58 |
+
|
|
|
|
|
|
|
| 59 |
text_inputs = tokenizer(
|
| 60 |
prompt,
|
| 61 |
padding="max_length",
|
|
|
|
| 64 |
return_tensors="pt",
|
| 65 |
)
|
| 66 |
text_input_ids = text_inputs.input_ids
|
|
|
|
| 67 |
|
| 68 |
+
text_encoder = onnxruntime.InferenceSession(
|
| 69 |
+
os.path.join(
|
| 70 |
+
text_encoder_dir,
|
| 71 |
+
"sd15_text_encoder_sim.onnx"
|
| 72 |
+
),
|
| 73 |
+
providers=["CPUExecutionProvider"]
|
| 74 |
+
)
|
| 75 |
+
text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy()})[0]
|
| 76 |
+
|
| 77 |
+
prompt_embeds_npy = text_encoder_onnx_out
|
| 78 |
return prompt_embeds_npy
|
| 79 |
|
| 80 |
|
|
|
|
| 122 |
|
| 123 |
# load unet model and vae model
|
| 124 |
start = time.time()
|
| 125 |
+
unet_session_main = onnxruntime.InferenceSession(unet_model, providers=["CPUExecutionProvider"])
|
| 126 |
+
vae_decoder = onnxruntime.InferenceSession(vae_decoder_model, providers=["CPUExecutionProvider"])
|
| 127 |
print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
|
| 128 |
|
| 129 |
# load time input file
|