yongqiang committed on
Commit
1e356cf
·
1 Parent(s): fb6a7b1

The text_encoder supports inference using onnx & axmodel

Browse files
.gitattributes CHANGED
@@ -43,3 +43,5 @@ models/unet.axmodel filter=lfs diff=lfs merge=lfs -text
43
  models/vae_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
44
  models/vae_encoder.axmodel filter=lfs diff=lfs merge=lfs -text
45
  models/7ffcf62c-d292-11ef-bb2a-9d527016cd35 filter=lfs diff=lfs merge=lfs -text
 
 
 
43
  models/vae_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
44
  models/vae_encoder.axmodel filter=lfs diff=lfs merge=lfs -text
45
  models/7ffcf62c-d292-11ef-bb2a-9d527016cd35 filter=lfs diff=lfs merge=lfs -text
46
+ models/text_encoder/sd15_text_encoder_sim.onnx filter=lfs diff=lfs merge=lfs -text
47
+ models/text_encoder/sd15_text_encoder_sim.axmodel filter=lfs diff=lfs merge=lfs -text
models/text_encoder/sd15_text_encoder_sim.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82cde36ef8130294b1e908a60ba75a266e5543b7f606ba73998f3f7ee5cac243
3
+ size 175200001
run_img2img_axe_infer.py CHANGED
@@ -5,7 +5,7 @@ import axengine
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
-
9
  import time
10
  import argparse
11
  from diffusers.utils import load_image
@@ -353,9 +353,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
353
 
354
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
355
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
356
- text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
357
- torch_dtype=torch.float32,
358
- variant="fp16")
359
  text_inputs = tokenizer(
360
  prompt,
361
  padding="max_length",
@@ -364,9 +362,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
364
  return_tensors="pt",
365
  )
366
  text_input_ids = text_inputs.input_ids
367
- prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
368
 
369
- prompt_embeds_npy = prompt_embeds[0].detach().numpy()
 
 
 
 
 
 
 
 
370
  return prompt_embeds_npy
371
 
372
 
 
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
+ import os
9
  import time
10
  import argparse
11
  from diffusers.utils import load_image
 
353
 
354
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
355
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
356
+
 
 
357
  text_inputs = tokenizer(
358
  prompt,
359
  padding="max_length",
 
362
  return_tensors="pt",
363
  )
364
  text_input_ids = text_inputs.input_ids
 
365
 
366
+ text_encoder = axengine.InferenceSession(
367
+ os.path.join(
368
+ text_encoder_dir,
369
+ "sd15_text_encoder_sim.axmodel"
370
+ ),
371
+ )
372
+ text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
373
+
374
+ prompt_embeds_npy = text_encoder_onnx_out
375
  return prompt_embeds_npy
376
 
377
 
run_img2img_onnx_infer.py CHANGED
@@ -7,6 +7,7 @@ from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIP
7
 
8
  # import axengine as axe
9
  import time
 
10
  import argparse
11
  from diffusers.utils import load_image
12
  import PIL.Image
@@ -353,9 +354,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
353
 
354
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
355
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
356
- text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
357
- torch_dtype=torch.float32,
358
- variant="fp16")
359
  text_inputs = tokenizer(
360
  prompt,
361
  padding="max_length",
@@ -364,9 +363,17 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
364
  return_tensors="pt",
365
  )
366
  text_input_ids = text_inputs.input_ids
367
- prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
368
 
369
- prompt_embeds_npy = prompt_embeds[0].detach().numpy()
 
 
 
 
 
 
 
 
 
370
  return prompt_embeds_npy
371
 
372
 
@@ -431,9 +438,9 @@ if __name__ == '__main__':
431
 
432
  # load unet model and vae model
433
  start = time.time()
434
- vae_encoder = onnxruntime.InferenceSession(vae_encoder_model)
435
- unet_session_main = onnxruntime.InferenceSession(unet_model)
436
- vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
437
  print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
438
 
439
  # load time input file
 
7
 
8
  # import axengine as axe
9
  import time
10
+ import os
11
  import argparse
12
  from diffusers.utils import load_image
13
  import PIL.Image
 
354
 
355
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
356
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
357
+
 
 
358
  text_inputs = tokenizer(
359
  prompt,
360
  padding="max_length",
 
363
  return_tensors="pt",
364
  )
365
  text_input_ids = text_inputs.input_ids
 
366
 
367
+ text_encoder = onnxruntime.InferenceSession(
368
+ os.path.join(
369
+ text_encoder_dir,
370
+ "sd15_text_encoder_sim.onnx"
371
+ ),
372
+ providers=["CPUExecutionProvider"]
373
+ )
374
+ text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy()})[0]
375
+
376
+ prompt_embeds_npy = text_encoder_onnx_out
377
  return prompt_embeds_npy
378
 
379
 
 
438
 
439
  # load unet model and vae model
440
  start = time.time()
441
+ vae_encoder = onnxruntime.InferenceSession(vae_encoder_model, providers=["CPUExecutionProvider"])
442
+ unet_session_main = onnxruntime.InferenceSession(unet_model, providers=["CPUExecutionProvider"])
443
+ vae_decoder = onnxruntime.InferenceSession(vae_decoder_model, providers=["CPUExecutionProvider"])
444
  print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
445
 
446
  # load time input file
run_txt2img_axe_infer.py CHANGED
@@ -5,7 +5,7 @@ import axengine
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
-
9
  import time
10
  import argparse
11
 
@@ -55,9 +55,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
55
 
56
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
57
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
58
- text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
59
- torch_dtype=torch.float32,
60
- variant="fp16")
61
  text_inputs = tokenizer(
62
  prompt,
63
  padding="max_length",
@@ -66,9 +64,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
66
  return_tensors="pt",
67
  )
68
  text_input_ids = text_inputs.input_ids
69
- prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
70
 
71
- prompt_embeds_npy = prompt_embeds[0].detach().numpy()
 
 
 
 
 
 
 
 
72
  return prompt_embeds_npy
73
 
74
 
 
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
+ import os
9
  import time
10
  import argparse
11
 
 
55
 
56
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
57
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
58
+
 
 
59
  text_inputs = tokenizer(
60
  prompt,
61
  padding="max_length",
 
64
  return_tensors="pt",
65
  )
66
  text_input_ids = text_inputs.input_ids
 
67
 
68
+ text_encoder = axengine.InferenceSession(
69
+ os.path.join(
70
+ text_encoder_dir,
71
+ "sd15_text_encoder_sim.axmodel"
72
+ ),
73
+ )
74
+ text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
75
+
76
+ prompt_embeds_npy = text_encoder_onnx_out
77
  return prompt_embeds_npy
78
 
79
 
run_txt2img_axe_infer_new.py CHANGED
@@ -57,9 +57,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
57
 
58
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
59
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
60
- text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
61
- torch_dtype=torch.float32,
62
- variant="fp16")
63
  text_inputs = tokenizer(
64
  prompt,
65
  padding="max_length",
@@ -68,9 +66,16 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
68
  return_tensors="pt",
69
  )
70
  text_input_ids = text_inputs.input_ids
71
- prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
72
 
73
- prompt_embeds_npy = prompt_embeds[0].detach().numpy()
 
 
 
 
 
 
 
 
74
  return prompt_embeds_npy
75
 
76
 
 
57
 
58
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
59
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
60
+
 
 
61
  text_inputs = tokenizer(
62
  prompt,
63
  padding="max_length",
 
66
  return_tensors="pt",
67
  )
68
  text_input_ids = text_inputs.input_ids
 
69
 
70
+ text_encoder = axengine.InferenceSession(
71
+ os.path.join(
72
+ text_encoder_dir,
73
+ "sd15_text_encoder_sim.axmodel"
74
+ ),
75
+ )
76
+ text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy().astype(np.int32)})[0]
77
+
78
+ prompt_embeds_npy = text_encoder_onnx_out
79
  return prompt_embeds_npy
80
 
81
 
run_txt2img_onnx_infer.py CHANGED
@@ -5,7 +5,7 @@ import onnxruntime
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
-
9
  import time
10
  import argparse
11
 
@@ -55,9 +55,7 @@ def _maybe_convert_prompt(prompt: str, tokenizer: "PreTrainedTokenizer"): # noq
55
 
56
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
57
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
58
- text_encoder = CLIPTextModel.from_pretrained(text_encoder_dir,
59
- torch_dtype=torch.float32,
60
- variant="fp16")
61
  text_inputs = tokenizer(
62
  prompt,
63
  padding="max_length",
@@ -66,9 +64,17 @@ def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/t
66
  return_tensors="pt",
67
  )
68
  text_input_ids = text_inputs.input_ids
69
- prompt_embeds = text_encoder(text_input_ids.to("cpu"), attention_mask=None)
70
 
71
- prompt_embeds_npy = prompt_embeds[0].detach().numpy()
 
 
 
 
 
 
 
 
 
72
  return prompt_embeds_npy
73
 
74
 
@@ -116,8 +122,8 @@ if __name__ == '__main__':
116
 
117
  # load unet model and vae model
118
  start = time.time()
119
- unet_session_main = onnxruntime.InferenceSession(unet_model)
120
- vae_decoder = onnxruntime.InferenceSession(vae_decoder_model)
121
  print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
122
 
123
  # load time input file
 
5
  import torch
6
  from PIL import Image
7
  from transformers import CLIPTokenizer, CLIPTextModel, PreTrainedTokenizer, CLIPTextModelWithProjection
8
+ import os
9
  import time
10
  import argparse
11
 
 
55
 
56
  def get_embeds(prompt = "Portrait of a pretty girl", tokenizer_dir = "./models/tokenizer", text_encoder_dir = "./models/text_encoder"):
57
  tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
58
+
 
 
59
  text_inputs = tokenizer(
60
  prompt,
61
  padding="max_length",
 
64
  return_tensors="pt",
65
  )
66
  text_input_ids = text_inputs.input_ids
 
67
 
68
+ text_encoder = onnxruntime.InferenceSession(
69
+ os.path.join(
70
+ text_encoder_dir,
71
+ "sd15_text_encoder_sim.onnx"
72
+ ),
73
+ providers=["CPUExecutionProvider"]
74
+ )
75
+ text_encoder_onnx_out = text_encoder.run(None, {"input_ids": text_input_ids.to("cpu").numpy()})[0]
76
+
77
+ prompt_embeds_npy = text_encoder_onnx_out
78
  return prompt_embeds_npy
79
 
80
 
 
122
 
123
  # load unet model and vae model
124
  start = time.time()
125
+ unet_session_main = onnxruntime.InferenceSession(unet_model, providers=["CPUExecutionProvider"])
126
+ vae_decoder = onnxruntime.InferenceSession(vae_decoder_model, providers=["CPUExecutionProvider"])
127
  print(f"load models take {(1000 * (time.time() - start)):.1f}ms")
128
 
129
  # load time input file