yamildiego committed on
Commit
1864f62
·
1 Parent(s): d26bf0e
Files changed (3) hide show
  1. 5_payload copy.json +9 -0
  2. handler.py +134 -176
  3. handler_old.py +317 -0
5_payload copy.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "face_image_path": "https://i.ibb.co/Px1WgFt/Whats-App-Image-2024-03-18-at-15-59-01.jpg",
3
+ "pose_image_path": "https://i.ibb.co/Px1WgFt/Whats-App-Image-2024-03-18-at-15-59-01.jpg",
4
+ "inputs": "a man",
5
+ "negative_prompt": "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy",
6
+ "guidance_scale": 5.0,
7
+ "num_inference_steps": 20,
8
+ "style_name": "Spring Festival"
9
+ }
handler.py CHANGED
@@ -1,81 +1,74 @@
1
  import cv2
 
 
2
  import numpy as np
3
 
 
 
 
 
4
  import diffusers
 
5
  from diffusers.models import ControlNetModel
6
  from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
7
- from diffusers.utils import load_image
8
 
9
- import torch
10
- import torch.nn.functional as F
11
- from torchvision.transforms import Compose
12
- from style_template import styles
13
-
14
- from PIL import Image
15
-
16
- from depth_anything.dpt import DepthAnything
17
- from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
18
 
19
  from insightface.app import FaceAnalysis
20
- from pipeline_stable_diffusion_xl_instantid_full import StableDiffusionXLInstantIDPipeline, draw_kps
21
- from controlnet_aux import OpenposeDetector
22
 
 
 
23
 
24
- STYLE_NAMES = list(styles.keys())
25
- DEFAULT_STYLE_NAME = "Mars"
26
 
27
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
28
- if device.type != 'cuda':
29
- raise ValueError("Se requiere ejecutar en GPU")
30
 
 
 
31
  dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
 
 
32
 
33
  class EndpointHandler():
34
  def __init__(self, model_dir):
35
 
36
- print("Loading FaceAnalysis", model_dir)
 
 
 
 
 
 
37
 
 
38
  # self.app = FaceAnalysis(
39
  # name="antelopev2",
40
- # root=f"./antelopev2",
41
  # providers=["CPUExecutionProvider"],
42
  # )
43
-
44
  self.app = FaceAnalysis(
45
- name="buffalo_l",
46
- root="./",
47
- providers=["CPUExecutionProvider"],
48
  )
49
 
50
  self.app.prepare(ctx_id=0, det_size=(640, 640))
51
-
52
  openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
53
- depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(device).eval()
54
-
55
- transform = Compose([
56
- Resize(
57
- width=518,
58
- height=518,
59
- resize_target=False,
60
- keep_aspect_ratio=True,
61
- ensure_multiple_of=14,
62
- resize_method='lower_bound',
63
- image_interpolation_method=cv2.INTER_CUBIC,
64
- ),
65
- NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
66
- PrepareForNet(),
67
- ])
68
-
69
- face_adapter = f"/repository/checkpoints/ip-adapter.bin"
70
- controlnet_path = f"/repository/checkpoints/ControlNetModel"
71
-
72
- self.controlnet_identitynet = ControlNetModel.from_pretrained(
73
  controlnet_path, torch_dtype=dtype
74
  )
75
-
 
76
  controlnet_pose_model = "thibaud/controlnet-openpose-sdxl-1.0"
77
  controlnet_canny_model = "diffusers/controlnet-canny-sdxl-1.0"
78
- controlnet_depth_model = "diffusers/controlnet-depth-sdxl-1.0-small"
79
 
80
  controlnet_pose = ControlNetModel.from_pretrained(
81
  controlnet_pose_model, torch_dtype=dtype
@@ -83,58 +76,32 @@ class EndpointHandler():
83
  controlnet_canny = ControlNetModel.from_pretrained(
84
  controlnet_canny_model, torch_dtype=dtype
85
  ).to(device)
86
- controlnet_depth = ControlNetModel.from_pretrained(
87
- controlnet_depth_model, torch_dtype=dtype
88
- ).to(device)
89
-
90
- def get_depth_map(image):
91
-
92
- image = np.array(image) / 255.0
93
-
94
- h, w = image.shape[:2]
95
-
96
- image = transform({'image': image})['image']
97
- image = torch.from_numpy(image).unsqueeze(0).to("cuda")
98
-
99
- with torch.no_grad():
100
- depth = depth_anything(image)
101
-
102
- depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
103
- depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
104
 
105
- depth = depth.cpu().numpy().astype(np.uint8)
106
-
107
- depth_image = Image.fromarray(depth)
108
-
109
- return depth_image
110
-
111
  def get_canny_image(image, t1=100, t2=200):
112
  image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
113
  edges = cv2.Canny(image, t1, t2)
114
  return Image.fromarray(edges, "L")
115
-
116
  self.controlnet_map = {
117
  "pose": controlnet_pose,
118
- "canny": controlnet_canny,
119
- "depth": controlnet_depth,
120
  }
121
 
122
  self.controlnet_map_fn = {
123
  "pose": openpose,
124
- "canny": get_canny_image,
125
- "depth": get_depth_map,
126
  }
127
 
128
  pretrained_model_name_or_path = "wangqixun/YamerMIX_v8"
129
 
130
  self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
131
- pretrained_model_name_or_path,
132
- controlnet=[self.controlnet_identitynet],
133
- torch_dtype=dtype,
134
- safety_checker=None,
135
- feature_extractor=None,
136
  ).to(device)
137
-
138
  self.pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(
139
  self.pipe.scheduler.config
140
  )
@@ -142,62 +109,14 @@ class EndpointHandler():
142
  # load and disable LCM
143
  self.pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
144
  self.pipe.disable_lora()
145
-
146
  self.pipe.cuda()
147
  self.pipe.load_ip_adapter_instantid(face_adapter)
148
  self.pipe.image_proj_model.to("cuda")
149
  self.pipe.unet.to("cuda")
150
 
151
- # if we need more parameters
152
- scheduler_class_name = "EulerDiscreteScheduler"
153
- add_kwargs = {}
154
- scheduler = getattr(diffusers, scheduler_class_name)
155
- self.pipe.scheduler = scheduler.from_config(self.pipe.scheduler.config, **add_kwargs)
156
-
157
- identitynet_strength_ratio = 0.8
158
-
159
- pose_strength = 0.5
160
- canny_strength = 0.3
161
- depth_strength = 0.5
162
-
163
- self.my_controlnet_selection = ["pose", "canny"]
164
-
165
- controlnet_scales = {
166
- "pose": pose_strength,
167
- "canny": canny_strength,
168
- "depth": depth_strength,
169
- }
170
-
171
- self.pipe.controlnet = MultiControlNetModel(
172
- [self.controlnet_identitynet]
173
- + [self.controlnet_map[s] for s in self.my_controlnet_selection]
174
- )
175
- self.control_scales = [float(identitynet_strength_ratio)] + [
176
- controlnet_scales[s] for s in self.my_controlnet_selection
177
- ]
178
-
179
  def __call__(self, data):
180
 
181
-
182
- def apply_style(style_name: str, positive: str) -> str:
183
- p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
184
- return p.replace("{prompt}", positive)
185
-
186
- default_negative_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy"
187
-
188
- # hyperparamters
189
- face_image_path = data.pop("face_image_path", "https://i.ibb.co/GQzm527/examples-musk-resize.jpg")
190
- pose_image_path = data.pop("pose_image_path", "https://i.ibb.co/TRCK4MS/examples-poses-pose2.jpg")
191
- prompt_input = data.pop("inputs", "a man flying in the sky in Mars")
192
- num_inference_steps = data.pop("num_inference_steps", 20)
193
- guidance_scale = data.pop("guidance_scale", 5.0)
194
- negative_prompt = data.pop("negative_prompt", default_negative_prompt)
195
- style_name = data.pop("style_name", DEFAULT_STYLE_NAME)
196
-
197
- prompt = apply_style(style_name, prompt_input)
198
-
199
- adapter_strength_ratio = 0.8
200
-
201
  def convert_from_cv2_to_image(img: np.ndarray) -> Image:
202
  return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
203
 
@@ -210,38 +129,67 @@ class EndpointHandler():
210
  min_side=1024,
211
  size=None,
212
  pad_to_max_side=False,
213
- mode=Image.BILINEAR,
214
  base_pixel_number=64,
215
  ):
 
216
  if size is not None:
217
  w_resize_new, h_resize_new = size
218
  else:
219
- w, h = input_image.size
220
- # Calcular el redimensionamiento con un solo paso
221
- ratio_min = min_side / min(w, h)
222
- w_min, h_min = round(ratio_min * w), round(ratio_min * h)
223
- ratio_max = max_side / max(w_min, h_min)
224
- # Aplicar la menor de las dos ratios para asegurar que cumple ambas condiciones
225
- final_ratio = min(ratio_min, ratio_max)
226
- w_final, h_final = round(final_ratio * w), round(final_ratio * h)
227
-
228
- # Ajustar al número base de píxeles más cercano
229
- w_resize_new = (w_final // base_pixel_number) * base_pixel_number
230
- h_resize_new = (h_final // base_pixel_number) * base_pixel_number
231
-
232
- # Redimensionar una sola vez
233
  input_image = input_image.resize([w_resize_new, h_resize_new], mode)
234
 
235
  if pad_to_max_side:
236
- # Optimizar la creación del fondo
237
- res = Image.new("RGB", (max_side, max_side), (255, 255, 255))
238
  offset_x = (max_side - w_resize_new) // 2
239
  offset_y = (max_side - h_resize_new) // 2
240
- res.paste(input_image, (offset_x, offset_y))
241
- return res
242
-
 
243
  return input_image
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  face_image = load_image(face_image_path)
246
  face_image = resize_img(face_image, max_side=1024)
247
  face_image_cv2 = convert_from_image_to_cv2(face_image)
@@ -249,7 +197,7 @@ class EndpointHandler():
249
 
250
  # Extract face features
251
  face_info = self.app.get(face_image_cv2)
252
-
253
  # if len(face_info) == 0:
254
  # raise gr.Error(
255
  # f"Unable to detect a face in the image. Please upload a different photo with a clear face."
@@ -260,44 +208,54 @@ class EndpointHandler():
260
  key=lambda x: (x["bbox"][2] - x["bbox"][0]) * x["bbox"][3] - x["bbox"][1],
261
  )[
262
  -1
263
- ]
264
-
265
  face_emb = face_info["embedding"]
266
  face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])
267
  img_controlnet = face_image
268
-
269
- pose_image = load_image(pose_image_path)
270
- pose_image = resize_img(pose_image, max_side=1024)
271
- img_controlnet = pose_image
272
- pose_image_cv2 = convert_from_image_to_cv2(pose_image)
273
 
274
- face_info = self.app.get(pose_image_cv2)
275
 
276
- # get error if no face is detected
277
- # if len(face_info) == 0:
278
- # raise gr.Error(
279
- # f"Cannot find any face in the reference image! Please upload another person image"
280
- # )
281
 
282
- face_info = face_info[-1]
283
- face_kps = draw_kps(pose_image, face_info["kps"])
284
 
285
- width, height = face_kps.size
286
 
287
  control_mask = np.zeros([height, width, 3])
288
  x1, y1, x2, y2 = face_info["bbox"]
289
  x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
290
  control_mask[y1:y2, x1:x2] = 255
291
  control_mask = Image.fromarray(control_mask.astype(np.uint8))
292
-
 
 
 
 
 
 
 
 
 
 
 
293
  control_images = [face_kps] + [
294
  self.controlnet_map_fn[s](img_controlnet).resize((width, height))
295
- for s in self.my_controlnet_selection
296
  ]
297
 
298
- print("Start inference...")
299
 
300
- self.generator = torch.Generator(device=device).manual_seed(42)
 
301
 
302
  self.pipe.set_ip_adapter_scale(adapter_strength_ratio)
303
  images = self.pipe(
@@ -306,12 +264,12 @@ class EndpointHandler():
306
  image_embeds=face_emb,
307
  image=control_images,
308
  control_mask=control_mask,
309
- controlnet_conditioning_scale=self.control_scales,
310
- num_inference_steps=num_inference_steps,
311
  guidance_scale=guidance_scale,
312
  height=height,
313
  width=width,
314
- generator=self.generator,
315
  ).images
316
-
317
- return images[0]
 
1
  import cv2
2
+ import torch
3
+ import random
4
  import numpy as np
5
 
6
+ import PIL
7
+ from PIL import Image
8
+ from typing import Tuple
9
+
10
  import diffusers
11
+ from diffusers.utils import load_image
12
  from diffusers.models import ControlNetModel
13
  from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
 
14
 
15
+ from huggingface_hub import hf_hub_download
 
 
 
 
 
 
 
 
16
 
17
  from insightface.app import FaceAnalysis
 
 
18
 
19
+ from style_template import styles
20
+ from pipeline_stable_diffusion_xl_instantid_full import StableDiffusionXLInstantIDPipeline, draw_kps
21
 
22
+ from controlnet_aux import OpenposeDetector
 
23
 
24
+ import torch.nn.functional as F
25
+ from torchvision.transforms import Compose
 
26
 
27
+ # global variable
28
+ device = "cuda" if torch.cuda.is_available() else "cpu"
29
  dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
30
+ STYLE_NAMES = list(styles.keys())
31
+ DEFAULT_STYLE_NAME = "Spring Festival"
32
 
33
  class EndpointHandler():
34
  def __init__(self, model_dir):
35
 
36
+ hf_hub_download(repo_id="InstantX/InstantID", filename="ControlNetModel/config.json", local_dir="./checkpoints")
37
+ hf_hub_download(
38
+ repo_id="InstantX/InstantID",
39
+ filename="ControlNetModel/diffusion_pytorch_model.safetensors",
40
+ local_dir="./checkpoints",
41
+ )
42
+ hf_hub_download(repo_id="InstantX/InstantID", filename="ip-adapter.bin", local_dir="./checkpoints")
43
 
44
+ # Load face encoder
45
  # self.app = FaceAnalysis(
46
  # name="antelopev2",
47
+ # root="./",
48
  # providers=["CPUExecutionProvider"],
49
  # )
 
50
  self.app = FaceAnalysis(
51
+ name="buffalo_l",
52
+ root="./",
53
+ providers=["CPUExecutionProvider"],
54
  )
55
 
56
  self.app.prepare(ctx_id=0, det_size=(640, 640))
57
+
58
  openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
59
+
60
+ # Path to InstantID models
61
+ face_adapter = f"./checkpoints/ip-adapter.bin"
62
+ controlnet_path = f"./checkpoints/ControlNetModel"
63
+
64
+ # Load pipeline face ControlNetModel
65
+ controlnet_identitynet = ControlNetModel.from_pretrained(
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  controlnet_path, torch_dtype=dtype
67
  )
68
+
69
+ # controlnet-pose
70
  controlnet_pose_model = "thibaud/controlnet-openpose-sdxl-1.0"
71
  controlnet_canny_model = "diffusers/controlnet-canny-sdxl-1.0"
 
72
 
73
  controlnet_pose = ControlNetModel.from_pretrained(
74
  controlnet_pose_model, torch_dtype=dtype
 
76
  controlnet_canny = ControlNetModel.from_pretrained(
77
  controlnet_canny_model, torch_dtype=dtype
78
  ).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
 
 
 
 
 
 
80
  def get_canny_image(image, t1=100, t2=200):
81
  image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
82
  edges = cv2.Canny(image, t1, t2)
83
  return Image.fromarray(edges, "L")
84
+
85
  self.controlnet_map = {
86
  "pose": controlnet_pose,
87
+ "canny": controlnet_canny
 
88
  }
89
 
90
  self.controlnet_map_fn = {
91
  "pose": openpose,
92
+ "canny": get_canny_image
 
93
  }
94
 
95
  pretrained_model_name_or_path = "wangqixun/YamerMIX_v8"
96
 
97
  self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
98
+ pretrained_model_name_or_path,
99
+ controlnet=[controlnet_identitynet],
100
+ torch_dtype=dtype,
101
+ safety_checker=None,
102
+ feature_extractor=None,
103
  ).to(device)
104
+
105
  self.pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(
106
  self.pipe.scheduler.config
107
  )
 
109
  # load and disable LCM
110
  self.pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
111
  self.pipe.disable_lora()
112
+
113
  self.pipe.cuda()
114
  self.pipe.load_ip_adapter_instantid(face_adapter)
115
  self.pipe.image_proj_model.to("cuda")
116
  self.pipe.unet.to("cuda")
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def __call__(self, data):
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def convert_from_cv2_to_image(img: np.ndarray) -> Image:
121
  return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
122
 
 
129
  min_side=1024,
130
  size=None,
131
  pad_to_max_side=False,
132
+ mode=PIL.Image.BILINEAR,
133
  base_pixel_number=64,
134
  ):
135
+ w, h = input_image.size
136
  if size is not None:
137
  w_resize_new, h_resize_new = size
138
  else:
139
+ ratio = min_side / min(h, w)
140
+ w, h = round(ratio * w), round(ratio * h)
141
+ ratio = max_side / max(h, w)
142
+ input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
143
+ w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
144
+ h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
 
 
 
 
 
 
 
 
145
  input_image = input_image.resize([w_resize_new, h_resize_new], mode)
146
 
147
  if pad_to_max_side:
148
+ res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
 
149
  offset_x = (max_side - w_resize_new) // 2
150
  offset_y = (max_side - h_resize_new) // 2
151
+ res[
152
+ offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
153
+ ] = np.array(input_image)
154
+ input_image = Image.fromarray(res)
155
  return input_image
156
 
157
+ def apply_style(
158
+ style_name: str, positive: str, negative: str = ""
159
+ ) -> Tuple[str, str]:
160
+ p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
161
+ return p.replace("{prompt}", positive), n + " " + negative
162
+
163
+
164
+
165
+ face_image_path = data.pop("face_image_path", "https://i.ibb.co/GQzm527/examples-musk-resize.jpg")
166
+ pose_image_path = data.pop("pose_image_path", "https://i.ibb.co/TRCK4MS/examples-poses-pose2.jpg")
167
+ style_name = data.pop("style_name", DEFAULT_STYLE_NAME)
168
+ prompt = data.pop("inputs", "a man flying in the sky in Mars")
169
+
170
+ identitynet_strength_ratio = 0.8
171
+ adapter_strength_ratio = 0.8
172
+ pose_strength = 0.5
173
+ canny_strength = 0.3
174
+ num_steps = 20
175
+ guidance_scale = 5.0
176
+ controlnet_selection = ["pose", "canny"]
177
+ scheduler = "EulerDiscreteScheduler"
178
+
179
+ self.pipe.disable_lora()
180
+ scheduler_class_name = scheduler.split("-")[0]
181
+
182
+ add_kwargs = {}
183
+ if len(scheduler.split("-")) > 1:
184
+ add_kwargs["use_karras_sigmas"] = True
185
+ if len(scheduler.split("-")) > 2:
186
+ add_kwargs["algorithm_type"] = "sde-dpmsolver++"
187
+ scheduler = getattr(diffusers, scheduler_class_name)
188
+ self.pipe.scheduler = scheduler.from_config(self.pipe.scheduler.config, **add_kwargs)
189
+
190
+ # apply the style template
191
+ prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
192
+
193
  face_image = load_image(face_image_path)
194
  face_image = resize_img(face_image, max_side=1024)
195
  face_image_cv2 = convert_from_image_to_cv2(face_image)
 
197
 
198
  # Extract face features
199
  face_info = self.app.get(face_image_cv2)
200
+
201
  # if len(face_info) == 0:
202
  # raise gr.Error(
203
  # f"Unable to detect a face in the image. Please upload a different photo with a clear face."
 
208
  key=lambda x: (x["bbox"][2] - x["bbox"][0]) * x["bbox"][3] - x["bbox"][1],
209
  )[
210
  -1
211
+ ] # only use the maximum face
 
212
  face_emb = face_info["embedding"]
213
  face_kps = draw_kps(convert_from_cv2_to_image(face_image_cv2), face_info["kps"])
214
  img_controlnet = face_image
215
+ if pose_image_path is not None:
216
+ pose_image = load_image(pose_image_path)
217
+ pose_image = resize_img(pose_image, max_side=1024)
218
+ img_controlnet = pose_image
219
+ pose_image_cv2 = convert_from_image_to_cv2(pose_image)
220
 
221
+ face_info = self.app.get(pose_image_cv2)
222
 
223
+ # if len(face_info) == 0:
224
+ # raise gr.Error(
225
+ # f"Cannot find any face in the reference image! Please upload another person image"
226
+ # )
 
227
 
228
+ face_info = face_info[-1]
229
+ face_kps = draw_kps(pose_image, face_info["kps"])
230
 
231
+ width, height = face_kps.size
232
 
233
  control_mask = np.zeros([height, width, 3])
234
  x1, y1, x2, y2 = face_info["bbox"]
235
  x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
236
  control_mask[y1:y2, x1:x2] = 255
237
  control_mask = Image.fromarray(control_mask.astype(np.uint8))
238
+
239
+ controlnet_scales = {
240
+ "pose": pose_strength,
241
+ "canny": canny_strength
242
+ }
243
+ self.pipe.controlnet = MultiControlNetModel(
244
+ [self.controlnet_identitynet]
245
+ + [self.controlnet_map[s] for s in controlnet_selection]
246
+ )
247
+ control_scales = [float(identitynet_strength_ratio)] + [
248
+ controlnet_scales[s] for s in controlnet_selection
249
+ ]
250
  control_images = [face_kps] + [
251
  self.controlnet_map_fn[s](img_controlnet).resize((width, height))
252
+ for s in controlnet_selection
253
  ]
254
 
255
+ generator = torch.Generator(device=device).manual_seed(42)
256
 
257
+ print("Start inference...")
258
+ print(f"[Debug] Prompt: {prompt}, \n[Debug] Neg Prompt: {negative_prompt}")
259
 
260
  self.pipe.set_ip_adapter_scale(adapter_strength_ratio)
261
  images = self.pipe(
 
264
  image_embeds=face_emb,
265
  image=control_images,
266
  control_mask=control_mask,
267
+ controlnet_conditioning_scale=control_scales,
268
+ num_inference_steps=num_steps,
269
  guidance_scale=guidance_scale,
270
  height=height,
271
  width=width,
272
+ generator=generator,
273
  ).images
274
+
275
+ return images[0]
handler_old.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
+ import diffusers
5
+ from diffusers.models import ControlNetModel
6
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
7
+ from diffusers.utils import load_image
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torchvision.transforms import Compose
12
+ from style_template import styles
13
+
14
+ from PIL import Image
15
+
16
+ from depth_anything.dpt import DepthAnything
17
+ from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
18
+
19
+ from insightface.app import FaceAnalysis
20
+ from pipeline_stable_diffusion_xl_instantid_full import StableDiffusionXLInstantIDPipeline, draw_kps
21
+ from controlnet_aux import OpenposeDetector
22
+
23
+
24
# Names of every style template exposed by style_template.styles.
STYLE_NAMES = list(styles)
# Style used when a request names no style, or one that is not in `styles`.
DEFAULT_STYLE_NAME = "Mars"

# The handler refuses to start without a CUDA device: several calls further
# down move models and tensors to "cuda" unconditionally.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != "cuda":
    raise ValueError("Se requiere ejecutar en GPU")

# Half precision on GPU, full precision otherwise. The CPU branch cannot be
# reached after the guard above; it is kept so the expression stays valid if
# the guard is ever relaxed.
dtype = torch.float16 if "cuda" in str(device) else torch.float32
32
+
33
class EndpointHandler():
    """Endpoint handler that generates one image with the InstantID SDXL pipeline.

    ``__init__`` loads the face analyser, the auxiliary ControlNets and the
    diffusion pipeline once. ``__call__`` runs a single generation for one
    request dictionary.
    """

    @staticmethod
    def _bbox_area(face):
        """Return the area of ``face["bbox"]``, read as ``(x1, y1, x2, y2)``."""
        x1, y1, x2, y2 = face["bbox"]
        return (x2 - x1) * (y2 - y1)

    @staticmethod
    def _fit_dimensions(width, height, max_side, min_side, base_pixel_number):
        """Compute the target ``(width, height)`` used by ``_resize_img``.

        The size is first scaled so the shorter side equals ``min_side``, then
        scaled again so the longer side equals ``max_side``. Each result is
        finally rounded down to a multiple of ``base_pixel_number``, but never
        below one multiple, so the target is never zero-sized.
        """
        ratio = min_side / min(width, height)
        width, height = round(ratio * width), round(ratio * height)
        # The second ratio is relative to the already-scaled size, so it must
        # be applied to that size and not to the original dimensions.
        ratio = max_side / max(width, height)
        width, height = round(ratio * width), round(ratio * height)

        snapped_width = max(
            base_pixel_number, (width // base_pixel_number) * base_pixel_number
        )
        snapped_height = max(
            base_pixel_number, (height // base_pixel_number) * base_pixel_number
        )
        return snapped_width, snapped_height

    @staticmethod
    def _resize_img(
        input_image,
        max_side=1280,
        min_side=1024,
        size=None,
        pad_to_max_side=False,
        mode=None,
        base_pixel_number=64,
    ):
        """Resize a PIL image for the pipeline, optionally padding to a square.

        Args:
            input_image: PIL image to resize.
            max_side: Target length of the longer side.
            min_side: Target length of the shorter side before the
                ``max_side`` constraint is applied.
            size: Explicit ``(width, height)``; when given, no size is computed.
            pad_to_max_side: If true, centre the result on a white
                ``max_side`` x ``max_side`` canvas.
            mode: Resampling filter; defaults to ``Image.BILINEAR``.
            base_pixel_number: Computed sizes are multiples of this value.

        Returns:
            The resized (and possibly padded) PIL image.
        """
        if mode is None:
            mode = Image.BILINEAR

        if size is not None:
            new_width, new_height = size
        else:
            current_width, current_height = input_image.size
            new_width, new_height = EndpointHandler._fit_dimensions(
                current_width, current_height, max_side, min_side, base_pixel_number
            )

        # Single resize straight to the final size.
        input_image = input_image.resize([new_width, new_height], mode)

        if pad_to_max_side:
            canvas = Image.new("RGB", (max_side, max_side), (255, 255, 255))
            offset_x = (max_side - new_width) // 2
            offset_y = (max_side - new_height) // 2
            canvas.paste(input_image, (offset_x, offset_y))
            return canvas

        return input_image

    @staticmethod
    def _cv2_to_image(img):
        """Convert a BGR array (OpenCV layout) to an RGB PIL image."""
        return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    @staticmethod
    def _image_to_cv2(img):
        """Convert an RGB PIL image to a BGR array (OpenCV layout)."""
        return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    @staticmethod
    def _apply_style(style_name, positive):
        """Insert ``positive`` into the prompt template of ``style_name``.

        Unknown style names fall back to ``DEFAULT_STYLE_NAME``. The negative
        half of the template is not used here.
        """
        template, _ = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
        return template.replace("{prompt}", positive)

    def __init__(self, model_dir):
        """Load every model the handler needs.

        Args:
            model_dir: Only printed in the start-up message; model locations
                below are fixed paths and hub identifiers.
        """
        print("Loading FaceAnalysis", model_dir)

        # NOTE: an earlier variant used the "antelopev2" model pack
        # (root "./antelopev2") instead of "buffalo_l".
        self.app = FaceAnalysis(
            name="buffalo_l",
            root="./",
            providers=["CPUExecutionProvider"],
        )
        self.app.prepare(ctx_id=0, det_size=(640, 640))

        openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
        depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14').to(device).eval()

        transform = Compose([
            Resize(
                width=518,
                height=518,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        face_adapter = "/repository/checkpoints/ip-adapter.bin"
        controlnet_path = "/repository/checkpoints/ControlNetModel"

        self.controlnet_identitynet = ControlNetModel.from_pretrained(
            controlnet_path, torch_dtype=dtype
        )

        controlnet_pose_model = "thibaud/controlnet-openpose-sdxl-1.0"
        controlnet_canny_model = "diffusers/controlnet-canny-sdxl-1.0"
        controlnet_depth_model = "diffusers/controlnet-depth-sdxl-1.0-small"

        controlnet_pose = ControlNetModel.from_pretrained(
            controlnet_pose_model, torch_dtype=dtype
        ).to(device)
        controlnet_canny = ControlNetModel.from_pretrained(
            controlnet_canny_model, torch_dtype=dtype
        ).to(device)
        controlnet_depth = ControlNetModel.from_pretrained(
            controlnet_depth_model, torch_dtype=dtype
        ).to(device)

        def get_depth_map(image):
            """Return a greyscale PIL depth map with the same size as ``image``."""
            image = np.array(image) / 255.0
            h, w = image.shape[:2]

            image = transform({'image': image})['image']
            image = torch.from_numpy(image).unsqueeze(0).to(device)

            with torch.no_grad():
                depth = depth_anything(image)

            depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
            # Guard the normalisation against a constant depth map, which
            # would otherwise divide by zero.
            depth_range = (depth.max() - depth.min()).clamp(
                min=torch.finfo(depth.dtype).eps
            )
            depth = (depth - depth.min()) / depth_range * 255.0

            return Image.fromarray(depth.cpu().numpy().astype(np.uint8))

        def get_canny_image(image, t1=100, t2=200):
            """Return the Canny edge map of ``image`` as a mode "L" PIL image."""
            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            edges = cv2.Canny(image, t1, t2)
            return Image.fromarray(edges, "L")

        # Both maps share the same keys: the ControlNet model, and the function
        # that turns a reference image into that ControlNet's conditioning image.
        self.controlnet_map = {
            "pose": controlnet_pose,
            "canny": controlnet_canny,
            "depth": controlnet_depth,
        }
        self.controlnet_map_fn = {
            "pose": openpose,
            "canny": get_canny_image,
            "depth": get_depth_map,
        }

        pretrained_model_name_or_path = "wangqixun/YamerMIX_v8"

        self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
            pretrained_model_name_or_path,
            controlnet=[self.controlnet_identitynet],
            torch_dtype=dtype,
            safety_checker=None,
            feature_extractor=None,
        ).to(device)

        # Configured once; the original repeated this same call further down
        # with identical arguments.
        self.pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(
            self.pipe.scheduler.config
        )

        # load and disable LCM
        self.pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
        self.pipe.disable_lora()

        self.pipe.cuda()
        self.pipe.load_ip_adapter_instantid(face_adapter)
        self.pipe.image_proj_model.to("cuda")
        self.pipe.unet.to("cuda")

        identitynet_strength_ratio = 0.8
        controlnet_scales = {
            "pose": 0.5,
            "canny": 0.3,
            "depth": 0.5,
        }

        # Auxiliary ControlNets applied on top of the identity ControlNet.
        self.my_controlnet_selection = ["pose", "canny"]

        self.pipe.controlnet = MultiControlNetModel(
            [self.controlnet_identitynet]
            + [self.controlnet_map[s] for s in self.my_controlnet_selection]
        )
        # Same order as the ControlNet list above: identity first.
        self.control_scales = [float(identitynet_strength_ratio)] + [
            controlnet_scales[s] for s in self.my_controlnet_selection
        ]

    def __call__(self, data):
        """Generate one image for a request.

        Args:
            data: Request dictionary. The keys below are popped from it, each
                with a default: ``face_image_path``, ``pose_image_path``,
                ``inputs`` (the prompt), ``num_inference_steps``,
                ``guidance_scale``, ``negative_prompt`` and ``style_name``.
                Passing ``pose_image_path=None`` uses the face image as the
                pose/edge reference as well.

        Returns:
            The first image produced by the pipeline.

        Raises:
            ValueError: If no face is detected in the face image, or in the
                pose image when one is used.
        """
        default_negative_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy"

        # hyperparameters
        face_image_path = data.pop("face_image_path", "https://i.ibb.co/GQzm527/examples-musk-resize.jpg")
        pose_image_path = data.pop("pose_image_path", "https://i.ibb.co/TRCK4MS/examples-poses-pose2.jpg")
        prompt_input = data.pop("inputs", "a man flying in the sky in Mars")
        num_inference_steps = data.pop("num_inference_steps", 20)
        guidance_scale = data.pop("guidance_scale", 5.0)
        negative_prompt = data.pop("negative_prompt", default_negative_prompt)
        style_name = data.pop("style_name", DEFAULT_STYLE_NAME)

        prompt = self._apply_style(style_name, prompt_input)

        adapter_strength_ratio = 0.8

        face_image = load_image(face_image_path)
        face_image = self._resize_img(face_image, max_side=1024)
        face_image_cv2 = self._image_to_cv2(face_image)

        # Extract face features
        detected_faces = self.app.get(face_image_cv2)
        if len(detected_faces) == 0:
            raise ValueError(
                "Unable to detect a face in the image. Please upload a different photo with a clear face."
            )

        # Only the largest face supplies the identity embedding.
        face_info = max(detected_faces, key=self._bbox_area)
        face_emb = face_info["embedding"]
        face_kps = draw_kps(self._cv2_to_image(face_image_cv2), face_info["kps"])
        img_controlnet = face_image

        if pose_image_path is not None:
            pose_image = load_image(pose_image_path)
            pose_image = self._resize_img(pose_image, max_side=1024)
            img_controlnet = pose_image
            pose_image_cv2 = self._image_to_cv2(pose_image)

            pose_faces = self.app.get(pose_image_cv2)
            if len(pose_faces) == 0:
                raise ValueError(
                    "Cannot find any face in the reference image! Please upload another person image"
                )

            # The keypoints and mask follow the pose image's face; the
            # embedding above still comes from the face image.
            face_info = pose_faces[-1]
            face_kps = draw_kps(pose_image, face_info["kps"])

        width, height = face_kps.size

        # The mask is 255 inside the selected face's bounding box and 0 elsewhere.
        x1, y1, x2, y2 = (int(v) for v in face_info["bbox"])
        control_mask = np.zeros((height, width, 3), dtype=np.uint8)
        control_mask[y1:y2, x1:x2] = 255
        control_mask = Image.fromarray(control_mask)

        # Same order as self.control_scales: identity keypoints first.
        control_images = [face_kps] + [
            self.controlnet_map_fn[s](img_controlnet).resize((width, height))
            for s in self.my_controlnet_selection
        ]

        print("Start inference...")

        # Fixed seed: every call starts from the same generator state.
        self.generator = torch.Generator(device=device).manual_seed(42)

        self.pipe.set_ip_adapter_scale(adapter_strength_ratio)
        images = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image_embeds=face_emb,
            image=control_images,
            control_mask=control_mask,
            controlnet_conditioning_scale=self.control_scales,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            height=height,
            width=width,
            generator=self.generator,
        ).images

        return images[0]