yamildiego committed on
Commit
211dca2
·
1 Parent(s): 1c76633
Files changed (4) hide show
  1. handler.py +298 -30
  2. kaifu_resize.png +3 -0
  3. pose.jpg +0 -0
  4. requirements.txt +17 -6
handler.py CHANGED
@@ -1,49 +1,317 @@
1
- from typing import List, Any
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import torch
3
- from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- # Configurar el dispositivo para ejecutar el modelo
6
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
7
  if device.type != 'cuda':
8
  raise ValueError("Se requiere ejecutar en GPU")
9
 
10
- # Configurar el tipo de dato mixto basado en la capacidad de la GPU
11
- dtype = torch.bfloat16 if torch.cuda.get_device_capability(device.index)[0] >= 8 else torch.float16
12
 
13
  class EndpointHandler():
14
  def __init__(self):
15
- # Inicializar aquí si es necesario
16
- pass
17
 
18
- def __call__(self, data: Any) -> List[Any]:
19
- # Configurar el número de imágenes por prompt
20
- num_images_per_prompt = 1
 
 
 
 
 
 
 
 
 
 
21
 
22
- # Cargar los modelos con el tipo de dato y dispositivo correctos
23
- prior = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", torch_dtype=dtype).to(device)
24
- decoder = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", torch_dtype=dtype).to(device)
25
 
26
- prompt = data.get("inputs", "Una imagen interesante") # Asegúrate de pasar un prompt adecuado
27
- negative_prompt = data.get("negative_prompt", "")
28
 
29
- prior_output = prior(
30
- prompt=prompt,
31
- height=512,
32
- width=512,
33
- negative_prompt=negative_prompt,
34
- guidance_scale=7.5,
35
- num_inference_steps=50,
36
- num_images_per_prompt=num_images_per_prompt,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- decoder_output = decoder(
40
- image_embeddings=prior_output["image_embeddings"].half(),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  prompt=prompt,
42
  negative_prompt=negative_prompt,
 
 
 
 
 
43
  guidance_scale=7.5,
44
- output_type="pil",
45
- num_inference_steps=20
46
- )
 
47
 
48
- # Asumiendo que quieres retornar la primera imagen
49
- return [decoder_output.images[0]]
 
1
+ # from typing import List, Any
2
+ # import torch
3
+ # from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline
4
+
5
+ # # Configurar el dispositivo para ejecutar el modelo
6
+ # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
7
+ # if device.type != 'cuda':
8
+ # raise ValueError("Se requiere ejecutar en GPU")
9
+
10
+ # # Configurar el tipo de dato mixto basado en la capacidad de la GPU
11
+ # dtype = torch.bfloat16 if torch.cuda.get_device_capability(device.index)[0] >= 8 else torch.float16
12
+
13
+ import cv2
14
+ import numpy as np
15
+ import PIL
16
+ from PIL import Image
17
+ import diffusers
18
+ from diffusers.models import ControlNetModel
19
+ from depth_anything.dpt import DepthAnything
20
+ from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
21
+ from diffusers.utils import load_image
22
+ from insightface.app import FaceAnalysis
23
+
24
  import torch
25
+ from pipeline_stable_diffusion_xl_instantid_full import StableDiffusionXLInstantIDPipeline, draw_kps
26
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
27
+
28
+ from controlnet_aux import OpenposeDetector
29
+ from depth_anything.dpt import DepthAnything
30
+
31
+ import torch.nn.functional as F
32
+ from torchvision.transforms import Compose
33
+
34
+ from huggingface_hub import hf_hub_download
35
+
36
# Download the InstantID files this module loads from ./checkpoints:
# the ControlNetModel config + weights and the ip-adapter.bin weights.
for _filename in (
    "ControlNetModel/config.json",
    "ControlNetModel/diffusion_pytorch_model.safetensors",
    "ip-adapter.bin",
):
    hf_hub_download(
        repo_id="InstantX/InstantID",
        filename=_filename,
        local_dir="./checkpoints",
    )

# The handler refuses to start without CUDA (message: "GPU execution is required").
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    raise ValueError("Se requiere ejecutar en GPU")

# Half precision on CUDA; the float32 branch is only reachable if the guard
# above is ever relaxed.
dtype = torch.float16 if device.type == "cuda" else torch.float32
 
49
 
50
class EndpointHandler():
    """InstantID (SDXL) image generation handler with pose/canny/depth ControlNets.

    ``__init__`` loads every model once; ``__call__`` runs one generation using
    the bundled face image (``./kaifu_resize.png``) and pose reference image
    (``./pose.jpg``).

    Relies on the module-level ``device`` and ``dtype`` globals.
    """

    def __init__(self, path=""):
        """Load the pipeline, ControlNets, preprocessors and the face analyser.

        Args:
            path: Accepted so the class can be instantiated with a model
                directory argument; it is not used, all assets are loaded
                from ``./checkpoints`` or from the Hub.
        """
        face_adapter = "./checkpoints/ip-adapter.bin"
        controlnet_path = "./checkpoints/ControlNetModel"

        # Preprocessing applied to an image before it is fed to DepthAnything.
        self._depth_transform = Compose([
            Resize(
                width=518,
                height=518,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method='lower_bound',
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])

        self.controlnet_identitynet = ControlNetModel.from_pretrained(
            controlnet_path, torch_dtype=dtype
        )

        pretrained_model_name_or_path = "wangqixun/YamerMIX_v8"

        self.pipe = StableDiffusionXLInstantIDPipeline.from_pretrained(
            pretrained_model_name_or_path,
            controlnet=[self.controlnet_identitynet],
            torch_dtype=dtype,
            safety_checker=None,
            feature_extractor=None,
        ).to(device)

        self.pipe.scheduler = diffusers.EulerDiscreteScheduler.from_config(
            self.pipe.scheduler.config
        )

        # Load the LCM LoRA but keep it disabled until __call__ enables it.
        self.pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
        self.pipe.disable_lora()

        self.pipe.cuda()
        self.pipe.load_ip_adapter_instantid(face_adapter)
        self.pipe.image_proj_model.to("cuda")
        self.pipe.unet.to("cuda")

        # Extra ControlNets: pose / canny / depth.
        controlnet_pose_model = "thibaud/controlnet-openpose-sdxl-1.0"
        controlnet_canny_model = "diffusers/controlnet-canny-sdxl-1.0"
        controlnet_depth_model = "diffusers/controlnet-depth-sdxl-1.0-small"

        controlnet_pose = ControlNetModel.from_pretrained(
            controlnet_pose_model, torch_dtype=dtype
        ).to(device)
        controlnet_canny = ControlNetModel.from_pretrained(
            controlnet_canny_model, torch_dtype=dtype
        ).to(device)
        controlnet_depth = ControlNetModel.from_pretrained(
            controlnet_depth_model, torch_dtype=dtype
        ).to(device)

        # Name -> ControlNet model. Every value must be a model because these
        # are handed to MultiControlNetModel in __call__.
        self.controlnet_map = {
            "pose": controlnet_pose,
            "canny": controlnet_canny,
            "depth": controlnet_depth,
        }

        openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
        self._depth_anything = (
            DepthAnything.from_pretrained('LiheYoung/depth_anything_vitl14')
            .to(device)
            .eval()
        )

        # Name -> callable that turns the reference image into the
        # conditioning image for the ControlNet of the same name.
        self.controlnet_map_fn = {
            "pose": openpose,
            "canny": self._get_canny_image,
            "depth": self._get_depth_map,
        }

        self.app = FaceAnalysis(
            name="antelopev2",
            root="./",
            providers=["CPUExecutionProvider"],
        )
        self.app.prepare(ctx_id=0, det_size=(640, 640))

    @staticmethod
    def _get_canny_image(image, t1=100, t2=200):
        """Return a single-channel ("L") PIL image of Canny edges of *image*."""
        bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        edges = cv2.Canny(bgr, t1, t2)
        return Image.fromarray(edges, "L")

    def _get_depth_map(self, image):
        """Return a uint8 PIL depth image of *image*, at the input resolution."""
        rgb = np.array(image) / 255.0
        h, w = rgb.shape[:2]

        net_input = self._depth_transform({'image': rgb})['image']
        net_input = torch.from_numpy(net_input).unsqueeze(0).to(device)

        with torch.no_grad():
            depth = self._depth_anything(net_input)

        # Resize the prediction back to the input size, then rescale to 0..255.
        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0

        return Image.fromarray(depth.cpu().numpy().astype(np.uint8))

    @staticmethod
    def _convert_from_cv2_to_image(img: "np.ndarray") -> "Image.Image":
        """Convert a BGR array to an RGB PIL image."""
        return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    @staticmethod
    def _convert_from_image_to_cv2(img: "Image.Image") -> "np.ndarray":
        """Convert an RGB PIL image to a BGR array."""
        return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    @staticmethod
    def _resize_img(
        input_image,
        max_side=1280,
        min_side=1024,
        size=None,
        pad_to_max_side=False,
        mode=None,
        base_pixel_number=64,
    ):
        """Resize *input_image*, optionally padding it onto a white square.

        When *size* is given the image is resized to exactly that
        ``(width, height)``. Otherwise it is scaled so the short side reaches
        *min_side*, scaled again so the long side equals *max_side*, and both
        sides are floored to a multiple of *base_pixel_number*.

        Args:
            input_image: Image exposing ``.size`` and ``.resize(size, mode)``.
            max_side: Target long side; also the canvas side when padding.
            min_side: Target short side used by the first scaling step.
            size: Explicit ``(width, height)``; skips the ratio computation.
            pad_to_max_side: Centre the result on a white
                ``max_side x max_side`` RGB canvas.
            mode: Resampling filter; ``None`` means ``PIL.Image.BILINEAR``.
            base_pixel_number: Both output sides are floored to a multiple
                of this value.

        Returns:
            The resized (and possibly padded) image.
        """
        if mode is None:
            mode = PIL.Image.BILINEAR

        w, h = input_image.size
        if size is not None:
            w_resize_new, h_resize_new = size
        else:
            ratio = min_side / min(h, w)
            w, h = round(ratio * w), round(ratio * h)
            ratio = max_side / max(h, w)
            input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
            w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
            h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
        input_image = input_image.resize([w_resize_new, h_resize_new], mode)

        if pad_to_max_side:
            canvas = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
            offset_x = (max_side - w_resize_new) // 2
            offset_y = (max_side - h_resize_new) // 2
            canvas[
                offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
            ] = np.array(input_image)
            input_image = Image.fromarray(canvas)
        return input_image

    @staticmethod
    def _largest_face(faces):
        """Return the detection whose bounding box has the largest area.

        Each item must expose ``item["bbox"]`` as ``(x1, y1, x2, y2)``.

        Raises:
            ValueError: If *faces* is empty.
        """
        if not faces:
            raise ValueError(
                "Unable to detect a face in the image. "
                "Please upload a different photo with a clear face."
            )

        def _area(face):
            x1, y1, x2, y2 = face["bbox"][:4]
            return (x2 - x1) * (y2 - y1)

        return max(faces, key=_area)

    def __call__(self, data=None):
        """Generate one image from the bundled face and pose reference images.

        Args:
            data: Optional request payload. When it is a dict, ``"inputs"``
                supplies the prompt (default ``"a person"``) and
                ``"negative_prompt"`` the negative prompt (default ``""``).

        Returns:
            The first image of the pipeline output.

        Raises:
            ValueError: If no face is detected in the face image or in the
                pose reference image.
        """
        payload = data if isinstance(data, dict) else {}
        prompt = payload.get("inputs") or "a person"
        negative_prompt = payload.get("negative_prompt") or ""

        # NOTE(review): the LCM scheduler and LoRA are enabled here while the
        # call below uses guidance_scale=7.5 and 30 steps - confirm intended.
        self.pipe.scheduler = diffusers.LCMScheduler.from_config(self.pipe.scheduler.config)
        self.pipe.enable_lora()

        adapter_strength_ratio = 0.8
        identitynet_strength_ratio = 0.8
        pose_strength = 0.4
        canny_strength = 0.3
        depth_strength = 0.5
        controlnet_selection = ["pose", "canny", "depth"]

        face_image_path = "./kaifu_resize.png"
        pose_image_path = "./pose.jpg"

        face_image = load_image(face_image_path)
        face_image = self._resize_img(face_image, max_side=1024)
        face_image_cv2 = self._convert_from_image_to_cv2(face_image)
        height, width, _ = face_image_cv2.shape

        # Extract face features; only the largest detected face is used.
        face_info = self._largest_face(self.app.get(face_image_cv2))

        face_emb = face_info["embedding"]
        face_kps = draw_kps(
            self._convert_from_cv2_to_image(face_image_cv2), face_info["kps"]
        )
        img_controlnet = face_image

        if pose_image_path is not None:
            pose_image = load_image(pose_image_path)
            pose_image = self._resize_img(pose_image, max_side=1024)
            img_controlnet = pose_image
            pose_image_cv2 = self._convert_from_image_to_cv2(pose_image)

            pose_faces = self.app.get(pose_image_cv2)
            if not pose_faces:
                raise ValueError(
                    "Cannot find any face in the reference image! "
                    "Please upload another person image"
                )

            # From here on the keypoints, bbox and output size come from the
            # pose reference image (its last detection).
            face_info = pose_faces[-1]
            face_kps = draw_kps(pose_image, face_info["kps"])
            width, height = face_kps.size

        # White rectangle over the face bounding box on a black canvas.
        control_mask = np.zeros([height, width, 3])
        x1, y1, x2, y2 = (int(v) for v in face_info["bbox"])
        control_mask[y1:y2, x1:x2] = 255
        control_mask = Image.fromarray(control_mask.astype(np.uint8))

        if controlnet_selection:
            controlnet_scales = {
                "pose": pose_strength,
                "canny": canny_strength,
                "depth": depth_strength,
            }
            self.pipe.controlnet = MultiControlNetModel(
                [self.controlnet_identitynet]
                + [self.controlnet_map[s] for s in controlnet_selection]
            )
            control_scales = [float(identitynet_strength_ratio)] + [
                controlnet_scales[s] for s in controlnet_selection
            ]
            control_images = [face_kps] + [
                self.controlnet_map_fn[s](img_controlnet).resize((width, height))
                for s in controlnet_selection
            ]
        else:
            self.pipe.controlnet = self.controlnet_identitynet
            control_scales = float(identitynet_strength_ratio)
            control_images = face_kps

        # Fixed seed, so repeated calls share the same generator state.
        generator = torch.Generator(device=device.type).manual_seed(3)

        print("Start inference...")

        self.pipe.set_ip_adapter_scale(adapter_strength_ratio)
        images = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image_embeds=face_emb,
            image=control_images,
            control_mask=control_mask,
            controlnet_conditioning_scale=control_scales,
            num_inference_steps=30,
            guidance_scale=7.5,
            height=height,
            width=width,
            generator=generator,
        ).images

        return images[0]
 
kaifu_resize.png ADDED

Git LFS Details

  • SHA256: b7302f0f7d0ff61be67bf13d172ad2393b6cb2bc985f048089f4e901145324d7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.06 MB
pose.jpg ADDED
requirements.txt CHANGED
@@ -1,7 +1,18 @@
1
- safetensors
2
- opencv-python
3
- controlnet_hinter==0.0.5
4
- git+https://github.com/kashif/diffusers.git@diffusers-yield-callback
5
- https://gradio-builds.s3.amazonaws.com/aabb08191a7d94d2a1e9ff87b0d3c3987cd519c5/gradio-4.18.0-py3-none-any.whl
6
  accelerate
7
- transformers
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ diffusers==0.25.1
2
+ torch==2.0.0
3
+ torchvision==0.15.1
4
+ transformers==4.37.1
 
5
  accelerate
6
+ safetensors
7
+ einops
8
+ onnxruntime-gpu
9
+ spaces==0.19.4
10
+ omegaconf
11
+ peft
12
+ huggingface-hub==0.20.2
13
+ opencv-python
14
+ insightface
15
+ gradio
16
+ controlnet_aux
17
+ gdown
18
+ peft