| import torch
|
| import numpy as np
|
| import os
|
| import sys
|
| from diffusers import EulerDiscreteScheduler
|
| from huggingface_hub import hf_hub_download
|
| from rembg import remove
|
| from PIL import Image
|
| import cv2
|
| from photomaker import PhotoMakerStableDiffusionXLPipeline
|
|
|
|
|
# Prompt style presets: name -> (positive template with "{prompt}" slot,
# negative prompt fragment prepended to the user's negative prompt).
styles = {
    "Cinematic HD": ("cinematic HD {prompt}", "low quality"),
    "Photographic (Default)": ("photographic {prompt}", "low quality"),
}
|
|
|
|
|
# --- Configuration -------------------------------------------------------
# Base SDXL checkpoint used for generation.
base_model_path = 'SG161222/RealVisXL_V3.0'

# Folder of person photos and an optional environment/backdrop image.
person_image_folder = r'D:\I+D\ia\Env mixer\in'
environment_image_path = r'D:\I+D\ia\Env mixer\environment10241.jpg'

# Canvas size for cropped faces, final output size, and the extra margin
# (in pixels) kept around each detected face box when cropping.
face_w = 512
face_h = 512
output_w = 512
output_h = 512
border = 100
|
|
|
# Pick the best available torch backend: CUDA GPU, then Apple MPS, else CPU.
try:
    if torch.cuda.is_available():
        device = "cuda"
    elif sys.platform == "darwin" and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
except Exception:
    # Device probing can raise on partial/exotic torch builds; fall back to
    # CPU. (Was a bare `except:`, which also swallowed KeyboardInterrupt /
    # SystemExit — narrowed to Exception.)
    device = "cpu"
|
|
|
# Largest value representable as a signed 32-bit seed.
MAX_SEED = np.iinfo(np.int32).max

# Style used by apply_style() when an unknown style name is requested.
DEFAULT_STYLE_NAME = "Photographic (Default)"
|
|
|
|
|
# Download (or reuse the cached copy of) the PhotoMaker v1 identity-adapter
# checkpoint from the Hugging Face Hub; returns the local file path.
photomaker_ckpt = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
|
|
|
# Half-precision choice per backend: float16 on Apple MPS, bfloat16 elsewhere.
torch_dtype = torch.float16 if device == "mps" else torch.bfloat16
|
|
|
# Build the PhotoMaker SDXL pipeline from the base checkpoint in half
# precision, loading the fp16 safetensors variant, and move it to the device.
pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    variant="fp16"
).to(device)

# Attach the identity adapter; "img" is the trigger word that marks where in
# the prompt the identity embedding is injected (validated in main()).
pipe.load_photomaker_adapter(
    os.path.dirname(photomaker_ckpt),
    subfolder="",
    weight_name=os.path.basename(photomaker_ckpt),
    trigger_word="img"
)
pipe.id_encoder.to(device)
# Replace the checkpoint's default scheduler with Euler, reusing its config.
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
# Merge the adapter's LoRA weights into the base weights before inference.
pipe.fuse_lora()
|
|
|
def remove_background(image_path):
    """Strip the background from the image at *image_path* using rembg.

    Saves a debug copy named ``removed_bg_<stem>.png`` in the working
    directory and returns the RGBA result, or ``None`` when loading or
    segmentation fails (callers filter ``None`` out).
    """
    try:
        input_image = Image.open(image_path)
        output_image = remove(input_image)
        # BUGFIX: drop the source extension so the debug file is not saved
        # with a double extension such as "removed_bg_photo.jpg.png".
        stem = os.path.splitext(os.path.basename(image_path))[0]
        output_image.save(f"removed_bg_{stem}.png")
        return output_image
    except Exception as e:
        # Best-effort: log and skip this image instead of aborting the batch.
        print(f"Error in remove_background: {e}")
        return None
|
|
|
def detect_face(image, image_path):
    """Detect faces with OpenCV's Haar cascade and save an annotated copy.

    Writes ``faces_detected_<basename>.png`` with rectangles drawn around the
    detections and returns the raw (x, y, w, h) boxes (empty when no face is
    found).
    """
    # BUGFIX: convert to RGB explicitly. PIL arrays are RGB(A), not BGR, and
    # rembg output is 4-channel RGBA, on which COLOR_BGR2GRAY raises.
    rgb = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)
    # BUGFIX: draw all rectangles on one persistent array. The original drew
    # each rectangle on a fresh throwaway np.array(image), so the saved debug
    # image never contained any annotations.
    annotated = rgb.copy()
    for (x, y, w, h) in faces:
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (255, 0, 0), 2)
    Image.fromarray(annotated).save(f"faces_detected_{os.path.basename(image_path)}.png")
    return faces
|
|
|
def crop_and_resize_face(image, face):
    """Crop the detected face box (plus a `border`-pixel margin) and center
    it on a transparent face_w x face_h RGBA canvas, preserving aspect ratio.

    Assumes *image* has an alpha channel (rembg output), since it is used as
    its own paste mask.
    """
    x, y, w, h = face
    crop_box = (x - border, y - border, x + w + border, y + h + border)
    face_img = image.crop(crop_box)

    # Shrink in place to fit the canvas, then compute a centering offset.
    face_img.thumbnail((face_w, face_h), Image.Resampling.LANCZOS)
    canvas = Image.new('RGBA', (face_w, face_h), (0, 0, 0, 0))
    offset = (
        (canvas.width - face_img.width) // 2,
        (canvas.height - face_img.height) // 2,
    )

    # Paste using the face's own alpha as the mask so transparency survives.
    canvas.paste(face_img, offset, face_img)
    return canvas
|
|
|
def apply_style(style_name: str, positive: str, negative: str = ""):
    """Expand the named style around *positive* and prepend the style's
    negative fragment to *negative*; unknown names fall back to the default
    style."""
    template, style_negative = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    return template.replace("{prompt}", positive), f"{style_negative} {negative}"
|
|
|
def process_image(image_path):
    """Per-image preprocessing step; currently just background removal.

    Returns the RGBA image or None on failure (see remove_background).
    """
    return remove_background(image_path)
|
|
|
def main():
    """Run the PhotoMaker pipeline end to end.

    Validates the trigger word in the prompts, applies a named style, strips
    backgrounds from every photo in ``person_image_folder``, detects and
    crops faces, then runs the SDXL PhotoMaker pipeline and saves the first
    output as ``result.png``.
    """
    # Generation settings (edit here; there is no CLI).
    prompt = "cinematic photo of a person img sniffing cocaine, 35mm photograph, film, bokeh, professional, 4k, highly detailed,"
    negative_prompt = "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
    num_steps = 25
    style_strength_ratio = 4  # NOTE(review): currently unused below
    num_outputs = 1
    guidance_scale = 5
    seed = 1700
    start_merge_step = 2
    style_name = "Cinematic HD"

    # The identity embedding is injected where the trigger word appears, so
    # it must occur exactly once in the positive prompt and never in the
    # negative prompt.
    image_token_id = pipe.tokenizer.convert_tokens_to_ids(pipe.trigger_word)
    input_ids = pipe.tokenizer.encode(prompt)
    if image_token_id not in input_ids:
        raise ValueError(f"Cannot find the trigger word '{pipe.trigger_word}' in text prompt!")
    if input_ids.count(image_token_id) > 1:
        raise ValueError(f"Cannot use multiple trigger words '{pipe.trigger_word}' in text prompt!")

    if negative_prompt:
        negative_prompt_ids = pipe.tokenizer.encode(negative_prompt)
        if image_token_id in negative_prompt_ids:
            raise ValueError(f"Cannot use trigger word '{pipe.trigger_word}' in negative prompt!")

    styled_prompt, styled_negative_prompt = apply_style(style_name, prompt, negative_prompt)

    # Optional backdrop image; loaded but not consumed by the pipe call yet.
    environment_image = None
    if os.path.exists(environment_image_path):
        try:
            environment_image = Image.open(environment_image_path)
        except Exception as e:
            print(f"Error loading environment image: {e}")

    try:
        person_image_paths = [
            os.path.join(person_image_folder, filename)
            for filename in os.listdir(person_image_folder)
            if filename.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]
        # BUGFIX: keep every image paired with its own path. The original
        # filtered None results out of the image list and then zipped it
        # against the unfiltered path list, misaligning images and paths
        # whenever a single background removal failed.
        path_image_pairs = [(path, process_image(path)) for path in person_image_paths]
        path_image_pairs = [(path, img) for path, img in path_image_pairs if img is not None]
    except Exception as e:
        print(f"Error processing person images: {e}")
        return

    processed_faces = []
    for img_path, img in path_image_pairs:
        faces = detect_face(img, img_path)
        if len(faces) > 0:
            # Only the first detected face per photo is used.
            processed_faces.append(crop_and_resize_face(img, faces[0]))
        else:
            print(f"No face detected in {img_path}, skipping this image")

    face_tensors = []
    for face_img in processed_faces:
        print(f"Face Image Size: {face_img.size}")
        face_tensor = torch.tensor(np.array(face_img.convert("RGB"))).permute(2, 0, 1).to(device, dtype=torch_dtype)
        face_tensors.append(face_tensor)

    if not face_tensors:
        print("No faces processed")
        return

    # NOTE(review): this tensor is only logged below -- the pipeline consumes
    # the PIL images in processed_faces directly, not this tensor.
    conditioning_tensor = torch.stack(face_tensors).unsqueeze(0)

    print(f"Prompt: {styled_prompt}")
    print(f"Negative Prompt: {styled_negative_prompt}")
    print(f"Output Width: {output_w}")
    print(f"Output Height: {output_h}")
    print(f"Number of Outputs: {num_outputs}")
    print(f"Number of Inference Steps: {num_steps}")
    print(f"Start Merge Step: {start_merge_step}")
    print(f"Guidance Scale: {guidance_scale}")
    print(f"Device: {device}")
    print(f"Number of Processed Faces: {len(processed_faces)}")
    print(f"Conditioning Tensor Shape: {conditioning_tensor.shape}")

    generator = torch.manual_seed(seed)
    try:
        result = pipe(
            prompt=styled_prompt,
            width=output_w,
            height=output_h,
            input_id_images=processed_faces,
            negative_prompt=styled_negative_prompt,
            num_images_per_prompt=num_outputs,
            num_inference_steps=num_steps,
            start_merge_step=start_merge_step,
            generator=generator,
            guidance_scale=guidance_scale,
        ).images

        if result:
            final_image = result[0]
            final_image.save("result.png")
            print("Image saved as result.png")
        else:
            print("No image generated")
    except Exception as e:
        print(f"Error during image generation: {e}")
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|
|