seung0h committed
Commit · 083928f · 1 Parent(s): 620e3e1
init

Browse files:
- app.py +42 -0
- pipeline.py +211 -0
app.py ADDED
@@ -0,0 +1,42 @@
+import gradio as gr
+from pipeline import SmileGen
+import torch
+from PIL import Image
+import numpy as np
+import os
+
+
+def read_samples(path):
+    # read the samples from the path
+    samples = []
+    for filename in os.listdir(path):
+        if filename.endswith(".jpg") or filename.endswith(".png"):
+            img = Image.open(os.path.join(path, filename))
+            samples.append(np.array(img))
+    return samples
+
+def create_image_generation_demo():
+    # load sample images
+    image_list = []
+
+    model = SmileGen()
+
+    demo = gr.Interface(
+        fn=model.run,
+        inputs=[
+            gr.Image(label="Input Image", type="pil")
+        ],
+        outputs=[
+            gr.Image(label="Generated Image")
+        ],
+        title="Smile!",
+        description="Upload an image and generate a new image using a custom pipeline.",
+        examples=image_list
+    )
+
+    return demo
+
+# Launch the demo
+if __name__ == "__main__":
+    demo = create_image_generation_demo()
+    demo.launch()
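
Note on app.py: `read_samples` is defined but never called, so the `examples` gallery passed to `gr.Interface` stays empty. A minimal sketch of wiring it up, assuming a local `samples/` directory exists; passing file paths (which Gradio accepts for image examples) also avoids loading every sample into memory up front:

    # Sketch only, not part of the commit: collect example image paths from an
    # assumed local "samples/" directory and pass them via gr.Interface(examples=...).
    import glob
    image_list = sorted(glob.glob("samples/*.jpg") + glob.glob("samples/*.png"))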
pipeline.py ADDED
@@ -0,0 +1,211 @@
+from huggingface_hub import hf_hub_download
+from ultralytics import YOLO
+from supervision import Detections
+from PIL import Image
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import pandas as pd
+import torchvision.transforms as transforms
+import torchvision
+from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
+from diffusers import AutoPipelineForInpainting
+
+
+class SmileGen:
+    def __init__(self, device='cuda'):
+        self.device = device
+
+    def face_detection(self, image):
+        face_det = YOLO(hf_hub_download(repo_id="arnabdhar/YOLOv8-Face-Detection", filename="model.pt")).to(self.device)
+
+        face_crops = []
+        face_bboxs = []
+
+        output = face_det(image)
+        box_results = Detections.from_ultralytics(output[0])
+
+        for i, box in enumerate(box_results.xyxy):
+            x1, y1, x2, y2 = map(int, box.tolist())  # Convert coordinates to integers
+            # Crop a square by stretching the smaller side
+            W, H = image.size
+            width = x2 - x1
+            height = y2 - y1
+            if width > height:
+                y1 -= (width - height) // 2
+                y2 += (width - height) // 2
+            else:
+                x1 -= (height - width) // 2
+                x2 += (height - width) // 2
+            x1 = max(0, x1)
+            y1 = max(0, y1)
+            x2 = min(W, x2)
+            y2 = min(H, y2)
+
+            box = (x1, y1, x2, y2)
+            face_crop = image.crop(box)  # Crop the region
+            face_crops.append(face_crop)
+            face_bboxs.append(box)
+
+        return face_crops, face_bboxs
+
+    def face_classification(self, face_crops, face_bboxs):
+
+        face_classifier = torchvision.models.efficientnet_b0(pretrained=True)
+        num_features = face_classifier.classifier[1].in_features
+        face_classifier.classifier[1] = nn.Linear(num_features, 2)
+
+        hf_path = hf_hub_download(
+            repo_id="seung0h/smile_classification",
+            filename="best_efficientnetB0_smile.pth",
+        )
+        best_ckpt = torch.load(hf_path)
+        face_classifier.load_state_dict(best_ckpt)
+        face_classifier.to(self.device)
+
+        face_classifier.eval()
+
+        val_transforms = transforms.Compose([
+            transforms.Resize((224, 224)),
+            transforms.ToTensor(),
+            transforms.Normalize([0.485, 0.456, 0.406],
+                                 [0.229, 0.224, 0.225])
+        ])
+
+        unsmile_imgs = []
+        unsmile_boxes = []
+
+        for i, img in enumerate(face_crops):
+            # img = Image.fromarray(img)
+            img_tensor = val_transforms(img).unsqueeze(0).to(self.device)
+
+            with torch.no_grad():
+                output = face_classifier(img_tensor)
+                _, pred = torch.max(output, 1)
+                pred_label = pred.item()
+                result = "Smile" if pred_label == 1 else "Not smile"
+
+            if result == "Not smile":
+                unsmile_imgs.append(img)
+                unsmile_boxes.append(face_bboxs[i])
+
+        return unsmile_imgs, unsmile_boxes
+
+    def gen_mask(self, unsmile_imgs):
+
+        seg_processor = SegformerImageProcessor.from_pretrained("jonathandinu/face-parsing")
+        face_parser = SegformerForSemanticSegmentation.from_pretrained("jonathandinu/face-parsing").to(self.device)
+
+        mask_list = []
+        label_prior = {10, 11, 12}
+
+        for image in unsmile_imgs:
+            min_x, min_y = 1000, 1000
+            max_x, max_y = 0, 0
+
+            inputs = seg_processor(images=image, return_tensors="pt").to(self.device)
+            outputs = face_parser(**inputs)
+            logits = outputs.logits
+
+            # resize output to match input image dimensions
+            upsampled_logits = nn.functional.interpolate(logits,
+                                                         size=image.size[::-1],  # H x W
+                                                         mode='bilinear',
+                                                         align_corners=False)
+
+            # get label masks
+            labels = upsampled_logits.argmax(dim=1)[0]
+            mask = np.zeros(labels.shape, dtype=np.uint8)  # uint8 so Image.fromarray can handle it
+
+            for i in range(labels.shape[0]):
+                for j in range(labels.shape[1]):
+                    # Check if the current label is in the predefined set
+                    if labels[i][j].item() in label_prior:
+                        # Update minimum and maximum coordinates
+                        min_x = min(min_x, i)
+                        min_y = min(min_y, j)
+                        max_x = max(max_x, i)
+                        max_y = max(max_y, j)
+
+            # Create a mask by setting the bounding box region to 255 (white)
+            delta = 15
+            mask[max(0, min_x - delta):max_x + delta, max(0, min_y - delta):max_y + delta] = 255  # clamp so small indices can't wrap into a negative slice
+
+            center_x = (min_x + max_x) // 2
+            center_y = (min_y + max_y) // 2
+
+            # open the center of the lips for style consistency
+            hole_size = (max_y - min_y) // 4
+            mask_copy = mask.copy()
+            mask_copy[:, center_y - hole_size:center_y + hole_size] = 0
+            mask_list.append({"mask": mask,
+                              "hole_mask": mask_copy})
+
+        return mask_list
+
+    def kan_inference(self, unsmile_imgs, mask_list):
+        prompt = (
+            "a young korean person, smiling softly, mouth closed or gently open, "
+            "natural lips, realistic lighting, close-up portrait, high quality, professional studio photo"
+        )
+        negative_prompt = (
+            "bad anatomy, deformed lips, extra mouth, open mouth showing teeth, "
+            "distorted face, blurry, low quality, erotic, nsfw, sexual, nude, cleavage, extra face"
+        )
+        results = []
+
+        generator = torch.Generator(device=self.device).manual_seed(42)
+        pipe = AutoPipelineForInpainting.from_pretrained(
+            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
+        ).to(self.device)
+
+        for img, m in zip(unsmile_imgs, mask_list):
+            mask = m["hole_mask"]
+            mask_image = Image.fromarray(mask).resize((512, 512))
+            init_image = img.resize((512, 512))
+
+            result = pipe(prompt=prompt,
+                          negative_prompt=negative_prompt,
+                          num_inference_steps=20,
+                          generator=generator,
+                          image=init_image,
+                          mask_image=mask_image).images[0]
+
+            results.append(result)
+
+        return results
+
+    def make_result(self, image_orig, results, unsmile_imgs, unsmile_boxes):
+        image_restored = image_orig.copy()
+
+        for i, result in enumerate(results):
+            orig_crop = unsmile_imgs[i]
+
+            box = unsmile_boxes[i]
+            x1, y1, x2, y2 = box
+            w, h = x2 - x1, y2 - y1
+
+            gen_image = results[i].resize((w, h))
+            image_restored.paste(gen_image, box=(x1, y1))
+
+        return image_restored
+
+    def run(self, image):
+        face_crops, face_bboxs = self.face_detection(image)
+
+        unsmile_imgs, unsmile_boxes = self.face_classification(face_crops, face_bboxs)
+        mask_list = self.gen_mask(unsmile_imgs)
+        results = self.kan_inference(unsmile_imgs, mask_list)
+        image_restored = self.make_result(image, results, unsmile_imgs, unsmile_boxes)
+
+        return image_restored
+
+
+if __name__ == "__main__":
+    smile_gen = SmileGen()
+    image = Image.open("samples/newjeans.jpg")
+    result = smile_gen.run(image)
+    result.show()
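
Note on gen_mask: the nested loop calls `.item()` once per pixel of the upsampled label map, which is extremely slow for a GPU tensor. A minimal vectorized sketch of the same bounding-box computation (keeping the code's convention that row index i is "x" and column index j is "y"), with a guard for the case where no mouth or lip pixels are found:

    # Sketch only, not part of the commit: vectorized equivalent of the
    # per-pixel loop in gen_mask.
    labels_np = labels.cpu().numpy()                              # (H, W) label map
    rows, cols = np.where(np.isin(labels_np, list(label_prior)))  # pixels labeled 10/11/12
    if rows.size > 0:                                             # any mouth/lip pixels?
        min_x, max_x = int(rows.min()), int(rows.max())           # row extent ("x" here)
        min_y, max_y = int(cols.min()), int(cols.max())           # column extent ("y" here)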
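
Note on model loading: every call to `run` re-instantiates the YOLO detector, the EfficientNet classifier, the SegFormer parser, and the Kandinsky inpainting pipeline, so each Gradio request pays the full model-construction and download cost. A sketch of caching the heavy models once in `__init__` (the method bodies would then reference `self.face_det`, `self.pipe`, and so on); repo ids and signatures are taken straight from the commit:

    # Sketch only, not part of the commit: load the heavy models once per
    # SmileGen instance instead of rebuilding them on every request.
    def __init__(self, device='cuda'):
        self.device = device
        self.face_det = YOLO(hf_hub_download(repo_id="arnabdhar/YOLOv8-Face-Detection",
                                             filename="model.pt")).to(device)
        self.seg_processor = SegformerImageProcessor.from_pretrained("jonathandinu/face-parsing")
        self.face_parser = SegformerForSemanticSegmentation.from_pretrained(
            "jonathandinu/face-parsing").to(device)
        self.pipe = AutoPipelineForInpainting.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16
        ).to(device)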