import gradio as gr
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image
import cv2
import math
import warnings
from diffusers import DDPMScheduler
import mediapipe as mp

warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
class OptimizedMediaPipeExtractor:
    def __init__(self):
        self.mp_face_mesh = mp.solutions.face_mesh
        self.face_mesh = self.mp_face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )
        self.landmark_indices = {
            'face_outline': [10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109],
            'left_eye': [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246],
            'right_eye': [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398],
            'left_eyebrow': [46, 53, 52, 51, 48, 115, 131, 134, 102, 49, 220, 305],
            'right_eyebrow': [276, 283, 282, 295, 285, 336, 296, 334, 293, 300, 276, 353],
            'nose': [1, 2, 5, 4, 6, 19, 94, 168, 8, 9, 10, 151, 195, 197, 196, 3],
            'lips': [61, 84, 17, 314, 405, 320, 307, 375, 321, 308, 324, 318],
            'chin': [175, 199, 428, 262, 18],
            'forehead': [9, 10, 151, 337, 299, 333, 298, 301]
        }
    def extract_features(self, image_path_or_array):
        try:
            features = self._extract_robust_features(image_path_or_array)
            return self._normalize_features(features)
        except Exception as e:
            print(f"Error in feature extraction: {e}")
            return self._get_default_features()
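
    # The private helpers below build the raw 18-value attribute vector:
    # entries 0-4 are measured from the detected landmarks (eye tilt, eye
    # openness, eyelid prominence, chin length, eyebrow thickness), entry 7
    # comes from the face aspect ratio, and the rest are fixed defaults.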
    def _extract_robust_features(self, image_path_or_array):
        if isinstance(image_path_or_array, str):
            image = cv2.imread(image_path_or_array)
            if image is None:
                raise ValueError(f"Failed to load image: {image_path_or_array}")
        else:
            image = image_path_or_array.copy()
        if image is None or image.size == 0:
            raise ValueError("Invalid image array provided")
        if len(image.shape) != 3 or image.shape[2] != 3:
            raise ValueError("Image must be a 3-channel color image")
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = rgb_image.shape[:2]
        if h < 50 or w < 50:
            raise ValueError("Image too small for feature extraction")
        results = self.face_mesh.process(rgb_image)
        if not results.multi_face_landmarks:
            raise ValueError("No face detected in image")
        landmarks = results.multi_face_landmarks[0]
        face_landmarks = np.array([[lm.x * w, lm.y * h, lm.z] for lm in landmarks.landmark])
        if len(face_landmarks) == 0:
            raise ValueError("No valid landmarks detected")
        features = torch.zeros(18, dtype=torch.float32)
        # Feature 0: eye tilt, from the vertical offset between the outer eye corners.
        try:
            left_corner = face_landmarks[33] if len(face_landmarks) > 33 else face_landmarks[0]
            right_corner = face_landmarks[263] if len(face_landmarks) > 263 else face_landmarks[-1]
            face_width = np.max(face_landmarks[:, 0]) - np.min(face_landmarks[:, 0])
            eye_angle = abs(left_corner[1] - right_corner[1]) / (face_width + 1e-8)
            features[0] = min(max(int(eye_angle * 20), 0), 2)
        except Exception:
            features[0] = 1
        # Feature 1: whether the eyes are open.
        try:
            eye_openness = self._calculate_eye_openness(face_landmarks)
            features[1] = 1 if eye_openness > 10 else 0
        except Exception:
            features[1] = 1
        # Feature 2: eyelid prominence.
        try:
            eyelid_prominence = self._calculate_eyelid_prominence(face_landmarks)
            features[2] = 1 if eyelid_prominence > 5 else 0
        except Exception:
            features[2] = 1
        # Feature 3: chin length relative to face height.
        try:
            face_height = np.max(face_landmarks[:, 1]) - np.min(face_landmarks[:, 1])
            nose_tip = face_landmarks[2] if len(face_landmarks) > 2 else face_landmarks[0]
            chin_bottom = face_landmarks[18] if len(face_landmarks) > 18 else face_landmarks[-1]
            chin_length = np.linalg.norm(nose_tip[:2] - chin_bottom[:2])
            chin_length_normalized = chin_length / (face_height + 1e-8)
            features[3] = min(max(int(chin_length_normalized * 6), 0), 2)
        except Exception:
            features[3] = 1
        # Features 4-17: eyebrow thickness (4) and face aspect ratio (7) are
        # measured; the remaining attributes use fixed default values.
        for i in range(4, 18):
            try:
                if i == 4:
                    features[i] = 1 if self._calculate_eyebrow_thickness(rgb_image, face_landmarks) > 0.3 else 0
                elif i == 7:
                    face_width = np.max(face_landmarks[:, 0]) - np.min(face_landmarks[:, 0])
                    face_height = np.max(face_landmarks[:, 1]) - np.min(face_landmarks[:, 1])
                    aspect_ratio = face_width / (face_height + 1e-8)
                    features[i] = min(max(int(aspect_ratio * 3.5), 0), 6)
                else:
                    features[i] = {5: 7, 6: 2, 8: 0, 9: 55, 10: 2, 11: 5, 12: 4,
                                   13: 0, 14: 0, 15: 1, 16: 1, 17: 1}[i]
            except Exception:
                features[i] = 1
        return features

    def _normalize_features(self, features):
        # Scale each raw attribute into [0, 1] by its maximum category value.
        normalized = torch.zeros_like(features, dtype=torch.float32)
        max_values = [2, 1, 1, 2, 1, 13, 3, 6, 14, 110, 4, 10, 9, 11, 6, 2, 2, 2]
        for i, max_val in enumerate(max_values):
            if max_val > 0:
                normalized[i] = torch.clamp(features[i] / max_val, 0.0, 1.0)
            else:
                normalized[i] = 0.0
        return normalized

    def _get_default_features(self):
        # Fallback attribute vector used when extraction fails entirely.
        default_values = torch.tensor([3, 55, 7, 7, 6, 1, 1, 1, 1, 1, 2, 5, 5, 3, 1, 1, 2, 1], dtype=torch.float32)
        return self._normalize_features(default_values)

    def _calculate_eye_openness(self, landmarks):
        # Average vertical gap between upper and lower eyelid landmarks.
        try:
            left_top = landmarks[159][1] if len(landmarks) > 159 else 0
            left_bottom = landmarks[145][1] if len(landmarks) > 145 else 0
            left_openness = abs(left_top - left_bottom)
            right_top = landmarks[386][1] if len(landmarks) > 386 else 0
            right_bottom = landmarks[374][1] if len(landmarks) > 374 else 0
            right_openness = abs(right_top - right_bottom)
            return (left_openness + right_openness) / 2
        except Exception:
            return 10.0

    def _calculate_eyelid_prominence(self, landmarks):
        try:
            left_eyelid = landmarks[159][1] - landmarks[158][1] if len(landmarks) > 159 else 5
            right_eyelid = landmarks[386][1] - landmarks[385][1] if len(landmarks) > 386 else 5
            return abs(left_eyelid + right_eyelid) / 2
        except Exception:
            return 5.0

    def _calculate_eyebrow_thickness(self, image, landmarks):
        # Vertical spread of the left-eyebrow landmarks, normalized by image height.
        try:
            left_brow_points = [landmarks[i] for i in self.landmark_indices['left_eyebrow'] if i < len(landmarks)]
            if len(left_brow_points) < 2:
                return 0.5
            y_coords = [p[1] for p in left_brow_points]
            thickness = (max(y_coords) - min(y_coords)) / image.shape[0]
            return min(thickness * 10, 1.0)
        except Exception:
            return 0.5
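
# ---------------------------------------------------------------------------
# Conditional denoising U-Net.
# A lightweight encoder-decoder with skip connections; the sinusoidal
# timestep embedding and the 18-value attribute embedding are summed and
# injected at the bottleneck as extra feature channels.
# ---------------------------------------------------------------------------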
class OptimizedConditionedUNet(nn.Module):
    def __init__(self, in_channels=3, out_channels=3, attr_dim=18, base_channels=56):
        super().__init__()
        self.time_embed_dim = 224
        self.time_embed = nn.Sequential(
            nn.Linear(self.time_embed_dim, 448),
            nn.SiLU(),
            nn.Linear(448, 448)
        )
        self.attr_embed = nn.Sequential(
            nn.Linear(attr_dim, 112),
            nn.ReLU(),
            nn.Dropout(0.05),
            nn.Linear(112, 224),
            nn.ReLU(),
            nn.Linear(224, 448)
        )
        self.conv_in = nn.Conv2d(in_channels, base_channels, 3, padding=1)
        self.down_blocks = nn.ModuleList([
            self._make_down_block(base_channels, base_channels * 2),
            self._make_down_block(base_channels * 2, base_channels * 4),
            self._make_down_block(base_channels * 4, base_channels * 8),
            self._make_down_block(base_channels * 8, base_channels * 8)
        ])
        self.mid_block = self._make_conv_block(base_channels * 8 + 448, base_channels * 8)
        self.up_blocks = nn.ModuleList([
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                self._make_conv_block(base_channels * 8 + base_channels * 8, base_channels * 8)
            ),
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                self._make_conv_block(base_channels * 8 + base_channels * 4, base_channels * 4)
            ),
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                self._make_conv_block(base_channels * 4 + base_channels * 2, base_channels * 2)
            ),
            nn.Sequential(
                nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
                self._make_conv_block(base_channels * 2 + base_channels, base_channels)
            )
        ])
        self.conv_out = nn.Sequential(
            nn.GroupNorm(8, base_channels),
            nn.SiLU(),
            nn.Conv2d(base_channels, out_channels, 3, padding=1)
        )

    def _make_conv_block(self, in_ch, out_ch):
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.GroupNorm(min(32, max(1, out_ch // 4)), out_ch),
            nn.SiLU(),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.GroupNorm(min(32, max(1, out_ch // 4)), out_ch),
            nn.SiLU()
        )

    def _make_down_block(self, in_ch, out_ch):
        return nn.Sequential(
            nn.MaxPool2d(2),
            self._make_conv_block(in_ch, out_ch)
        )
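
    # Sinusoidal timestep encoding: sines and cosines at geometrically spaced
    # frequencies (from 1 down to 1/10000), followed by the time_embed MLP.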
    def get_time_embedding(self, timesteps):
        half_dim = self.time_embed_dim // 2
        embeddings = math.log(10000) / (half_dim - 1)
        embeddings = torch.exp(torch.arange(half_dim, device=timesteps.device) * -embeddings)
        embeddings = timesteps[:, None] * embeddings[None, :]
        embeddings = torch.cat([torch.sin(embeddings), torch.cos(embeddings)], dim=1)
        return self.time_embed(embeddings)

    def forward(self, x, timesteps, attributes):
        t_emb = self.get_time_embedding(timesteps)
        attr_emb = self.attr_embed(attributes)
        combined_emb = t_emb + attr_emb
        x = self.conv_in(x)
        skip_connections = [x]
        for down_block in self.down_blocks:
            x = down_block(x)
            skip_connections.append(x)
        # Broadcast the combined conditioning vector over the bottleneck
        # feature map and concatenate it as extra channels.
        attr_spatial = combined_emb.unsqueeze(-1).unsqueeze(-1)
        attr_spatial = attr_spatial.expand(-1, -1, x.shape[2], x.shape[3])
        x = torch.cat([x, attr_spatial], dim=1)
        x = self.mid_block(x)
        # Discard the deepest skip (x already carries it) and pair the rest
        # with the up blocks, deepest first.
        skip_connections = skip_connections[:-1]
        skip_connections = skip_connections[::-1]
        for up_block, skip in zip(self.up_blocks, skip_connections):
            x = up_block[0](x)
            if x.shape[2:] != skip.shape[2:]:
                x = F.interpolate(x, size=skip.shape[2:], mode='bilinear', align_corners=False)
            x = torch.cat([x, skip], dim=1)
            x = up_block[1](x)
        return self.conv_out(x)
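
# ---------------------------------------------------------------------------
# Inference pipeline: loads the pretrained checkpoint from the Hugging Face
# Hub and runs DDPM reverse diffusion conditioned on the extracted features.
# ---------------------------------------------------------------------------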
class CartoonifyProcessor:
    def __init__(self, model_path="wizcodes12/image_to_cartoonify"):
        self.device = device
        self.model = None
        self.noise_scheduler = None
        self.mp_extractor = OptimizedMediaPipeExtractor()
        self.load_model(model_path)

    def load_model(self, model_path):
        try:
            from huggingface_hub import hf_hub_download
            model_file = hf_hub_download(
                repo_id=model_path,
                filename="image_to_cartoonify.pt",
                repo_type="model"
            )
            checkpoint = torch.load(model_file, map_location=self.device)
            # base_channels must match the checkpoint's training configuration
            # (64 here, overriding the class default of 56).
            self.model = OptimizedConditionedUNet(
                in_channels=3,
                out_channels=3,
                attr_dim=18,
                base_channels=64
            ).to(self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.eval()
            self.noise_scheduler = DDPMScheduler(
                num_train_timesteps=1000,
                beta_start=0.00085,
                beta_end=0.012,
                beta_schedule="scaled_linear",
                prediction_type="epsilon"
            )
            print("Model loaded successfully!")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
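
    # Reverse diffusion: start from Gaussian noise and iteratively denoise,
    # letting the attribute vector (not the pixels of the input photo) steer
    # the generated cartoon. Fewer steps are used on CPU to keep latency down.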
    def process_image(self, image):
        if image is None:
            return None
        try:
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            image = image.resize((256, 256))
            # The extractor expects OpenCV-style BGR arrays, so convert from
            # PIL's RGB before passing it in.
            image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            features = self.mp_extractor.extract_features(image_np)
            features = features.unsqueeze(0).to(self.device)
            with torch.no_grad():
                generated_image = torch.randn(1, 3, 256, 256).to(self.device)
                num_inference_steps = 25 if self.device.type == 'cuda' else 15
                self.noise_scheduler.set_timesteps(num_inference_steps)
                for t in self.noise_scheduler.timesteps:
                    timesteps = torch.full((1,), t, device=self.device).long()
                    noise_pred = self.model(generated_image, timesteps, features)
                    generated_image = self.noise_scheduler.step(noise_pred, t, generated_image).prev_sample
                # Map from the model's [-1, 1] range back to [0, 255] pixels.
                generated_image = (generated_image / 2 + 0.5).clamp(0, 1)
                generated_image = generated_image.cpu().squeeze(0).permute(1, 2, 0).numpy()
                generated_image = (generated_image * 255).astype(np.uint8)
            return Image.fromarray(generated_image)
        except Exception as e:
            print(f"Error processing image: {e}")
            return None
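
# Standalone usage sketch, outside the Gradio UI (hypothetical file names):
#   proc = CartoonifyProcessor()
#   cartoon = proc.process_image(Image.open("face.jpg"))
#   if cartoon is not None:
#       cartoon.save("cartoon.png")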
processor = CartoonifyProcessor()

def create_interface():
    with gr.Blocks(title="Image to Cartoonify - wizcodes12", theme=gr.themes.Soft()) as demo:
        gr.HTML("""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
            <h1 style="color: white; font-size: 2.5em; margin-bottom: 10px;">🎨 Image to Cartoonify</h1>
            <p style="color: rgba(255,255,255,0.9); font-size: 1.2em;">by wizcodes12</p>
            <p style="color: rgba(255,255,255,0.8);">Transform your photos into stunning cartoon avatars with AI</p>
        </div>
        """)
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📸 Upload Your Photo")
                input_image = gr.Image(
                    label="Upload Image",
                    type="pil",
                    height=300
                )
                process_btn = gr.Button(
                    "🎨 Convert to Cartoon",
                    variant="primary",
                    size="lg"
                )
                gr.Markdown("### Instructions")
                gr.Markdown("""
                1. Upload a clear photo with a visible face
                2. Click the 'Convert to Cartoon' button
                3. Wait for AI processing (15-30 seconds)
                4. Download your cartoon avatar
                """)
            with gr.Column():
                gr.Markdown("### Cartoon Result")
                output_image = gr.Image(
                    label="Cartoon Avatar",
                    type="pil",
                    height=400
                )
                status = gr.Textbox(
                    label="Status",
                    value="Ready to process your image!",
                    interactive=False
                )
        gr.HTML("""
        <div style="text-align: center; padding: 20px; margin-top: 20px; background: rgba(0,0,0,0.05); border-radius: 10px;">
            <p><strong>Powered by:</strong> Advanced AI Neural Networks</p>
            <p><strong>🎯 Features:</strong> Face Detection • Feature Extraction • Cartoon Style Transfer</p>
            <p><strong>👨‍💻 Developer:</strong> wizcodes12</p>
        </div>
        """)

        def process_image_wrapper(image):
            if image is None:
                return None, "Please upload an image first!"
            try:
                result = processor.process_image(image)
                if result is not None:
                    return result, "Cartoon generated successfully!"
                else:
                    return None, "Error: Could not process image. Please try another image."
            except Exception as e:
                return None, f"Error: {str(e)}"

        process_btn.click(
            fn=process_image_wrapper,
            inputs=[input_image],
            outputs=[output_image, status]
        )
        input_image.change(
            fn=lambda x: "Image uploaded! Click 'Convert to Cartoon' to process." if x is not None else "Please upload an image.",
            inputs=[input_image],
            outputs=[status]
        )
    return demo

if __name__ == "__main__":
    print("Starting Image to Cartoonify by wizcodes12...")
    print(f"Device: {device}")
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )