Spaces:

wizcodes12
/

image_to_cartoonify

Running

App Files Files Community

image_to_cartoonify / app.py

wizcodes12

Update app.py

27657c5 verified 8 months ago

raw

history blame contribute delete

19.4 kB

	import gradio as gr
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	from PIL import Image
	import cv2
	import math
	import time
	import io
	import base64
	from diffusers import DDPMScheduler
	import mediapipe as mp
	import warnings
	warnings.filterwarnings('ignore')

	# Pick the compute device once at import time: CUDA when torch reports it
	# as available, otherwise CPU. CartoonifyProcessor stores this as self.device.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	print(f"Using device: {device}")

	class OptimizedMediaPipeExtractor:
	    """Turn a face photo into an 18-value attribute vector scaled to [0, 1].

	    Landmarks come from MediaPipe Face Mesh. Slots 0-4 and 7 of the vector are
	    derived from the detected landmarks; every other slot is filled with a
	    fixed raw value. The raw vector is then divided by a per-slot maximum and
	    clamped to [0, 1].
	    """

	    # Per-slot divisor applied by ``_normalize_features``.
	    _MAX_VALUES = (2, 1, 1, 2, 1, 13, 3, 6, 14, 110, 4, 10, 9, 11, 6, 2, 2, 2)

	    # Raw values for the slots that are not measured from the image.
	    _FIXED_ATTRIBUTES = {
	        5: 7, 6: 2, 8: 0, 9: 55, 10: 2, 11: 5,
	        12: 4, 13: 0, 14: 0, 15: 1, 16: 1, 17: 1,
	    }

	    def __init__(self):
	        """Create the Face Mesh detector and the landmark index groups."""
	        self.mp_face_mesh = mp.solutions.face_mesh
	        self.face_mesh = self.mp_face_mesh.FaceMesh(
	            static_image_mode=True,
	            max_num_faces=1,
	            refine_landmarks=True,
	            min_detection_confidence=0.7,
	            min_tracking_confidence=0.5
	        )

	        # Face Mesh landmark indices grouped by facial region. Only
	        # 'left_eyebrow' is read by the code in this class.
	        self.landmark_indices = {
	            'face_outline': [10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288, 397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136, 172, 58, 132, 93, 234, 127, 162, 21, 54, 103, 67, 109],
	            'left_eye': [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246],
	            'right_eye': [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398],
	            'left_eyebrow': [46, 53, 52, 51, 48, 115, 131, 134, 102, 49, 220, 305],
	            'right_eyebrow': [276, 283, 282, 295, 285, 336, 296, 334, 293, 300, 276, 353],
	            'nose': [1, 2, 5, 4, 6, 19, 94, 168, 8, 9, 10, 151, 195, 197, 196, 3],
	            'lips': [61, 84, 17, 314, 405, 320, 307, 375, 321, 308, 324, 318],
	            'chin': [175, 199, 428, 262, 18],
	            'forehead': [9, 10, 151, 337, 299, 333, 298, 301]
	        }

	    def extract_features(self, image_path_or_array):
	        """Return the normalised 18-value feature tensor for an image.

	        Args:
	            image_path_or_array: A file path, or an image array. Arrays are
	                treated as BGR, exactly like images loaded from a path,
	                because the same BGR-to-RGB conversion is applied to both.

	        Returns:
	            A float32 tensor of shape (18,) with values in [0, 1]. When
	            extraction fails for any reason the error is printed and the
	            default feature vector is returned instead.
	        """
	        try:
	            features = self._extract_robust_features(image_path_or_array)
	            return self._normalize_features(features)
	        except Exception as e:
	            # Best-effort by design: the caller always receives a usable vector.
	            print(f"Error in feature extraction: {e}")
	            return self._get_default_features()

	    def _extract_robust_features(self, image_path_or_array):
	        """Compute the raw (un-normalised) 18-value feature tensor.

	        Raises:
	            ValueError: If the image cannot be loaded, is empty, is not a
	                3-channel image, is smaller than 50 pixels on a side, or no
	                face/landmarks are detected.
	        """
	        if isinstance(image_path_or_array, str):
	            image = cv2.imread(image_path_or_array)
	            if image is None:
	                raise ValueError(f"Failed to load image: {image_path_or_array}")
	        else:
	            # Validate before copying so a missing image yields the intended
	            # ValueError rather than an AttributeError from ``None.copy()``.
	            if image_path_or_array is None:
	                raise ValueError("Invalid image array provided")
	            image = image_path_or_array.copy()
	            if image.size == 0:
	                raise ValueError("Invalid image array provided")

	        if len(image.shape) != 3 or image.shape[2] != 3:
	            raise ValueError("Image must be a 3-channel color image")

	        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	        h, w = rgb_image.shape[:2]

	        if h < 50 or w < 50:
	            raise ValueError("Image too small for feature extraction")

	        results = self.face_mesh.process(rgb_image)

	        if not results.multi_face_landmarks:
	            raise ValueError("No face detected in image")

	        landmarks = results.multi_face_landmarks[0]
	        # x and y are scaled to pixel units; z is kept as reported.
	        face_landmarks = np.array([[lm.x * w, lm.y * h, lm.z] for lm in landmarks.landmark])

	        if len(face_landmarks) == 0:
	            raise ValueError("No valid landmarks detected")

	        # Bounding extent of all landmarks, shared by several measurements.
	        face_width = np.max(face_landmarks[:, 0]) - np.min(face_landmarks[:, 0])
	        face_height = np.max(face_landmarks[:, 1]) - np.min(face_landmarks[:, 1])

	        features = torch.zeros(18, dtype=torch.float32)

	        # Each measured slot falls back to 1 if its computation fails.

	        # Slot 0: vertical offset between landmarks 33 and 263, relative to
	        # the face width, bucketed into 0..2.
	        try:
	            left_corner = face_landmarks[33] if len(face_landmarks) > 33 else face_landmarks[0]
	            right_corner = face_landmarks[263] if len(face_landmarks) > 263 else face_landmarks[-1]
	            eye_angle = abs(left_corner[1] - right_corner[1]) / (face_width + 1e-8)
	            features[0] = min(max(int(eye_angle * 20), 0), 2)
	        except Exception:
	            features[0] = 1

	        # Slot 1: binary flag from the mean eye opening in pixels.
	        try:
	            eye_openness = self._calculate_eye_openness(face_landmarks)
	            features[1] = 1 if eye_openness > 10 else 0
	        except Exception:
	            features[1] = 1

	        # Slot 2: binary flag from the eyelid measurement in pixels.
	        try:
	            eyelid_prominence = self._calculate_eyelid_prominence(face_landmarks)
	            features[2] = 1 if eyelid_prominence > 5 else 0
	        except Exception:
	            features[2] = 1

	        # Slot 3: distance between landmarks 2 and 18 relative to the face
	        # height, bucketed into 0..2.
	        try:
	            nose_tip = face_landmarks[2] if len(face_landmarks) > 2 else face_landmarks[0]
	            chin_bottom = face_landmarks[18] if len(face_landmarks) > 18 else face_landmarks[-1]
	            chin_length = np.linalg.norm(nose_tip[:2] - chin_bottom[:2])
	            chin_length_normalized = chin_length / (face_height + 1e-8)
	            features[3] = min(max(int(chin_length_normalized * 6), 0), 2)
	        except Exception:
	            features[3] = 1

	        # Slot 4: binary flag from the left-eyebrow thickness estimate.
	        try:
	            brow_thickness = self._calculate_eyebrow_thickness(rgb_image, face_landmarks)
	            features[4] = 1 if brow_thickness > 0.3 else 0
	        except Exception:
	            features[4] = 1

	        # Slot 7: width/height ratio of the landmark extent, bucketed into 0..6.
	        try:
	            aspect_ratio = face_width / (face_height + 1e-8)
	            features[7] = min(max(int(aspect_ratio * 3.5), 0), 6)
	        except Exception:
	            features[7] = 1

	        # Remaining slots are constants, not measured from the image.
	        for slot, raw_value in self._FIXED_ATTRIBUTES.items():
	            features[slot] = raw_value

	        return features

	    def _normalize_features(self, features):
	        """Scale an 18-value raw feature tensor into [0, 1].

	        Each entry is divided by its per-slot maximum and clamped, so raw
	        values above the maximum saturate at 1.0.

	        Args:
	            features: Tensor with 18 entries.

	        Returns:
	            A new float32 tensor with the same shape.
	        """
	        max_values = torch.tensor(
	            self._MAX_VALUES, dtype=torch.float32, device=features.device
	        )
	        return torch.clamp(features.to(torch.float32) / max_values, 0.0, 1.0)

	    def _get_default_features(self):
	        """Return the normalised fallback vector used when extraction fails."""
	        # NOTE(review): several of these raw values exceed the matching entry
	        # of _MAX_VALUES (3 > 2, 55 > 1, 7 > 1, 7 > 2, 6 > 1), so the first
	        # five slots saturate at 1.0 after clamping. The ordering may not
	        # match the feature layout - confirm against the training pipeline
	        # before changing, since the model was trained with a fixed layout.
	        default_values = torch.tensor([3, 55, 7, 7, 6, 1, 1, 1, 1, 1, 2, 5, 5, 3, 1, 1, 2, 1], dtype=torch.float32)
	        return self._normalize_features(default_values)

	    def _calculate_eye_openness(self, landmarks):
	        """Return the mean vertical gap of both eyes.

	        Uses the y distance between landmarks 159/145 and 386/374. A landmark
	        index beyond the array contributes 0; any error yields 10.0.
	        """
	        try:
	            left_top = landmarks[159][1] if len(landmarks) > 159 else 0
	            left_bottom = landmarks[145][1] if len(landmarks) > 145 else 0
	            left_openness = abs(left_top - left_bottom)

	            right_top = landmarks[386][1] if len(landmarks) > 386 else 0
	            right_bottom = landmarks[374][1] if len(landmarks) > 374 else 0
	            right_openness = abs(right_top - right_bottom)

	            return (left_openness + right_openness) / 2
	        except Exception:
	            return 10.0

	    def _calculate_eyelid_prominence(self, landmarks):
	        """Return half the absolute sum of the two eyelid y offsets.

	        The offsets are landmark 159 minus 158 and landmark 386 minus 385.
	        A side whose landmarks are out of range contributes 5; any error
	        yields 5.0.
	        """
	        try:
	            if len(landmarks) > 159:
	                left_eyelid = landmarks[159][1] - landmarks[158][1]
	            else:
	                left_eyelid = 5
	            if len(landmarks) > 386:
	                right_eyelid = landmarks[386][1] - landmarks[385][1]
	            else:
	                right_eyelid = 5
	            return abs(left_eyelid + right_eyelid) / 2
	        except Exception:
	            return 5.0

	    def _calculate_eyebrow_thickness(self, image, landmarks):
	        """Estimate left-eyebrow thickness as a value capped at 1.0.

	        The vertical extent of the left-eyebrow landmarks is divided by the
	        image height and multiplied by 10. Returns 0.5 when fewer than two
	        eyebrow landmarks are available or on any error.
	        """
	        try:
	            left_brow_points = [
	                landmarks[i]
	                for i in self.landmark_indices['left_eyebrow']
	                if i < len(landmarks)
	            ]
	            if len(left_brow_points) < 2:
	                return 0.5

	            y_coords = [p[1] for p in left_brow_points]
	            thickness = (max(y_coords) - min(y_coords)) / image.shape[0]
	            return min(thickness * 10, 1.0)
	        except Exception:
	            return 0.5


	class OptimizedConditionedUNet(nn.Module):
	    """U-Net noise predictor conditioned on a timestep and an attribute vector.

	    The encoder halves the resolution four times, the conditioning embedding
	    is tiled over the bottleneck and concatenated as extra channels, and the
	    decoder upsamples four times while concatenating encoder skip tensors.

	    Submodule attribute names and the layout of every ``nn.Sequential`` fix
	    the ``state_dict`` keys, so they must stay as they are for checkpoints
	    to keep loading.
	    """

	    def __init__(self, in_channels=3, out_channels=3, attr_dim=18, base_channels=56):
	        """Build the embedding networks, encoder, bottleneck and decoder.

	        Args:
	            in_channels: Channels of the input image tensor.
	            out_channels: Channels of the predicted tensor.
	            attr_dim: Length of the attribute vector.
	            base_channels: Width of the first convolution; deeper stages use
	                multiples of it.
	        """
	        super().__init__()

	        c1 = base_channels
	        c2 = base_channels * 2
	        c4 = base_channels * 4
	        c8 = base_channels * 8

	        # Sinusoidal timestep code (224 wide) projected to 448 channels.
	        self.time_embed_dim = 224
	        self.time_embed = nn.Sequential(
	            nn.Linear(self.time_embed_dim, 448),
	            nn.SiLU(),
	            nn.Linear(448, 448),
	        )

	        # Attribute vector projected to the same 448 channels.
	        self.attr_embed = nn.Sequential(
	            nn.Linear(attr_dim, 112),
	            nn.ReLU(),
	            nn.Dropout(0.05),
	            nn.Linear(112, 224),
	            nn.ReLU(),
	            nn.Linear(224, 448),
	        )

	        self.conv_in = nn.Conv2d(in_channels, c1, 3, padding=1)

	        encoder_plan = [(c1, c2), (c2, c4), (c4, c8), (c8, c8)]
	        self.down_blocks = nn.ModuleList(
	            [self._make_down_block(src, dst) for src, dst in encoder_plan]
	        )

	        # The bottleneck receives the 448 conditioning channels as well.
	        self.mid_block = self._make_conv_block(c8 + 448, c8)

	        # (channels after concatenating the skip, channels produced)
	        decoder_plan = [(c8 + c8, c8), (c8 + c4, c4), (c4 + c2, c2), (c2 + c1, c1)]
	        self.up_blocks = nn.ModuleList(
	            [
	                nn.Sequential(
	                    nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
	                    self._make_conv_block(fused, width),
	                )
	                for fused, width in decoder_plan
	            ]
	        )

	        self.conv_out = nn.Sequential(
	            nn.GroupNorm(8, c1),
	            nn.SiLU(),
	            nn.Conv2d(c1, out_channels, 3, padding=1),
	        )

	    def _make_conv_block(self, in_ch, out_ch):
	        """Return two 3x3 conv + GroupNorm + SiLU stages mapping in_ch to out_ch."""
	        # Group count is out_ch // 4, kept within 1..32.
	        groups = min(32, max(1, out_ch // 4))
	        layers = [
	            nn.Conv2d(in_ch, out_ch, 3, padding=1),
	            nn.GroupNorm(groups, out_ch),
	            nn.SiLU(),
	            nn.Conv2d(out_ch, out_ch, 3, padding=1),
	            nn.GroupNorm(groups, out_ch),
	            nn.SiLU(),
	        ]
	        return nn.Sequential(*layers)

	    def _make_down_block(self, in_ch, out_ch):
	        """Return a 2x max-pool followed by a conv block."""
	        pool = nn.MaxPool2d(2)
	        convs = self._make_conv_block(in_ch, out_ch)
	        return nn.Sequential(pool, convs)

	    def get_time_embedding(self, timesteps):
	        """Encode a 1-D tensor of timesteps into a (batch, 448) embedding."""
	        half = self.time_embed_dim // 2
	        decay = math.log(10000) / (half - 1)
	        frequencies = torch.exp(torch.arange(half, device=timesteps.device) * -decay)
	        angles = timesteps[:, None] * frequencies[None, :]
	        sinusoid = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
	        return self.time_embed(sinusoid)

	    def forward(self, x, timesteps, attributes):
	        """Run the network on ``x`` conditioned on timesteps and attributes.

	        Args:
	            x: Image batch with ``in_channels`` channels.
	            timesteps: 1-D tensor with one timestep per batch item.
	            attributes: Attribute batch whose last dimension is ``attr_dim``.

	        Returns:
	            A tensor with ``out_channels`` channels at the resolution of ``x``.
	        """
	        conditioning = self.get_time_embedding(timesteps) + self.attr_embed(attributes)

	        hidden = self.conv_in(x)
	        encoder_outputs = [hidden]
	        for stage in self.down_blocks:
	            hidden = stage(hidden)
	            encoder_outputs.append(hidden)

	        # Tile the conditioning vector over the bottleneck grid and append it
	        # as extra channels.
	        tiled = conditioning[:, :, None, None].expand(
	            -1, -1, hidden.shape[2], hidden.shape[3]
	        )
	        hidden = self.mid_block(torch.cat([hidden, tiled], dim=1))

	        # The deepest encoder output is the bottleneck input itself, so it is
	        # dropped; the remaining skips are consumed deepest first.
	        skips = encoder_outputs[:-1]
	        skips.reverse()

	        for decoder, skip in zip(self.up_blocks, skips):
	            upsample, fuse = decoder[0], decoder[1]
	            hidden = upsample(hidden)

	            # Odd input sizes can leave the upsampled grid off by a pixel.
	            if hidden.shape[2:] != skip.shape[2:]:
	                hidden = F.interpolate(
	                    hidden, size=skip.shape[2:], mode='bilinear', align_corners=False
	                )

	            hidden = fuse(torch.cat([hidden, skip], dim=1))

	        return self.conv_out(hidden)


	class CartoonifyProcessor:
	    """Load the cartoon diffusion model and generate avatars from photos.

	    Facial attributes are extracted from the uploaded photo and used to
	    condition a denoising loop that starts from random noise.
	    """

	    def __init__(self, model_path="wizcodes12/image_to_cartoonify"):
	        """Create the feature extractor and load the model.

	        Args:
	            model_path: Hugging Face Hub repository id holding the checkpoint.
	        """
	        self.device = device
	        self.model = None
	        self.noise_scheduler = None
	        self.mp_extractor = OptimizedMediaPipeExtractor()
	        self.load_model(model_path)

	    def load_model(self, model_path):
	        """Download the checkpoint, build the U-Net and create the scheduler.

	        Args:
	            model_path: Hub repository id containing ``image_to_cartoonify.pt``.

	        Raises:
	            Exception: Any download or loading error is printed and re-raised.
	        """
	        try:
	            from huggingface_hub import hf_hub_download

	            model_file = hf_hub_download(
	                repo_id=model_path,
	                filename="image_to_cartoonify.pt",
	                repo_type="model"
	            )

	            # NOTE(security): torch.load unpickles the downloaded file, which
	            # can execute arbitrary code. Only point model_path at a trusted
	            # repository; consider weights_only=True if the checkpoint
	            # contains nothing but tensors.
	            checkpoint = torch.load(model_file, map_location=self.device)

	            self.model = OptimizedConditionedUNet(
	                in_channels=3,
	                out_channels=3,
	                attr_dim=18,
	                base_channels=64
	            ).to(self.device)

	            self.model.load_state_dict(checkpoint['model_state_dict'])
	            self.model.eval()

	            self.noise_scheduler = DDPMScheduler(
	                num_train_timesteps=1000,
	                beta_start=0.00085,
	                beta_end=0.012,
	                beta_schedule="scaled_linear",
	                prediction_type="epsilon"
	            )

	            print("Model loaded successfully!")

	        except Exception as e:
	            print(f"Error loading model: {e}")
	            raise

	    def process_image(self, image):
	        """Generate a 256x256 cartoon avatar for the given photo.

	        Args:
	            image: A PIL image or a NumPy array, or ``None``.

	        Returns:
	            A PIL image, or ``None`` when no image was given or processing
	            failed (the error is printed).
	        """
	        if image is None:
	            return None

	        try:
	            if isinstance(image, np.ndarray):
	                image = Image.fromarray(image)

	            # Uploads may be RGBA, grayscale or palettised; the extractor only
	            # accepts 3-channel input and would otherwise silently fall back
	            # to its default features.
	            image = image.convert("RGB").resize((256, 256))
	            rgb_array = np.array(image)

	            # The extractor treats arrays as BGR (it applies BGR->RGB before
	            # face detection), so flip the PIL channel order first.
	            bgr_array = np.ascontiguousarray(rgb_array[:, :, ::-1])

	            features = self.mp_extractor.extract_features(bgr_array)
	            features = features.unsqueeze(0).to(self.device)

	            with torch.no_grad():
	                generated_image = torch.randn(1, 3, 256, 256).to(self.device)

	                # Fewer denoising steps on CPU to keep latency tolerable.
	                num_inference_steps = 25 if self.device.type == 'cuda' else 15
	                self.noise_scheduler.set_timesteps(num_inference_steps)

	                for t in self.noise_scheduler.timesteps:
	                    timesteps = torch.full((1,), t, device=self.device).long()
	                    noise_pred = self.model(generated_image, timesteps, features)
	                    generated_image = self.noise_scheduler.step(noise_pred, t, generated_image).prev_sample

	                # Map from [-1, 1] to [0, 1], then to 8-bit HWC.
	                generated_image = (generated_image / 2 + 0.5).clamp(0, 1)
	                generated_image = generated_image.cpu().squeeze(0).permute(1, 2, 0).numpy()
	                generated_image = (generated_image * 255).astype(np.uint8)

	            return Image.fromarray(generated_image)

	        except Exception as e:
	            print(f"Error processing image: {e}")
	            return None


	# Shared processor instance built at import time; constructing it calls
	# load_model(), and the click handler in create_interface() uses it.
	processor = CartoonifyProcessor()

	def create_interface():
	    """Build the Gradio Blocks UI and wire its event handlers.

	    Returns:
	        The ``gr.Blocks`` app, ready for ``launch()``.
	    """
	    with gr.Blocks(title="Image to Cartoonify - wizcodes12", theme=gr.themes.Soft()) as demo:

	        gr.HTML("""
	        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 10px; margin-bottom: 20px;">
	        <h1 style="color: white; font-size: 2.5em; margin-bottom: 10px;">🎨 Image to Cartoonify</h1>
	        <p style="color: rgba(255,255,255,0.9); font-size: 1.2em;">by wizcodes12</p>
	        <p style="color: rgba(255,255,255,0.8);">Transform your photos into stunning cartoon avatars with AI</p>
	        </div>
	        """)

	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### 📸 Upload Your Photo")
	                input_image = gr.Image(
	                    label="Upload Image",
	                    type="pil",
	                    height=300
	                )

	                process_btn = gr.Button(
	                    "🎨 Convert to Cartoon",
	                    variant="primary",
	                    size="lg"
	                )

	                gr.Markdown("### 📋 Instructions")
	                gr.Markdown("""
	                1. Upload a clear photo with a visible face
	                2. Click 'Convert to Cartoon' button
	                3. Wait for AI processing (15-30 seconds)
	                4. Download your cartoon avatar
	                """)

	            with gr.Column():
	                gr.Markdown("### 🎭 Cartoon Result")
	                output_image = gr.Image(
	                    label="Cartoon Avatar",
	                    type="pil",
	                    height=400
	                )

	                status = gr.Textbox(
	                    label="Status",
	                    value="Ready to process your image!",
	                    interactive=False
	                )

	        gr.HTML("""
	        <div style="text-align: center; padding: 20px; margin-top: 20px; background: rgba(0,0,0,0.05); border-radius: 10px;">
	        <p><strong>🚀 Powered by:</strong> Advanced AI Neural Networks</p>
	        <p><strong>🎯 Features:</strong> Face Detection • Feature Extraction • Cartoon Style Transfer</p>
	        <p><strong>👨‍💻 Developer:</strong> wizcodes12</p>
	        </div>
	        """)

	        def process_image_wrapper(image):
	            """Run the processor and return (result image, status message)."""
	            if image is None:
	                return None, "Please upload an image first!"

	            try:
	                result = processor.process_image(image)
	            except Exception as e:
	                return None, f"Error: {str(e)}"

	            # process_image reports its own failures by returning None.
	            if result is None:
	                return None, "Error: Could not process image. Please try another image."
	            return result, "Cartoon generated successfully! 🎉"

	        def upload_status(uploaded):
	            """Return the status line shown after the input image changes."""
	            if uploaded is not None:
	                return "Image uploaded! Click 'Convert to Cartoon' to process."
	            return "Please upload an image."

	        process_btn.click(
	            fn=process_image_wrapper,
	            inputs=[input_image],
	            outputs=[output_image, status]
	        )

	        input_image.change(
	            fn=upload_status,
	            inputs=[input_image],
	            outputs=[status]
	        )

	    return demo

	if __name__ == "__main__":
	    # Startup banner, then build the UI and serve it.
	    print("🚀 Starting Image to Cartoonify by wizcodes12...")
	    print(f"Device: {device}")

	    # Listen on all interfaces on port 7860 and request a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }
	    demo = create_interface()
	    demo.launch(**launch_options)