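# Style-transfer helper for Stable Diffusion: loads textual-inversion style
# embeddings and exposes generate_artwork() for styled image generation.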
import torch
from diffusers import StableDiffusionPipeline
from torch import autocast
from pathlib import Path
import traceback
class StyleTransfer:
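    """Lazily-initialized singleton holding the diffusion pipeline and the
    placeholder tokens for each loaded style embedding."""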
_instance = None
@classmethod
def get_instance(cls):
if cls._instance is None:
cls._instance = cls()
return cls._instance
def __init__(self):
self.pipeline = None
self.style_tokens = []
self.styles = [
"dhoni",
"mickey_mouse",
"balloon",
"lion_king",
"rose_flower"
]
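        # Each entry maps to a "<style>.bin" textual-inversion embedding file
        # loaded in initialize_pipeline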
self.style_names = [
"Dhoni Style",
"Mickey Mouse Style",
"Balloon Style",
"Lion King Style",
"Rose Flower Style"
]
self.is_initialized = False
self.device = "cuda" if torch.cuda.is_available() else "cpu"
if self.device == "cpu":
print("NVIDIA GPU not found. Running on CPU (this will be slower)")
def initialize_pipeline(self):
if self.is_initialized:
return
try:
print("Initializing Stable Diffusion model...")
model_id = "runwayml/stable-diffusion-v1-5"
self.pipeline = StableDiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
safety_checker=None
)
self.pipeline = self.pipeline.to(self.device)
            # Style embedding .bin files are expected one level above this
            # file's directory
            current_dir = Path(__file__).parent.parent
for style, style_name in zip(self.styles, self.style_names):
style_path = current_dir / f"{style}.bin"
if not style_path.exists():
raise FileNotFoundError(f"Style embedding not found: {style_path}")
print(f"Loading style: {style_name}")
token = self._load_style_embedding(str(style_path))
self.style_tokens.append(token)
print(f"✓ Loaded style: {style_name}")
self.is_initialized = True
print(f"Model initialization complete! Using device: {self.device}")
except Exception as e:
print(f"Error during initialization: {str(e)}")
print(traceback.format_exc())
raise
def _load_style_embedding(self, embedding_path, token=None):
        # A learned_embeds .bin file maps the trained placeholder token to its
        # embedding tensor, e.g. {"<style-token>": tensor of shape [dim]}
        loaded_embeds = torch.load(embedding_path, map_location="cpu")
        trained_token = list(loaded_embeds.keys())[0]
        embeds = loaded_embeds[trained_token]
        # Some embeddings are saved with a leading batch dimension; flatten to 1-D
        if embeds.dim() > 1:
            embeds = embeds.squeeze(0)
        # Get the expected embedding width from the text encoder
        expected_dim = self.pipeline.text_encoder.get_input_embeddings().weight.shape[1]
        current_dim = embeds.shape[0]
# Resize embeddings if dimensions don't match
if current_dim != expected_dim:
print(f"Resizing embedding from {current_dim} to {expected_dim}")
if current_dim > expected_dim:
embeds = embeds[:expected_dim]
else:
padding = torch.zeros(expected_dim - current_dim, device=embeds.device, dtype=embeds.dtype)
embeds = torch.cat([embeds, padding], dim=0)
        # Cast to the dtype of the text encoder's embedding table; no batch
        # dimension is added because a single 1-D row is assigned below
        dtype = self.pipeline.text_encoder.get_input_embeddings().weight.dtype
        embeds = embeds.to(dtype)
# Add the token in tokenizer and handle embedding resize
token = token if token is not None else trained_token
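        # tokenizer.add_tokens returns the number of tokens actually added
        # (0 if the token already exists in the vocabulary)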
num_added_tokens = self.pipeline.tokenizer.add_tokens(token)
if num_added_tokens > 0:
# Safely resize token embeddings
self.pipeline.text_encoder.resize_token_embeddings(len(self.pipeline.tokenizer))
# Get the id for the token and assign the embeds
token_id = self.pipeline.tokenizer.convert_tokens_to_ids(token)
if token_id < self.pipeline.text_encoder.get_input_embeddings().weight.shape[0]:
self.pipeline.text_encoder.get_input_embeddings().weight.data[token_id] = embeds
else:
print(f"Warning: Token ID {token_id} is out of bounds. Skipping embedding assignment.")
return token
def generate_artwork(self, prompt, selected_style):
try:
# Find the index of the selected style
style_idx = self.style_names.index(selected_style)
# Generate single image with selected style
styled_prompt = f"{prompt}, {self.style_tokens[style_idx]}"
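            # The style token is the textual-inversion placeholder registered
            # in _load_style_embedding; mentioning it in the prompt activates
            # the learned style embedding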
# Set seed for reproducibility
generator_seed = 42
torch.manual_seed(generator_seed)
if self.device == "cuda":
torch.cuda.manual_seed(generator_seed)
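            # The explicit torch.Generator passed to each pipeline call below
            # carries the same seed, so both images start from the same noise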
# Generate base image
with autocast(self.device):
base_image = self.pipeline(
styled_prompt,
num_inference_steps=50,
guidance_scale=7.5,
generator=torch.Generator(self.device).manual_seed(generator_seed)
).images[0]
# Generate same image with color enhancement
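            # Note: callback/callback_steps are the pre-0.22 diffusers API
            # (newer releases use callback_on_step_end); the callback receives
            # the live latents tensor, so _enhance_colors mutates it in place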
with autocast(self.device):
enhanced_image = self.pipeline(
styled_prompt,
num_inference_steps=50,
guidance_scale=7.5,
callback=self._enhance_colors,
callback_steps=5,
generator=torch.Generator(self.device).manual_seed(generator_seed)
).images[0]
return base_image, enhanced_image
except Exception as e:
print(f"Error in generate_artwork: {e}")
raise
    def _enhance_colors(self, i, t, latents):
        # callback_steps=5 already limits calls to every 5th step; the guard
        # below is kept as a cheap safety check
        if i % 5 == 0:
            try:
                # The pipeline call runs under torch.no_grad(), so autograd
                # must be re-enabled locally to differentiate the loss
                with torch.enable_grad():
                    # Work on a detached copy that tracks gradients
                    latents_copy = latents.detach().clone().requires_grad_(True)
                    loss = self._calculate_color_distance(latents_copy)
                    grads = torch.autograd.grad(
                        outputs=loss,
                        inputs=latents_copy,
                        allow_unused=True,
                        retain_graph=False
                    )
                if grads and grads[0] is not None:
                    grad_tensor = grads[0].detach()
                    if grad_tensor.shape == latents.shape:
                        # The pipeline ignores the callback's return value, so
                        # apply the gradient step to the latents in place
                        latents.sub_(0.1 * grad_tensor.to(latents.dtype))
            except Exception as e:
                print(f"Error in color enhancement: {e}")
                # Continue without enhancement on error
    def _calculate_color_distance(self, latents):
        # The caller (_enhance_colors) passes a copy that already tracks
        # gradients, so no grad bookkeeping is needed here.
        # Cast to float32 and shift/scale; latents are not strictly bounded
        # to [-1, 1], so this only roughly maps them toward [0, 1]
        latents = latents.float() / 2 + 0.5
        # SD latents have 4 channels, not RGB; use the first three as a rough
        # proxy for color content
        c0 = latents[:, 0:1]
        c1 = latents[:, 1:2]
        c2 = latents[:, 2:3]
        # Mean squared separation between channel pairs; _enhance_colors steps
        # against this loss's gradient
        d01 = ((c0 - c1) ** 2).mean()
        d02 = ((c0 - c2) ** 2).mean()
        d12 = ((c1 - c2) ** 2).mean()
        return (d01 + d02 + d12) * 100  # Scale up the loss
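# Minimal usage sketch (assumes the five style .bin files are present one
# directory above this file, as initialize_pipeline expects; the prompt and
# output filenames are illustrative):
if __name__ == "__main__":
    app = StyleTransfer.get_instance()
    app.initialize_pipeline()
    base, enhanced = app.generate_artwork("a castle at sunset", "Balloon Style")
    base.save("base.png")
    enhanced.save("enhanced.png")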