import gradio as gr import torch from diffusers import StableDiffusionPipeline import os # Monkeypatch fixes for environment compatibility def apply_patches(): """Apply necessary patches for tqdm and symlinks""" import sys import shutil # 1. Fix tqdm Jupyter/Thread Error try: from tqdm.auto import tqdm if not hasattr(tqdm, '_is_patched'): import tqdm.notebook import tqdm.std tqdm.notebook.tqdm = tqdm.std.tqdm tqdm.notebook.trange = tqdm.std.trange if 'tqdm.auto' in sys.modules: sys.modules['tqdm.auto'].tqdm = tqdm.std.tqdm sys.modules['tqdm.auto'].trange = tqdm.std.trange tqdm._is_patched = True except ImportError: pass # 2. Fix Windows Symlink Permissions try: from huggingface_hub import file_download if not hasattr(file_download, '_original_create_symlink'): file_download._original_create_symlink = file_download._create_symlink def patched_create_symlink(src, dst, new_blob=False): try: file_download._original_create_symlink(src, dst, new_blob) except OSError as e: if getattr(e, 'winerror', 0) == 1314: if os.path.isdir(src): shutil.copytree(src, dst) else: shutil.copy2(src, dst) else: raise file_download._create_symlink = patched_create_symlink except ImportError: pass # Apply patches before loading models apply_patches() # Style configurations with default seeds STYLES = { "Cat Toy": { "repo": "sd-concepts-library/cat-toy", "token": "", "description": "Cute cat toy aesthetic", "default_seed": 42 }, "Seletti": { "repo": "sd-concepts-library/seletti", "token": "", "description": "Seletti design style", "default_seed": 142 }, "Madhubani Art": { "repo": "sd-concepts-library/madhubani-art", "token": "", "description": "Traditional Indian Madhubani art style", "default_seed": 242 }, "Chucky": { "repo": "sd-concepts-library/chucky", "token": "", "description": "Chucky horror character style", "default_seed": 342 }, "Indian Watercolor Portraits": { "repo": "sd-concepts-library/indian-watercolor-portraits", "token": "", "description": "Indian watercolor portrait art style", "default_seed": 442 }, "Anime Boy": { "repo": "sd-concepts-library/anime-boy", "token": "", "description": "Anime boy character style", "default_seed": 542 } } # Global pipeline variable pipe = None current_style = None def contrast_loss(images): """Calculate High-Contrast loss (maximizes variance/extremes)""" return -torch.mean((images - 0.5) ** 2) def complexity_loss(images): """Calculate Complexity loss (maximizes local detail/edges)""" diff_h = torch.abs(images[:, :, 1:, :] - images[:, :, :-1, :]) diff_v = torch.abs(images[:, :, :, 1:] - images[:, :, :, :-1]) return torch.mean(diff_h) + torch.mean(diff_v) def vibrancy_loss(images): """Calculate Vibrancy loss (maximizes color saturation/variety)""" # Maximize standard deviation across color channels # Or boost the distance from grayscale means = torch.mean(images, dim=1, keepdim=True) return -torch.mean((images - means) ** 2) def custom_sampling_loop(prompt, pipe, guidance_scale=7.5, contrast_scale=0.0, complexity_scale=0.0, vibrancy_scale=0.0, num_inference_steps=50, generator=None, num_images=1): device = pipe.device dtype = pipe.unet.dtype text_input = pipe.tokenizer([prompt] * num_images, padding="max_length", max_length=pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt") text_embeddings = pipe.text_encoder(text_input.input_ids.to(device))[0] uncond_input = pipe.tokenizer([""] * num_images, padding="max_length", max_length=text_input.input_ids.shape[-1], return_tensors="pt") uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(device))[0] text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) latents = torch.randn((num_images, pipe.unet.config.in_channels, 512 // 8, 512 // 8), generator=generator, device=device, dtype=dtype) pipe.scheduler.set_timesteps(num_inference_steps) latents = latents * pipe.scheduler.init_noise_sigma from tqdm.auto import tqdm for t in tqdm(pipe.scheduler.timesteps): latent_model_input = torch.cat([latents] * 2) latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t) with torch.no_grad(): noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # COMBINED GUIDANCE GRADIENT STEP if contrast_scale > 0 or complexity_scale > 0 or vibrancy_scale > 0: latents = latents.detach().requires_grad_(True) image = pipe.vae.decode(1 / 0.18215 * latents).sample image = (image / 2 + 0.5).clamp(0, 1) loss = 0 if contrast_scale > 0: loss = loss + contrast_loss(image) * contrast_scale if complexity_scale > 0: loss = loss - complexity_loss(image) * complexity_scale if vibrancy_scale > 0: loss = loss + vibrancy_loss(image) * vibrancy_scale cond_grad = torch.autograd.grad(loss, latents)[0] latents = latents.detach() - cond_grad latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample with torch.no_grad(): image = pipe.vae.decode(1 / 0.18215 * latents).sample image = (image / 2 + 0.5).clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).numpy() return pipe.numpy_to_pil(image) def initialize_pipeline(): """Initialize the Stable Diffusion pipeline""" global pipe if pipe is None: print("Loading Stable Diffusion pipeline...") device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 if device == "cuda" else torch.float32 pipe = StableDiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", torch_dtype=dtype, use_safetensors=True, safety_checker=None ).to(device) # Performance optimizations if device == "cuda": pipe.enable_attention_slicing() # Try to use xformers if available try: pipe.enable_xformers_memory_efficient_attention() print("xformers enabled") except Exception: pass print(f"Pipeline loaded on {device} with dtype {dtype}") return pipe def load_style(style_name): """Load a textual inversion style idempotently""" global current_style, pipe if pipe is None: initialize_pipeline() style_config = STYLES[style_name] token = style_config["token"] # Check if the token is already in the tokenizer to avoid ValueError if token not in pipe.tokenizer.get_vocab(): print(f"Loading style: {style_name} with token {token}") device = "cuda" if torch.cuda.is_available() else "cpu" try: # Load the inversion pipe.load_textual_inversion(style_config["repo"]) # Crucial: move back to device as load_textual_inversion # can sometimes mess with device placement of embeddings pipe.to(device) print(f"Style {style_name} loaded successfully") except Exception as e: print(f"Error loading style {style_name}: {e}") if "already in tokenizer vocabulary" in str(e): print(f"Token {token} already exists, skipping load.") else: raise e else: print(f"Style {style_name} (token {token}) already in tokenizer, skipping load.") current_style = style_name def generate_image(prompt, style_name, seed, num_inference_steps, guidance_scale, contrast_scale, complexity_scale, vibrancy_scale, num_images=3): """Generate multiple images with the selected style""" try: load_style(style_name) style_token = STYLES[style_name]["token"] final_prompt = prompt.replace("