promptbackgroundchangedeprecated

Runtime error

App Files Files Community

HAL1993 committed on Jun 28, 2025

Commit

9a7039a

verified ·

1 Parent(s): 5c10790

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -87

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ from briarmbg import BriaRMBG
 from enum import Enum
 import requests
-# Model setup (unchanged)
 sd15_name = 'stablediffusionapi/realistic-vision-v51'
 tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
 text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
@@ -23,7 +23,7 @@ vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
 unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")
 rmbg = BriaRMBG.from_pretrained("briaai/RMBG-1.4")
-# Change UNet (unchanged)
 with torch.no_grad():
     new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
     new_conv_in.weight.zero_()
@@ -42,27 +42,26 @@ def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
 unet.forward = hooked_unet_forward
-# Load model (unchanged)
 model_path = './models/iclight_sd15_fc.safetensors'
 sd_offset = sf.load_file(model_path)
 sd_origin = unet.state_dict()
-keys = sd_origin.keys()
 sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
 unet.load_state_dict(sd_merged, strict=True)
-del sd_offset, sd_origin, sd_merged, keys
-# Device setup (unchanged)
 device = torch.device('cuda')
 text_encoder = text_encoder.to(device=device, dtype=torch.float16)
 vae = vae.to(device=device, dtype=torch.bfloat16)
 unet = unet.to(device=device, dtype=torch.float16)
 rmbg = rmbg.to(device=device, dtype=torch.float32)
-# SDP (unchanged)
 unet.set_attn_processor(AttnProcessor2_0())
 vae.set_attn_processor(AttnProcessor2_0())
-# Samplers (unchanged)
 ddim_scheduler = DDIMScheduler(
     num_train_timesteps=1000,
     beta_start=0.00085,
@@ -89,7 +88,7 @@ dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
     steps_offset=1
 )
-# Pipelines (unchanged)
 t2i_pipe = StableDiffusionPipeline(
     vae=vae,
     text_encoder=text_encoder,
@@ -114,7 +113,7 @@ i2i_pipe = StableDiffusionImg2ImgPipeline(
     image_encoder=None
 )
-# Translation function (unchanged)
 @spaces.GPU
 def translate_albanian_to_english(text):
     if not text.strip():
@@ -132,10 +131,10 @@ def translate_albanian_to_english(text):
             return translated
         except Exception as e:
             if attempt == 1:
-                return f"Përkthimi dështoi: {str(e)}"
-    return f"Përkthimi dështoi"
-# Core processing functions (unchanged)
 @torch.inference_mode()
 def encode_prompt_inner(txt: str):
     max_length = tokenizer.model_max_length
@@ -153,7 +152,6 @@ def encode_prompt_inner(txt: str):
     token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
     conds = text_encoder(token_ids).last_hidden_state
     return conds
 @torch.inference_mode()
@@ -173,7 +171,6 @@ def encode_prompt_pair(positive_prompt, negative_prompt):
     c = torch.cat([p[None, ...] for p in c], dim=1)
     uc = torch.cat([p[None, ...] for p in uc], dim=1)
     return c, uc
 @torch.inference_mode()
@@ -231,6 +228,9 @@ def run_rmbg(img, sigma=0.0):
 @torch.inference_mode()
 def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
     bg_source = BGSource(bg_source)
     input_bg = None
@@ -253,42 +253,75 @@ def process(input_fg, prompt, image_width, image_height, num_samples, seed, step
         image = np.tile(gradient, (1, image_width))
         input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
     else:
-        raise 'Wrong initial latent!'
     rng = torch.Generator(device=device).manual_seed(int(seed))
-    fg = resize_and_center_crop(input_fg, image_width, image_height)
-    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
-    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
-    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)
-    if input_bg is None:
-        latents = t2i_pipe(
-            prompt_embeds=conds,
-            negative_prompt_embeds=unconds,
-            width=image_width,
-            height=image_height,
-            num_inference_steps=steps,
-            num_images_per_prompt=num_samples,
-            generator=rng,
-            output_type='latent',
-            guidance_scale=cfg,
-            cross_attention_kwargs={'concat_conds': concat_conds},
-        ).images.to(vae.dtype) / vae.config.scaling_factor
-    else:
-        bg = resize_and_center_crop(input_bg, image_width, image_height)
-        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
-        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
         latents = i2i_pipe(
-            image=bg_latent,
-            strength=lowres_denoise,
             prompt_embeds=conds,
             negative_prompt_embeds=unconds,
             width=image_width,
             height=image_height,
-            num_inference_steps=int(round(steps / lowres_denoise)),
             num_images_per_prompt=num_samples,
             generator=rng,
             output_type='latent',
@@ -296,54 +329,28 @@ def process(input_fg, prompt, image_width, image_height, num_samples, seed, step
             cross_attention_kwargs={'concat_conds': concat_conds},
         ).images.to(vae.dtype) / vae.config.scaling_factor
-    pixels = vae.decode(latents).sample
-    pixels = pytorch2numpy(pixels)
-    pixels = [resize_without_crop(
-        image=p,
-        target_width=int(round(image_width * highres_scale / 64.0) * 64),
-        target_height=int(round(image_height * highres_scale / 64.0) * 64))
-    for p in pixels]
-    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
-    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
-    latents = latents.to(device=unet.device, dtype=unet.dtype)
-    image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
-    fg = resize_and_center_crop(input_fg, image_width, image_height)
-    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
-    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
-    latents = i2i_pipe(
-        image=latents,
-        strength=highres_denoise,
-        prompt_embeds=conds,
-        negative_prompt_embeds=unconds,
-        width=image_width,
-        height=image_height,
-        num_inference_steps=int(round(steps / highres_denoise)),
-        num_images_per_prompt=num_samples,
-        generator=rng,
-        output_type='latent',
-        guidance_scale=cfg,
-        cross_attention_kwargs={'concat_conds': concat_conds},
-        ).images.to(vae.dtype) / vae.config.scaling_factor
-    pixels = vae.decode(latents).sample
-    return pytorch2numpy(pixels)
 @spaces.GPU
 @torch.inference_mode()
 def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
     # Translate Albanian prompt to English
-    prompt_english = translate_albanian_to_english(prompt)
-    if prompt_english.startswith("Përkthimi dështoi"):
-        return None, None
     input_fg, matting = run_rmbg(input_fg)
-    results = process(input_fg, prompt_english, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source)
-    return input_fg, results
 # Enum for background source (translated to Albanian)
 class BGSource(Enum):
@@ -402,7 +409,7 @@ def create_demo():
         """)
         gr.Markdown("# Rindriço Imazhin")
-        gr.Markdown("Rindriço imazhin duke ndryshuar sfondin bazuar në përshkrimin e dhënë")
         with gr.Row():
             with gr.Column(elem_classes="constrained-container"):
@@ -412,7 +419,7 @@ def create_demo():
                 aspect_ratio = gr.Radio(choices=["9:16", "1:1", "16:9"], value="1:1", label="Raporti i Aspektit")
                 relight_button = gr.Button(value="Rindriço")
                 result_image = gr.Image(label="Rezultati", type="numpy", height=480, width=480, elem_classes="constrained-container")
-                # Hidden components for other parameters and output_bg
                 image_width = gr.Slider(label="Gjerësia e Imazhit", minimum=256, maximum=1024, value=640, step=64, visible=False)
                 image_height = gr.Slider(label="Lartësia e Imazhit", minimum=256, maximum=1024, value=640, step=64, visible=False)
                 num_samples = gr.Slider(label="Numri i Imazheve", minimum=1, maximum=12, value=1, step=1, visible=False)
@@ -424,7 +431,6 @@ def create_demo():
                 highres_scale = gr.Slider(label="Shkalla e Rezolutës së Lartë", minimum=1.0, maximum=3.0, value=2, step=0.01, visible=False)
                 highres_denoise = gr.Slider(label="Denoise i Rezolutës së Lartë", minimum=0.1, maximum=1.0, value=0.5, step=0.01, visible=False)
                 lowres_denoise = gr.Slider(label="Denoise i Rezolutës së Ulët", minimum=0.1, maximum=1.0, value=0.9, step=0.01, visible=False)
-                output_bg = gr.Image(type="numpy", label="Parapërpunimi i Planit të Parë", visible=False)
         # Update hidden sliders based on aspect ratio
         aspect_ratio.change(
@@ -438,7 +444,7 @@ def create_demo():
             input_fg, prompt, image_width, image_height, num_samples, seed, steps,
             a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source
         ]
-        relight_button.click(fn=process_relight, inputs=ips, outputs=[output_bg, result_image])
     return block

 from enum import Enum
 import requests
+# Model setup
 sd15_name = 'stablediffusionapi/realistic-vision-v51'
 tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
 text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
 unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")
 rmbg = BriaRMBG.from_pretrained("briaai/RMBG-1.4")
+# Change UNet
 with torch.no_grad():
     new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
     new_conv_in.weight.zero_()
 unet.forward = hooked_unet_forward
+# Load model
 model_path = './models/iclight_sd15_fc.safetensors'
 sd_offset = sf.load_file(model_path)
 sd_origin = unet.state_dict()
 sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
 unet.load_state_dict(sd_merged, strict=True)
+del sd_offset, sd_origin, sd_merged
+# Device setup
 device = torch.device('cuda')
 text_encoder = text_encoder.to(device=device, dtype=torch.float16)
 vae = vae.to(device=device, dtype=torch.bfloat16)
 unet = unet.to(device=device, dtype=torch.float16)
 rmbg = rmbg.to(device=device, dtype=torch.float32)
+# SDP
 unet.set_attn_processor(AttnProcessor2_0())
 vae.set_attn_processor(AttnProcessor2_0())
+# Samplers
 ddim_scheduler = DDIMScheduler(
     num_train_timesteps=1000,
     beta_start=0.00085,
     steps_offset=1
 )
+# Pipelines
 t2i_pipe = StableDiffusionPipeline(
     vae=vae,
     text_encoder=text_encoder,
     image_encoder=None
 )
+# Translation function
 @spaces.GPU
 def translate_albanian_to_english(text):
     if not text.strip():
             return translated
         except Exception as e:
             if attempt == 1:
+                raise gr.Error(f"Përkthimi dështoi: {str(e)}")
+    raise gr.Error("Përkthimi dështoi. Ju lutem provoni përsëri.")
+# Core processing functions
 @torch.inference_mode()
 def encode_prompt_inner(txt: str):
     max_length = tokenizer.model_max_length
     token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
     conds = text_encoder(token_ids).last_hidden_state
     return conds
 @torch.inference_mode()
     c = torch.cat([p[None, ...] for p in c], dim=1)
     uc = torch.cat([p[None, ...] for p in uc], dim=1)
     return c, uc
 @torch.inference_mode()
 @torch.inference_mode()
 def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
+    if input_fg is None:
+        raise gr.Error("Ju lutem ngarkoni një imazh.")
     bg_source = BGSource(bg_source)
     input_bg = None
         image = np.tile(gradient, (1, image_width))
         input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
     else:
+        raise gr.Error("Preferenca e ndriçimit është e pavlefshme!")
     rng = torch.Generator(device=device).manual_seed(int(seed))
+    try:
+        fg = resize_and_center_crop(input_fg, image_width, image_height)
+        concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
+        concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
+        conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)
+        if input_bg is None:
+            latents = t2i_pipe(
+                prompt_embeds=conds,
+                negative_prompt_embeds=unconds,
+                width=image_width,
+                height=image_height,
+                num_inference_steps=steps,
+                num_images_per_prompt=num_samples,
+                generator=rng,
+                output_type='latent',
+                guidance_scale=cfg,
+                cross_attention_kwargs={'concat_conds': concat_conds},
+            ).images.to(vae.dtype) / vae.config.scaling_factor
+        else:
+            bg = resize_and_center_crop(input_bg, image_width, image_height)
+            bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
+            bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
+            latents = i2i_pipe(
+                image=bg_latent,
+                strength=lowres_denoise,
+                prompt_embeds=conds,
+                negative_prompt_embeds=unconds,
+                width=image_width,
+                height=image_height,
+                num_inference_steps=int(round(steps / lowres_denoise)),
+                num_images_per_prompt=num_samples,
+                generator=rng,
+                output_type='latent',
+                guidance_scale=cfg,
+                cross_attention_kwargs={'concat_conds': concat_conds},
+            ).images.to(vae.dtype) / vae.config.scaling_factor
+        pixels = vae.decode(latents).sample
+        pixels = pytorch2numpy(pixels)
+        pixels = [resize_without_crop(
+            image=p,
+            target_width=int(round(image_width * highres_scale / 64.0) * 64),
+            target_height=int(round(image_height * highres_scale / 64.0) * 64))
+        for p in pixels]
+        pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
+        latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
+        latents = latents.to(device=unet.device, dtype=unet.dtype)
+        image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
+        fg = resize_and_center_crop(input_fg, image_width, image_height)
+        concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
+        concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
         latents = i2i_pipe(
+            image=latents,
+            strength=highres_denoise,
             prompt_embeds=conds,
             negative_prompt_embeds=unconds,
             width=image_width,
             height=image_height,
+            num_inference_steps=int(round(steps / highres_denoise)),
             num_images_per_prompt=num_samples,
             generator=rng,
             output_type='latent',
             cross_attention_kwargs={'concat_conds': concat_conds},
         ).images.to(vae.dtype) / vae.config.scaling_factor
+        pixels = vae.decode(latents).sample
+        results = pytorch2numpy(pixels)
+        return results[0]  # Return single image since num_samples=1
+    except Exception as e:
+        raise gr.Error(f"Gabim gjatë përpunimit të imazhit: {str(e)}")
 @spaces.GPU
 @torch.inference_mode()
 def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
+    if input_fg is None:
+        raise gr.Error("Ju lutem ngarkoni një imazh.")
     # Translate Albanian prompt to English
+    prompt_english = translate_albanian_to_english(prompt.strip()) if prompt.strip() else ""
+    # Run background removal
     input_fg, matting = run_rmbg(input_fg)
+    # Process the image
+    result = process(input_fg, prompt_english, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source)
+    return result
 # Enum for background source (translated to Albanian)
 class BGSource(Enum):
         """)
         gr.Markdown("# Rindriço Imazhin")
+        gr.Markdown("Rindriço imazhin duke ndryshuar ndriçimin e sfondit bazuar në përshkrimin e dhënë")
         with gr.Row():
             with gr.Column(elem_classes="constrained-container"):
                 aspect_ratio = gr.Radio(choices=["9:16", "1:1", "16:9"], value="1:1", label="Raporti i Aspektit")
                 relight_button = gr.Button(value="Rindriço")
                 result_image = gr.Image(label="Rezultati", type="numpy", height=480, width=480, elem_classes="constrained-container")
+                # Hidden components for other parameters
                 image_width = gr.Slider(label="Gjerësia e Imazhit", minimum=256, maximum=1024, value=640, step=64, visible=False)
                 image_height = gr.Slider(label="Lartësia e Imazhit", minimum=256, maximum=1024, value=640, step=64, visible=False)
                 num_samples = gr.Slider(label="Numri i Imazheve", minimum=1, maximum=12, value=1, step=1, visible=False)
                 highres_scale = gr.Slider(label="Shkalla e Rezolutës së Lartë", minimum=1.0, maximum=3.0, value=2, step=0.01, visible=False)
                 highres_denoise = gr.Slider(label="Denoise i Rezolutës së Lartë", minimum=0.1, maximum=1.0, value=0.5, step=0.01, visible=False)
                 lowres_denoise = gr.Slider(label="Denoise i Rezolutës së Ulët", minimum=0.1, maximum=1.0, value=0.9, step=0.01, visible=False)
         # Update hidden sliders based on aspect ratio
         aspect_ratio.change(
             input_fg, prompt, image_width, image_height, num_samples, seed, steps,
             a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source
         ]
+        relight_button.click(fn=process_relight, inputs=ips, outputs=result_image)
     return block