| import gradio as gr |
| import numpy as np |
| from PIL import Image, ImageDraw |
| from gradio_client import Client, handle_file |
| import random |
| import tempfile |
| import os |
| import logging |
| import torch |
| from diffusers import AutoencoderKL, TCDScheduler |
| from diffusers.models.model_loading_utils import load_state_dict |
| from huggingface_hub import hf_hub_download |
| from pathlib import Path |
| import torchaudio |
| from einops import rearrange |
| from scipy.io import wavfile |
| from transformers import pipeline |
|
|
| |
| from transformers import AutoModelForImageSegmentation |
| from torchvision import transforms |
| from moviepy import VideoFileClip, vfx, concatenate_videoclips, ImageSequenceClip |
| import time |
| from concurrent.futures import ThreadPoolExecutor |
|
|
| |
# Set before any model is loaded below so the loaders see the flag.
# NOTE(review): presumably this lets `transformers` load pickle-based
# checkpoints, which can execute code embedded in the checkpoint file.
# Confirm that every model repository used by this app is trusted.
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
|
|
| |
try:
    import spaces
except Exception:
    # Any failure to import `spaces` (not installed, or it refuses to
    # initialise) falls back to a local no-op shim so the app still starts.
    # `Exception` rather than a bare `except` keeps Ctrl-C / SystemExit working.
    class spaces:
        """No-op stand-in exposing the `spaces.GPU` decorator."""

        @staticmethod
        def GPU(func=None, duration=None):
            """Return the decorated function unchanged.

            Supports both spellings used in this file:

            * ``@spaces.GPU``               -- `func` is the decorated function
            * ``@spaces.GPU(duration=24)``  -- `func` is None, a decorator is returned

            Args:
                func: The function being decorated when used without parentheses.
                duration: Accepted for signature compatibility; ignored here.
            """
            def decorator(wrapped):
                return wrapped

            # Bare usage: Python passes the function itself as the first
            # positional argument, so hand it straight back.
            if callable(func):
                return func
            return decorator
|
|
| |
try:
    import mmaudio
except ImportError:
    # `mmaudio` is not importable yet: install the project found in the current
    # directory in editable mode, then retry the import once.
    import importlib
    import subprocess
    import sys

    # Use this interpreter's pip (not whichever `pip` is first on PATH) and
    # fail loudly if the installation itself fails.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-e", "."],
        check=True,
    )
    # Make the import system notice files created after interpreter start-up.
    importlib.invalidate_caches()
    import mmaudio
|
|
| from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video, |
| setup_eval_logging) |
| from mmaudio.model.flow_matching import FlowMatching |
| from mmaudio.model.networks import MMAudio, get_my_mmaudio |
| from mmaudio.model.sequence_config import SequenceConfig |
| from mmaudio.model.utils.features_utils import FeaturesUtils |
|
|
| |
# Ask torch to use its "medium" float32 matmul precision setting
# (presumably trading some accuracy for speed -- see the torch docs).
torch.set_float32_matmul_precision("medium")
# Target device used by the BiRefNet and MMAudio loaders below.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
| |
# Load the two BiRefNet segmentation models (full and lite) and the matching
# preprocessing pipeline. Any failure is logged and recorded in
# BIREFNET_MODEL_LOADED so the rest of the app can still start.
try:
    # NOTE(review): trust_remote_code=True executes Python shipped in the model repo.
    birefnet = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
    birefnet.to(device)
    birefnet_lite = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
    birefnet_lite.to(device)

    # Preprocessing applied to an image before segmentation: fixed 768x768
    # resize, tensor conversion, per-channel normalisation.
    # (mean/std are presumably the ImageNet statistics -- confirm)
    transform_image = transforms.Compose([
        transforms.Resize((768, 768)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

    BIREFNET_MODEL_LOADED = True
except Exception as e:
    # `transform_image` and the model names may be undefined past this point;
    # callers must check BIREFNET_MODEL_LOADED first.
    logging.error(f"Failed to load BiRefNet models: {str(e)}")
    BIREFNET_MODEL_LOADED = False
|
|
| |
# Build the SDXL fill (outpainting) pipeline: ControlNet-Union weights, an
# fp16 VAE and the RealVisXL Lightning checkpoint, with a TCD scheduler.
# Any failure is logged and recorded in OUTPAINT_MODEL_LOADED.
try:
    # Project-local modules; imported here so a missing file only disables outpainting.
    from controlnet_union import ControlNetModel_Union
    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline

    # Download the ControlNet configuration and instantiate an empty model from it.
    config_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="config_promax.json",
    )

    config = ControlNetModel_Union.load_config(config_file)
    controlnet_model = ControlNetModel_Union.from_config(config)

    # Download the weights and load them into the empty model.
    model_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="diffusion_pytorch_model_promax.safetensors",
    )
    state_dict = load_state_dict(model_file)
    loaded_keys = list(state_dict.keys())

    # NOTE(review): `_load_pretrained_model` is a private diffusers helper;
    # its signature may differ between diffusers versions -- confirm the pin.
    result = ControlNetModel_Union._load_pretrained_model(
        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
    )

    # The first element of the returned tuple is used as the loaded model.
    model = result[0]
    # NOTE(review): "cuda" is hard-coded here and below instead of using the
    # module-level `device`; on a CPU-only host this raises and is caught by
    # the handler at the bottom, which disables outpainting.
    model = model.to(device="cuda", dtype=torch.float16)

    vae = AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
    ).to("cuda")

    pipe = StableDiffusionXLFillPipeline.from_pretrained(
        "SG161222/RealVisXL_V5.0_Lightning",
        torch_dtype=torch.float16,
        vae=vae,
        controlnet=model,
        variant="fp16",
    ).to("cuda")

    # Swap the pipeline's default scheduler for TCD, reusing its configuration.
    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

    OUTPAINT_MODEL_LOADED = True
except Exception as e:
    # `pipe` may be undefined past this point; callers must check
    # OUTPAINT_MODEL_LOADED first.
    logging.error(f"Failed to load outpainting models: {str(e)}")
    OUTPAINT_MODEL_LOADED = False
|
|
| |
# Pick the tensor dtype used when the MMAudio networks are created below:
# bfloat16 on GPU, float32 on CPU.
if torch.cuda.is_available():
    dtype = torch.bfloat16
else:
    # Note: this rebinds `device` from the string "cpu" to a torch.device object.
    device = torch.device("cpu")
    dtype = torch.float32
|
|
| |
# Load everything the video-to-audio tab needs: the MMAudio checkpoint, its
# feature extractors and (optionally) a Korean->English prompt translator.
# Any failure is logged and recorded in MMAUDIO_MODEL_LOADED.
try:
    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
    model_mmaudio.download_if_needed()
    output_dir = Path('./output/gradio')
    setup_eval_logging()

    # The translator is optional: prompts are passed through untranslated
    # when it cannot be loaded.
    try:
        translator = pipeline("translation",
                              model="Helsinki-NLP/opus-mt-ko-en",
                              device="cpu",
                              use_fast=True,
                              trust_remote_code=False)
    except Exception as e:
        logging.warning(f"Failed to load translation model: {e}")
        translator = None

    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
        """Instantiate the MMAudio network and its feature utilities.

        Returns:
            A ``(net, feature_utils, seq_cfg)`` tuple, with both modules moved
            to the module-level `device`/`dtype` and switched to eval mode.
        """
        import contextlib

        # torch.cuda.device() rejects non-CUDA devices, so only enter it when
        # the target really is a GPU; on CPU use a do-nothing context instead.
        if torch.device(device).type == "cuda":
            device_scope = torch.cuda.device(device)
        else:
            device_scope = contextlib.nullcontext()

        with device_scope:
            seq_cfg = model_mmaudio.seq_cfg
            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
            logging.info(f'Loaded weights from {model_mmaudio.model_path}')

            feature_utils = FeaturesUtils(
                tod_vae_ckpt=model_mmaudio.vae_path,
                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
                enable_conditions=True,
                mode=model_mmaudio.mode,
                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
                need_vae_encoder=False
            ).to(device, dtype).eval()

        return net, feature_utils, seq_cfg

    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
    MMAUDIO_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load MMAudio models: {str(e)}")
    MMAUDIO_MODEL_LOADED = False
    # The translator is only used for audio prompts, so drop it as well.
    translator = None
|
|
| |
# Remote Gradio endpoints used by generate_text_to_image() and
# generate_video_from_image().
# NOTE(review): hard-coded plain-HTTP IP addresses -- consider making these
# configurable (e.g. via environment variables).
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"



# NOTE(review): the model loaders above may already have called
# logging.error()/logging.warning(). The stdlib installs a default root
# handler on the first such call, after which basicConfig() is a no-op, so
# this INFO level may not take effect -- consider moving this call to the top.
logging.basicConfig(level=logging.INFO)
|
|
| |
| IMAGE_PRESETS = { |
| "์ปค์คํ
": {"width": 1024, "height": 1024}, |
| "1:1 ์ ์ฌ๊ฐํ": {"width": 1024, "height": 1024}, |
| "4:3 ํ์ค": {"width": 1024, "height": 768}, |
| "16:9 ์์ด๋์คํฌ๋ฆฐ": {"width": 1024, "height": 576}, |
| "9:16 ์ธ๋กํ": {"width": 576, "height": 1024}, |
| "6:19 ํน์ ์ธ๋กํ": {"width": 324, "height": 1024}, |
| "Instagram ์ ์ฌ๊ฐํ": {"width": 1080, "height": 1080}, |
| "Instagram ์คํ ๋ฆฌ": {"width": 1080, "height": 1920}, |
| "Instagram ๊ฐ๋กํ": {"width": 1080, "height": 566}, |
| "Facebook ์ปค๋ฒ": {"width": 820, "height": 312}, |
| "Twitter ํค๋": {"width": 1500, "height": 500}, |
| "YouTube ์ธ๋ค์ผ": {"width": 1280, "height": 720}, |
| "LinkedIn ๋ฐฐ๋": {"width": 1584, "height": 396}, |
| } |
|
|
| |
def update_dimensions(preset):
    """Return the ``(width, height)`` stored for `preset` in IMAGE_PRESETS.

    Falls back to ``(1024, 1024)`` when the preset name is not known.
    """
    size = IMAGE_PRESETS.get(preset)
    if size is None:
        return 1024, 1024
    return size["width"], size["height"]
|
|
def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
    """Request an image from the remote text-to-image endpoint.

    Args:
        prompt: Text prompt; an empty prompt short-circuits with a message.
        width, height: Requested size, sent as ints.
        guidance: Guidance value, sent as a float.
        inference_steps: Number of steps, sent as an int.
        seed: Seed to use; ``-1`` picks a random seed in [0, 9999999].

    Returns:
        ``(image, message)`` -- the first and a formatted second element of
        the endpoint's response on success, ``(None, error message)`` otherwise.
    """
    if not prompt:
        # "\x85" stands for the NEL control character embedded in this literal
        # (it is what renders as a line break in the middle of the string).
        return None, "ํ๋กฌํํธ๋ฅผ ์\x85๋ ฅํด์ฃผ์ธ์"

    try:
        api = Client(TEXT2IMG_API_URL)
        chosen_seed = random.randint(0, 9999999) if seed == -1 else seed

        response = api.predict(
            prompt=prompt,
            width=int(width),
            height=int(height),
            guidance=float(guidance),
            inference_steps=int(inference_steps),
            seed=int(chosen_seed),
            do_img2img=False,
            init_image=None,
            image2image_strength=0.8,
            resize_img=True,
            api_name="/generate_image"
        )
    except Exception as err:
        # Network, endpoint and conversion errors are all reported to the UI
        # instead of being raised.
        logging.error(f"Image generation error: {str(err)}")
        return None, f"์ค๋ฅ: {str(err)}"

    try:
        return response[0], f"์ฌ์ฉ๋ ์๋: {response[1]}"
    except Exception as err:
        logging.error(f"Image generation error: {str(err)}")
        return None, f"์ค๋ฅ: {str(err)}"
|
|
def generate_video_from_image(image, prompt="", length=4.0):
    """Send an image to the remote image-to-video endpoint.

    Args:
        image: Image array accepted by ``Image.fromarray``; ``None`` returns None.
        prompt: Optional motion prompt; a default is sent when empty.
        length: Requested video length in seconds.

    Returns:
        The ``"video"`` entry of the first element of the endpoint's response
        when that element is a dict, otherwise ``None``. Errors are logged and
        also yield ``None``.
    """
    if image is None:
        return None

    temp_path = None
    try:
        # Reserve a file name, then close the handle before PIL writes to it
        # (writing to a still-open NamedTemporaryFile fails on some platforms).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
            temp_path = fp.name
        Image.fromarray(image).save(temp_path)

        client = Client(VIDEO_API_URL)
        result = client.predict(
            input_image=handle_file(temp_path),
            prompt=prompt if prompt else "Generate natural motion",
            n_prompt="",
            seed=random.randint(0, 9999999),
            use_teacache=True,
            video_length=float(length),
            api_name="/process"
        )

        if result:
            video_dict = result[0]
            if isinstance(video_dict, dict):
                return video_dict.get("video")
        return None

    except Exception as e:
        logging.error(f"Video generation error: {str(e)}")
        return None
    finally:
        # Always remove the temporary PNG, including when the request failed.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")
|
|
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
    """Place `image` on a white canvas of the target size and build its mask.

    The image is scaled (aspect ratio preserved) to fit inside the canvas and
    pasted according to `alignment`. The mask is white (255) everywhere except
    a black (0) rectangle over the part of the pasted image that is kept.

    Args:
        image: numpy array or PIL image; ``None`` returns ``(None, None)``.
        width, height: Target canvas size (cast to int).
        overlap_percentage: Percentage of the scaled image, per axis, that the
            mask leaves white along each edge not flush with the canvas.
        alignment: One of the alignment labels offered by the UI. Any other
            value is treated as centred.

    Returns:
        ``(background, mask)`` as PIL images in modes ``RGB`` and ``L``.
    """
    if image is None:
        return None, None

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    # Normalise the mode for PIL inputs as well as arrays.
    image = image.convert('RGB')

    # Slider values may arrive as floats; PIL needs integer sizes.
    target_w, target_h = int(width), int(height)
    target_size = (target_w, target_h)

    # Scale to fit inside the canvas; never collapse a side to zero pixels.
    scale_factor = min(target_w / image.width, target_h / image.height)
    new_width = max(int(image.width * scale_factor), 1)
    new_height = max(int(image.height * scale_factor), 1)

    source = image.resize((new_width, new_height), Image.LANCZOS)

    # Overlap in pixels, at least one pixel on each axis.
    overlap_x = max(int(new_width * (overlap_percentage / 100)), 1)
    overlap_y = max(int(new_height * (overlap_percentage / 100)), 1)

    # Start centred, then pin one axis for the edge alignments. Unknown
    # alignment values therefore stay centred instead of leaving the margins
    # undefined.
    free_x = target_w - new_width
    free_y = target_h - new_height
    margin_x = free_x // 2
    margin_y = free_y // 2
    if alignment == "์ผ์ชฝ":        # flush with the left edge
        margin_x = 0
    elif alignment == "์ค๋ฅธ์ชฝ":    # flush with the right edge
        margin_x = free_x
    elif alignment == "์":          # flush with the top edge
        margin_y = 0
    elif alignment == "์๋":        # flush with the bottom edge
        margin_y = free_y

    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)

    # The kept rectangle is inset by the overlap on every side except the
    # side that is flush with the canvas edge.
    left_overlap = margin_x + overlap_x if alignment != "์ผ์ชฝ" else margin_x
    right_overlap = margin_x + new_width - overlap_x if alignment != "์ค๋ฅธ์ชฝ" else margin_x + new_width
    top_overlap = margin_y + overlap_y if alignment != "์" else margin_y
    bottom_overlap = margin_y + new_height - overlap_y if alignment != "์๋" else margin_y + new_height

    mask_draw.rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)

    return background, mask
|
|
def preview_outpaint(image, width, height, overlap_percentage, alignment):
    """Return an outpainting preview: the canvas with the masked area tinted red.

    Returns ``None`` when no image is supplied.
    """
    canvas, region = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
    if canvas is None:
        return None

    size = canvas.size

    # Transparent layer that receives a translucent red only where the mask is set.
    tint_layer = Image.new('RGBA', size, (0, 0, 0, 0))
    tint_layer.paste(Image.new('RGBA', size, (255, 0, 0, 64)), (0, 0), region)

    base = canvas.copy().convert('RGBA')
    return Image.alpha_composite(base, tint_layer)
|
|
@spaces.GPU(duration=24)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
    """Run outpainting on `image` and return the composited result.

    Args:
        image: Source image (numpy array or PIL image); ``None`` returns None.
        prompt: Optional text; ", high quality, 4k" is appended to it.
        width, height: Target canvas size.
        overlap_percentage, alignment: Forwarded to prepare_image_and_mask().
        num_steps: Number of inference steps passed to the pipeline.

    Returns:
        The outpainted PIL image. When the pipeline is not loaded a grey
        placeholder of the target size is returned; on error the prepared
        background (or ``None`` if preparation itself failed) is returned.
    """
    if image is None:
        return None

    if not OUTPAINT_MODEL_LOADED:
        # Slider values may arrive as floats; PIL needs integer sizes.
        return Image.new('RGB', (int(width), int(height)), (200, 200, 200))

    # Bound up front so the error handler can fall back to it.
    background = None
    try:
        background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
        if background is None:
            return None

        # Black out the region to be generated in the conditioning image.
        cnet_image = background.copy()
        cnet_image.paste(0, (0, 0), mask)

        final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

        generated_image = None
        # NOTE(review): the source's indentation was lost, so whether the
        # generation loop originally ran inside this autocast block is an
        # assumption -- confirm against the original file.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(final_prompt, "cuda", True)

            # The pipeline is iterated; only the last yielded image is kept.
            for generated_image in pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                image=cnet_image,
                num_inference_steps=int(num_steps)
            ):
                pass

        if generated_image is None:
            logging.error("Outpainting error: pipeline produced no image")
            return background

        # Paste the generated pixels back only where the mask is set.
        final_image = generated_image.convert("RGBA")
        cnet_image.paste(final_image, (0, 0), mask)

        return cnet_image

    except Exception as e:
        logging.error(f"Outpainting error: {str(e)}")
        return background
|
|
| |
def translate_prompt(text):
    """Translate `text` with the module-level `translator` if it contains Hangul.

    Text without Hangul characters, empty text, and any text when no
    translator is loaded are returned unchanged. Translation errors are logged
    and the original text is returned.
    """
    # Unicode blocks that actually hold Korean letters. A single span from
    # U+3131 to U+D7A3 would also match CJK ideographs and would miss
    # decomposed (NFD) jamo.
    hangul_ranges = (
        (0x1100, 0x11FF),  # Hangul Jamo
        (0x3130, 0x318F),  # Hangul Compatibility Jamo
        (0xAC00, 0xD7A3),  # Hangul Syllables
    )

    try:
        if translator is None:
            return text

        if text and any(
            low <= ord(char) <= high
            for char in text
            for low, high in hangul_ranges
        ):
            with torch.no_grad():
                return translator(text)[0]['translation_text']
        return text
    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text
|
|
@spaces.GPU
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for `video` with MMAudio and return a muxed video path.

    Args:
        video: Input video as delivered by the Gradio video component.
        prompt, negative_prompt: Text conditions; passed through translate_prompt().
        seed: Seed for the torch random generator.
        num_steps: Number of flow-matching steps.
        cfg_strength: Guidance strength passed to ``generate``.
        duration: Requested duration, forwarded to ``load_video``; the
            duration it returns is the one actually used.

    Returns:
        Path of a temporary ``.mp4`` containing the video with the generated
        audio, or ``None`` when the model is not loaded or no video was given.
    """
    if not MMAUDIO_MODEL_LOADED or video is None:
        return None

    prompt = translate_prompt(prompt)
    negative_prompt = translate_prompt(negative_prompt)

    # The UI's number fields may deliver floats; both of these need ints.
    rng = torch.Generator(device=device)
    rng.manual_seed(int(seed))
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=int(num_steps))

    clip_frames, sync_frames, duration = load_video(video, duration)
    clip_frames = clip_frames.unsqueeze(0)
    sync_frames = sync_frames.unsqueeze(0)
    # NOTE(review): this mutates the shared module-level `seq_cfg`; concurrent
    # requests would overwrite each other's duration -- confirm requests are
    # serialised.
    seq_cfg.duration = duration
    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net_mmaudio,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    # Reserve an output path and close the handle before the file is written.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp:
        video_save_path = tmp.name
    make_video(video,
               video_save_path,
               audio,
               sampling_rate=seq_cfg.sampling_rate,
               duration_sec=seq_cfg.duration)
    return video_save_path
|
|
| |
def process_bg_image(image, bg, fast_mode=False):
    """Composite a PIL image over a new background using a BiRefNet mask.

    Args:
        image: PIL image to process.
        bg: ``"#RRGGBB"`` colour string, a PIL image, or anything
            ``Image.open`` accepts.
        fast_mode: Use the lite model instead of the full one.

    Returns:
        The composited image, or `image` unchanged when the models are not loaded.
    """
    if not BIREFNET_MODEL_LOADED:
        return image

    size = image.size
    batch = transform_image(image).unsqueeze(0).to(device)
    segmenter = birefnet_lite if fast_mode else birefnet

    with torch.no_grad():
        # The last model output, passed through a sigmoid, is used as the mask.
        probabilities = segmenter(batch)[-1].sigmoid().cpu()
    alpha = transforms.ToPILImage()(probabilities[0].squeeze()).resize(size)

    if isinstance(bg, Image.Image):
        backdrop = bg.convert("RGBA").resize(size)
    elif isinstance(bg, str) and bg.startswith("#"):
        # Parse the three two-digit hex channels and add an opaque alpha.
        red, green, blue = (int(bg[start:start + 2], 16) for start in (1, 3, 5))
        backdrop = Image.new("RGBA", size, (red, green, blue, 255))
    else:
        backdrop = Image.open(bg).convert("RGBA").resize(size)

    return Image.composite(image, backdrop, alpha)
|
|
def process_video_frame(frame, bg_type, bg, fast_mode, bg_frame_index, background_frames, color):
    """Replace the background of one video frame.

    Args:
        frame: Frame array accepted by ``Image.fromarray``.
        bg_type: Background mode label chosen in the UI (colour / image /
            video); any other value leaves the frame unprocessed.
        bg: Background image used in image mode.
        fast_mode: Forwarded to process_bg_image().
        bg_frame_index: Index into `background_frames` used in video mode.
        background_frames: Background frames for video mode, else ``None``.
        color: Colour string used in colour mode.

    Returns:
        ``(processed frame as array, next background index)``. On error the
        original `frame` is returned.
    """
    try:
        pil_image = Image.fromarray(frame)
        if bg_type == "์์":
            processed_image = process_bg_image(pil_image, color, fast_mode)
        elif bg_type == "์ด๋ฏธ์ง":
            processed_image = process_bg_image(pil_image, bg, fast_mode)
        elif bg_type == "๋น๋์ค":
            # Wrap around so a background with slightly fewer frames than the
            # input does not leave the trailing frames unprocessed.
            background_frame = background_frames[bg_frame_index % len(background_frames)]
            bg_frame_index += 1
            background_image = Image.fromarray(background_frame)
            processed_image = process_bg_image(pil_image, background_image, fast_mode)
        else:
            processed_image = pil_image
        return np.array(processed_image), bg_frame_index
    except Exception as e:
        # Best effort: log and hand back the untouched frame.
        logging.error(f"Error processing frame: {e}")
        return frame, bg_frame_index
|
|
@spaces.GPU
def process_video_bg(vid, bg_type="์์", bg_image=None, bg_video=None, color="#00FF00",
                     fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
    """Replace the background of every frame of a video, streaming progress.

    This is a generator. Each yielded value is a 3-tuple for the UI outputs
    ``(stream image, final video, status text)``.

    Args:
        vid: Path of the input video.
        bg_type: Background mode label chosen in the UI.
        bg_image: Background image for image mode.
        bg_video: Path of the background video for video mode.
        color: Colour string for colour mode.
        fps: Output frame rate; ``0`` keeps the input's frame rate.
        video_handling: ``"slow_down"`` stretches a shorter background video
            to the input's duration; any other value loops it.
        fast_mode: Use the lite segmentation model.
        max_workers: Size of the thread pool that processes frames.
    """
    if not BIREFNET_MODEL_LOADED:
        yield gr.update(visible=False), gr.update(visible=True), "BiRefNet ๋ชจ๋ธ์ ๋ก๋ํ์ง ๋ชปํ์ต๋๋ค."
        yield None, None, "BiRefNet ๋ชจ๋ธ์ ๋ก๋ํ์ง ๋ชปํ์ต๋๋ค."
        return

    start_time = time.time()
    opened_clips = []  # every clip opened here, closed in `finally`
    try:
        video = VideoFileClip(vid)
        opened_clips.append(video)
        if fps == 0:
            fps = video.fps

        audio = video.audio
        frames = list(video.iter_frames(fps=fps))

        processed_frames = []
        yield gr.update(visible=True), gr.update(visible=False), "์ฒ๋ฆฌ ์์... ๊ฒฝ๊ณผ ์๊ฐ: 0์ด"

        if bg_type == "๋น๋์ค":
            background_video = VideoFileClip(bg_video)
            opened_clips.append(background_video)
            if background_video.duration < video.duration:
                if video_handling == "slow_down":
                    # MoviePy 2.x effect API (`.fx`/`speedx` no longer exist).
                    # Asking for the final duration stretches the shorter
                    # background to the input's length, i.e. slows it down.
                    background_video = background_video.with_effects(
                        [vfx.MultiplySpeed(final_duration=video.duration)]
                    )
                else:
                    repeats = int(video.duration / background_video.duration + 1)
                    background_video = concatenate_videoclips([background_video] * repeats)
                opened_clips.append(background_video)
            background_frames = list(background_video.iter_frames(fps=fps))
        else:
            background_frames = None

        # Frame i uses background frame i; results are consumed in submission
        # order so the output keeps the original frame order.
        with ThreadPoolExecutor(max_workers=int(max_workers)) as executor:
            futures = [
                executor.submit(process_video_frame, frame, bg_type, bg_image, fast_mode,
                                i, background_frames, color)
                for i, frame in enumerate(frames)
            ]
            for i, future in enumerate(futures):
                result, _ = future.result()
                processed_frames.append(result)
                elapsed_time = time.time() - start_time
                yield result, None, f"ํ๋ ์ {i+1}/{len(frames)} ์ฒ๋ฆฌ ์ค... ๊ฒฝ๊ณผ ์๊ฐ: {elapsed_time:.2f}์ด"

        processed_video = ImageSequenceClip(processed_frames, fps=fps)
        processed_video = processed_video.with_audio(audio)

        # Reserve an output path and close the handle before writing to it.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
            temp_filepath = temp_file.name
        processed_video.write_videofile(temp_filepath, codec="libx264")

        elapsed_time = time.time() - start_time
        yield gr.update(visible=False), gr.update(visible=True), f"์ฒ๋ฆฌ ์๋ฃ! ๊ฒฝ๊ณผ ์๊ฐ: {elapsed_time:.2f}์ด"
        yield processed_frames[-1], temp_filepath, f"์ฒ๋ฆฌ ์๋ฃ! ๊ฒฝ๊ณผ ์๊ฐ: {elapsed_time:.2f}์ด"

    except Exception as e:
        logging.error(f"Error: {e}")
        elapsed_time = time.time() - start_time
        yield gr.update(visible=False), gr.update(visible=True), f"๋น๋์ค ์ฒ๋ฆฌ ์ค๋ฅ: {e}. ๊ฒฝ๊ณผ ์๊ฐ: {elapsed_time:.2f}์ด"
        # The second output is a video component: it must receive a path or
        # None, never the error text. The message goes to the status box only.
        yield None, None, f"๋น๋์ค ์ฒ๋ฆฌ ์ค๋ฅ: {e}. ๊ฒฝ๊ณผ ์๊ฐ: {elapsed_time:.2f}์ด"
    finally:
        # Release the reader resources held by every clip opened above.
        for clip in reversed(opened_clips):
            try:
                clip.close()
            except Exception as close_error:
                logging.warning(f"Could not close video clip: {close_error}")
|
|
| |
| css = """ |
| :root { |
| --primary-color: #f8c3cd; |
| --secondary-color: #b3e5fc; |
| --background-color: #f5f5f7; |
| --card-background: #ffffff; |
| --text-color: #424242; |
| --accent-color: #ffb6c1; |
| --success-color: #c8e6c9; |
| --warning-color: #fff9c4; |
| --shadow-color: rgba(0, 0, 0, 0.1); |
| --border-radius: 12px; |
| } |
| .gradio-container { |
| max-width: 1200px !important; |
| margin: 0 auto !important; |
| } |
| .panel-box { |
| border-radius: var(--border-radius) !important; |
| box-shadow: 0 8px 16px var(--shadow-color) !important; |
| background-color: var(--card-background) !important; |
| padding: 20px !important; |
| margin-bottom: 20px !important; |
| } |
| #generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn, #bg-remove-btn { |
| background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important; |
| font-size: 1.1rem !important; |
| padding: 12px 24px !important; |
| margin-top: 10px !important; |
| width: 100% !important; |
| } |
| .tabitem { |
| min-height: 700px !important; |
| } |
| """ |
|
|
| |
| demo = gr.Blocks(css=css, title="AI ์ด๋ฏธ์ง & ๋น๋์ค & ์ค๋์ค ์์ฑ๊ธฐ") |
|
|
| with demo: |
| gr.Markdown("# ๐จ Ginigen ์คํ๋์ค") |
| |
| with gr.Tabs() as tabs: |
| |
| with gr.Tab("ํ
์คํธโ์ด๋ฏธ์งโ๋น๋์ค", elem_classes="tabitem"): |
| with gr.Row(equal_height=True): |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ ์ด๋ฏธ์ง ์์ฑ ์ค์ ") |
| |
| prompt = gr.Textbox( |
| label="ํ๋กฌํํธ(ํ๊ธ/์์ด ๊ฐ๋ฅ)", |
| placeholder="์์ฑํ๊ณ ์ถ์ ์ด๋ฏธ์ง๋ฅผ ์ค๋ช
ํ์ธ์...", |
| lines=3 |
| ) |
| |
| size_preset = gr.Dropdown( |
| choices=list(IMAGE_PRESETS.keys()), |
| value="1:1 ์ ์ฌ๊ฐํ", |
| label="ํฌ๊ธฐ ํ๋ฆฌ์
" |
| ) |
| |
| with gr.Row(): |
| width = gr.Slider(256, 2048, 1024, step=64, label="๋๋น") |
| height = gr.Slider(256, 2048, 1024, step=64, label="๋์ด") |
| |
| with gr.Row(): |
| guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="๊ฐ์ด๋์ค") |
| steps = gr.Slider(1, 50, 30, step=1, label="์คํ
") |
| |
| seed = gr.Number(label="์๋ (-1=๋๋ค)", value=-1) |
| |
| generate_btn = gr.Button("๐จ ์ด๋ฏธ์ง ์์ฑ", variant="primary", elem_id="generate-btn") |
| |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ฌ ๋น๋์ค ์์ฑ ์ค์ ") |
| |
| video_prompt = gr.Textbox( |
| label="(์ ํ) ๋น๋์ค ํ๋กฌํํธ(์์ด๋ก ์
๋ ฅ)", |
| placeholder="๋น๋์ค์ ์์ง์์ ์ค๋ช
ํ์ธ์... (๋น์๋๋ฉด ๊ธฐ๋ณธ ์์ง์ ์ ์ฉ)", |
| lines=2 |
| ) |
| |
| video_length = gr.Slider( |
| minimum=1, |
| maximum=60, |
| value=4, |
| step=0.5, |
| label="๋น๋์ค ๊ธธ์ด (์ด)", |
| info="1์ด์์ 60์ด๊น์ง ์ ํ ๊ฐ๋ฅํฉ๋๋ค" |
| ) |
| |
| video_btn = gr.Button("๐ฌ ๋น๋์ค๋ก ๋ณํ", variant="secondary", elem_id="video-btn") |
| |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ผ๏ธ ์์ฑ ๊ฒฐ๊ณผ") |
| |
| output_image = gr.Image(label="์์ฑ๋ ์ด๋ฏธ์ง", type="numpy") |
| output_seed = gr.Textbox(label="์๋ ์ ๋ณด") |
| output_video = gr.Video(label="์์ฑ๋ ๋น๋์ค") |
| |
| |
| with gr.Tab("์ด๋ฏธ์ง ๋น์จ ๋ณ๊ฒฝ/์์ฑ", elem_classes="tabitem"): |
| with gr.Row(equal_height=True): |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ผ๏ธ ์ด๋ฏธ์ง ์
๋ก๋") |
| |
| input_image = gr.Image( |
| label="์๋ณธ ์ด๋ฏธ์ง", |
| type="numpy" |
| ) |
| |
| outpaint_prompt = gr.Textbox( |
| label="ํ๋กฌํํธ (์ ํ)", |
| placeholder="ํ์ฅํ ์์ญ์ ๋ํ ์ค๋ช
...", |
| lines=2 |
| ) |
| |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### โ๏ธ ์์ํ์ธํ
์ค์ ") |
| |
| outpaint_size_preset = gr.Dropdown( |
| choices=list(IMAGE_PRESETS.keys()), |
| value="16:9 ์์ด๋์คํฌ๋ฆฐ", |
| label="๋ชฉํ ํฌ๊ธฐ ํ๋ฆฌ์
" |
| ) |
| |
| with gr.Row(): |
| outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="๋ชฉํ ๋๋น") |
| outpaint_height = gr.Slider(256, 2048, 720, step=64, label="๋ชฉํ ๋์ด") |
| |
| alignment = gr.Dropdown( |
| choices=["๊ฐ์ด๋ฐ", "์ผ์ชฝ", "์ค๋ฅธ์ชฝ", "์", "์๋"], |
| value="๊ฐ์ด๋ฐ", |
| label="์ ๋ ฌ" |
| ) |
| |
| overlap_percentage = gr.Slider( |
| minimum=1, |
| maximum=50, |
| value=10, |
| step=1, |
| label="๋ง์คํฌ ์ค๋ฒ๋ฉ (%)" |
| ) |
| |
| outpaint_steps = gr.Slider( |
| minimum=4, |
| maximum=12, |
| value=8, |
| step=1, |
| label="์ถ๋ก ์คํ
" |
| ) |
| |
| preview_btn = gr.Button("๐๏ธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ", elem_id="preview-btn") |
| outpaint_btn = gr.Button("๐จ ์์ํ์ธํ
์คํ", variant="primary", elem_id="outpaint-btn") |
| |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ผ๏ธ ๊ฒฐ๊ณผ") |
| |
| preview_image = gr.Image(label="๋ฏธ๋ฆฌ๋ณด๊ธฐ") |
| outpaint_result = gr.Image(label="์์ํ์ธํ
๊ฒฐ๊ณผ") |
| |
| |
| with gr.Tab("๋น๋์ค + ์ค๋์ค", elem_classes="tabitem"): |
| with gr.Row(equal_height=True): |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ฅ ๋น๋์ค ์
๋ก๋") |
| |
| audio_video_input = gr.Video( |
| label="์
๋ ฅ ๋น๋์ค", |
| sources=["upload"] |
| ) |
| |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ต ์ค๋์ค ์์ฑ ์ค์ ") |
| |
| audio_prompt = gr.Textbox( |
| label="ํ๋กฌํํธ (ํ๊ธ ์ง์)" if MMAUDIO_MODEL_LOADED and translator else "ํ๋กฌํํธ", |
| placeholder="์์ฑํ๊ณ ์ถ์ ์ค๋์ค๋ฅผ ์ค๋ช
ํ์ธ์... (์: ํํ๋ก์ด ํผ์๋
ธ ์์
)", |
| lines=3 |
| ) |
| |
| audio_negative_prompt = gr.Textbox( |
| label="๋ค๊ฑฐํฐ๋ธ ํ๋กฌํํธ", |
| value="music", |
| placeholder="์ํ์ง ์๋ ์์...", |
| lines=2 |
| ) |
| |
| with gr.Row(): |
| audio_seed = gr.Number(label="์๋", value=0) |
| audio_steps = gr.Number(label="์คํ
", value=25) |
| |
| with gr.Row(): |
| audio_cfg = gr.Number(label="๊ฐ์ด๋์ค ์ค์ผ์ผ", value=4.5) |
| audio_duration = gr.Number(label="์ง์์๊ฐ (์ด)", value=9999) |
| |
| audio_btn = gr.Button("๐ต ์ค๋์ค ์์ฑ ๋ฐ ํฉ์ฑ", variant="primary", elem_id="audio-btn") |
| |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ฌ ์์ฑ ๊ฒฐ๊ณผ") |
| |
| output_video_with_audio = gr.Video( |
| label="์ค๋์ค๊ฐ ์ถ๊ฐ๋ ๋น๋์ค", |
| interactive=False |
| ) |
| |
| if not MMAUDIO_MODEL_LOADED: |
| gr.Markdown("โ ๏ธ MMAudio ๋ชจ๋ธ์ ๋ก๋ํ์ง ๋ชปํ์ต๋๋ค. ์ด ๊ธฐ๋ฅ์ ์ฌ์ฉํ ์ ์์ต๋๋ค.") |
| |
| |
| with gr.Tab("๋น๋์ค ๋ฐฐ๊ฒฝ์ ๊ฑฐ/ํฉ์ฑ", elem_classes="tabitem"): |
| with gr.Row(equal_height=True): |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ฅ ๋น๋์ค ์
๋ก๋") |
| |
| bg_video_input = gr.Video( |
| label="์
๋ ฅ ๋น๋์ค", |
| interactive=True |
| ) |
| |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐จ ๋ฐฐ๊ฒฝ ์ค์ ") |
| |
| bg_type = gr.Radio( |
| ["์์", "์ด๋ฏธ์ง", "๋น๋์ค"], |
| label="๋ฐฐ๊ฒฝ ์ ํ", |
| value="์์", |
| interactive=True |
| ) |
| |
| color_picker = gr.ColorPicker( |
| label="๋ฐฐ๊ฒฝ ์์", |
| value="#00FF00", |
| visible=True, |
| interactive=True |
| ) |
| |
| bg_image_input = gr.Image( |
| label="๋ฐฐ๊ฒฝ ์ด๋ฏธ์ง", |
| type="filepath", |
| visible=False, |
| interactive=True |
| ) |
| |
| bg_video_bg = gr.Video( |
| label="๋ฐฐ๊ฒฝ ๋น๋์ค", |
| visible=False, |
| interactive=True |
| ) |
| |
| with gr.Column(visible=False) as video_handling_options: |
| video_handling_radio = gr.Radio( |
| ["slow_down", "loop"], |
| label="๋น๋์ค ์ฒ๋ฆฌ ๋ฐฉ์", |
| value="slow_down", |
| interactive=True, |
| info="slow_down: ๋ฐฐ๊ฒฝ ๋น๋์ค๋ฅผ ๋๋ฆฌ๊ฒ ์ฌ์, loop: ๋ฐฐ๊ฒฝ ๋น๋์ค๋ฅผ ๋ฐ๋ณต" |
| ) |
| |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### โ๏ธ ์ฒ๋ฆฌ ์ค์ ") |
| |
| fps_slider = gr.Slider( |
| minimum=0, |
| maximum=60, |
| step=1, |
| value=0, |
| label="์ถ๋ ฅ FPS (0 = ์๋ณธ FPS ์ ์ง)", |
| interactive=True |
| ) |
| |
| fast_mode_checkbox = gr.Checkbox( |
| label="๋น ๋ฅธ ๋ชจ๋ (BiRefNet_lite ์ฌ์ฉ)", |
| value=True, |
| interactive=True |
| ) |
| |
| max_workers_slider = gr.Slider( |
| minimum=1, |
| maximum=32, |
| step=1, |
| value=10, |
| label="์ต๋ ์์ปค ์", |
| info="๋ณ๋ ฌ๋ก ์ฒ๋ฆฌํ ํ๋ ์ ์", |
| interactive=True |
| ) |
| |
| bg_remove_btn = gr.Button("๐ฌ ๋ฐฐ๊ฒฝ ๋ณ๊ฒฝ", variant="primary", elem_id="bg-remove-btn") |
| |
| if not BIREFNET_MODEL_LOADED: |
| gr.Markdown("โ ๏ธ BiRefNet ๋ชจ๋ธ์ ๋ก๋ํ์ง ๋ชปํ์ต๋๋ค. ์ด ๊ธฐ๋ฅ์ ์ฌ์ฉํ ์ ์์ต๋๋ค.") |
| |
| |
| with gr.Column(scale=1): |
| with gr.Group(elem_classes="panel-box"): |
| gr.Markdown("### ๐ฌ ์ฒ๋ฆฌ ๊ฒฐ๊ณผ") |
| |
| stream_image = gr.Image(label="์ค์๊ฐ ์คํธ๋ฆฌ๋ฐ", visible=False) |
| output_bg_video = gr.Video(label="์ต์ข
๋น๋์ค") |
| time_textbox = gr.Textbox(label="๊ฒฝ๊ณผ ์๊ฐ", interactive=False) |
| |
| gr.Markdown(""" |
| ### โน๏ธ ์ฌ์ฉ ๋ฐฉ๋ฒ |
| 1. ๋น๋์ค๋ฅผ ์
๋ก๋ํ์ธ์ |
| 2. ์ํ๋ ๋ฐฐ๊ฒฝ ์ ํ์ ์ ํํ์ธ์ |
| 3. ์ค์ ์ ์กฐ์ ํ๊ณ '๋ฐฐ๊ฒฝ ๋ณ๊ฒฝ' ๋ฒํผ์ ํด๋ฆญํ์ธ์ |
| |
| **์ฐธ๊ณ **: GPU ์ ํ์ผ๋ก ํ ๋ฒ์ ์ฝ 200ํ๋ ์๊น์ง ์ฒ๋ฆฌ ๊ฐ๋ฅํฉ๋๋ค. |
| ๊ธด ๋น๋์ค๋ ์์ ์กฐ๊ฐ์ผ๋ก ๋๋์ด ์ฒ๋ฆฌํ์ธ์. |
| """) |
| |
| |
| size_preset.change(update_dimensions, [size_preset], [width, height]) |
| |
| generate_btn.click( |
| generate_text_to_image, |
| [prompt, width, height, guidance, steps, seed], |
| [output_image, output_seed] |
| ) |
| |
| video_btn.click( |
| lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None, |
| [output_image, video_prompt, video_length], |
| [output_video] |
| ) |
| |
| |
| outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height]) |
| |
| preview_btn.click( |
| preview_outpaint, |
| [input_image, outpaint_width, outpaint_height, overlap_percentage, alignment], |
| [preview_image] |
| ) |
| |
| outpaint_btn.click( |
| outpaint_image, |
| [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps], |
| [outpaint_result] |
| ) |
| |
| |
| audio_btn.click( |
| video_to_audio, |
| [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration], |
| [output_video_with_audio] |
| ) |
| |
| |
def update_bg_visibility(bg_type):
    """Return visibility updates for the four background controls.

    The tuple order matches the outputs this handler is wired to: colour
    picker, background image, background video, video-handling options.
    The last two are shown together; an unknown `bg_type` hides everything.
    """
    show_color = bg_type == "์์"
    show_image = bg_type == "์ด๋ฏธ์ง"
    show_video = bg_type == "๋น๋์ค"
    return tuple(
        gr.update(visible=flag)
        for flag in (show_color, show_image, show_video, show_video)
    )
| |
| bg_type.change( |
| update_bg_visibility, |
| inputs=bg_type, |
| outputs=[color_picker, bg_image_input, bg_video_bg, video_handling_options] |
| ) |
| |
| bg_remove_btn.click( |
| process_video_bg, |
| inputs=[bg_video_input, bg_type, bg_image_input, bg_video_bg, color_picker, |
| fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider], |
| outputs=[stream_image, output_bg_video, time_textbox] |
| ) |
|
|
| demo.launch() |