import os
import spaces
import shlex
import subprocess
import tyro
import imageio
import numpy as np
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF
from safetensors.torch import load_file
import rembg
import gradio as gr

# download the pretrained LGM checkpoint from the Hugging Face Hub
from huggingface_hub import hf_hub_download
ckpt_path = hf_hub_download(repo_id="ashawkey/LGM", filename="model_fp16_fixrot.safetensors")

# install the prebuilt diff-gaussian-rasterization wheel bundled with this Space
subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))

import kiui
from kiui.op import recenter
from kiui.cam import orbit_camera

from core.options import AllConfigs, Options
from core.models import LGM
from mvdream.pipeline_mvdream import MVDreamPipeline

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
GRADIO_VIDEO_PATH = 'gradio_output.mp4'
GRADIO_PLY_PATH = 'gradio_output.ply'

# inference options matching the pretrained checkpoint
opt = Options(
    input_size=256,
    up_channels=(1024, 1024, 512, 256, 128),
    up_attention=(True, True, True, False, False),
    splat_size=128,
    output_size=512,
    batch_size=8,
    num_views=8,
    gradient_accumulation_steps=1,
    mixed_precision='bf16',
    resume=ckpt_path,
)

# build the LGM reconstruction model
model = LGM(opt)

# load the pretrained weights
if opt.resume is not None:
    if opt.resume.endswith('safetensors'):
        ckpt = load_file(opt.resume, device='cpu')
    else:
        ckpt = torch.load(opt.resume, map_location='cpu')
    model.load_state_dict(ckpt, strict=False)
    print(f'[INFO] Loaded checkpoint from {opt.resume}')
else:
    print('[WARN] model randomly initialized, are you sure?')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.half().to(device)
model.eval()

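# fixed perspective projection matrix, stored in the row-vector convention
# (points are transformed as p @ M, matching the transposed view matrices used below)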
tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
proj_matrix[0, 0] = 1 / tan_half_fov
proj_matrix[1, 1] = 1 / tan_half_fov
proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
proj_matrix[3, 2] = -(opt.zfar * opt.znear) / (opt.zfar - opt.znear)
proj_matrix[2, 3] = 1

# text-to-multi-view diffusion pipeline (MVDream)
pipe_text = MVDreamPipeline.from_pretrained(
    'ashawkey/mvdream-sd2.1-diffusers',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_text = pipe_text.to(device)

# image-to-multi-view diffusion pipeline (ImageDream)
pipe_image = MVDreamPipeline.from_pretrained(
    "ashawkey/imagedream-ipmv-diffusers",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
pipe_image = pipe_image.to(device)

# background removal session
bg_remover = rembg.new_session()

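# main generation entry point: text and/or image -> multi-view images -> 3D Gaussians.
# NOTE: the @spaces.GPU decorator is an assumption inferred from the otherwise unused
# `spaces` import; it requests a GPU per call on ZeroGPU Spaces.
@spaces.GPU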
def process(input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42):

    # deterministic generation for a given seed
    kiui.seed_everything(input_seed)

    os.makedirs(opt.workspace, exist_ok=True)
    output_video_path = os.path.join(opt.workspace, GRADIO_VIDEO_PATH)
    output_ply_path = os.path.join(opt.workspace, GRADIO_PLY_PATH)

    # text-conditioned: sample 4 views with MVDream, then clean up each view
    if input_image is None:
        mv_image_uint8 = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)
        mv_image_uint8 = (mv_image_uint8 * 255).astype(np.uint8)

        mv_image = []
        for i in range(4):
            image = rembg.remove(mv_image_uint8[i], session=bg_remover)
            # to float, recenter the object, and composite onto a white background
            image = image.astype(np.float32) / 255
            image = recenter(image, image[..., 0] > 0, border_ratio=0.2)
            image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
            mv_image.append(image)

    # image-conditioned: remove the background, recenter, then sample 4 views with ImageDream
    else:
        input_image = np.array(input_image)

        carved_image = rembg.remove(input_image, session=bg_remover)
        mask = carved_image[..., -1] > 0
        image = recenter(carved_image, mask, border_ratio=0.2)
        image = image.astype(np.float32) / 255.0
        image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
        mv_image = pipe_image(prompt, image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)

    # 2x2 grid of the generated views for display
    mv_image_grid = np.concatenate([
        np.concatenate([mv_image[1], mv_image[2]], axis=1),
        np.concatenate([mv_image[3], mv_image[0]], axis=1),
    ], axis=0)

    # stack the views in the order LGM expects, resize, and apply ImageNet normalization
    input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
    input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device)
    input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)
    input_image = TF.normalize(input_image, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)

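    # append per-pixel camera ray (Plücker) embeddings so each view carries its pose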
    rays_embeddings = model.prepare_default_rays(device, elevation=input_elevation)
    input_image = torch.cat([input_image, rays_embeddings], dim=1).unsqueeze(0)

    with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            # single feed-forward pass to predict the 3D Gaussians
            gaussians = model.forward_gaussians(input_image)

    # save the Gaussians as a ply file
    model.gs.save_ply(gaussians, output_ply_path)

    # render a 360-degree turntable video
    images = []
    elevation = 0
    if opt.fancy_video:
        azimuth = np.arange(0, 720, 4, dtype=np.int32)
        for azi in tqdm.tqdm(azimuth):
            cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
            # invert the up and forward axes to match the renderer's camera convention
            cam_poses[:, :3, 1:3] *= -1

            # view, view-projection, and camera-position inputs for the gaussian rasterizer
            cam_view = torch.inverse(cam_poses).transpose(1, 2)
            cam_view_proj = cam_view @ proj_matrix
            cam_pos = -cam_poses[:, :3, 3]

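            # grow the splat scale from 0 to 1 over the first full turn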
            scale = min(azi / 360, 1)

            image = model.gs.render(gaussians, cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0), scale_modifier=scale)['image']
            images.append((image.squeeze(1).permute(0, 2, 3, 1).contiguous().float().cpu().numpy() * 255).astype(np.uint8))
    else:
        azimuth = np.arange(0, 360, 2, dtype=np.int32)
        for azi in tqdm.tqdm(azimuth):
            cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
            # invert the up and forward axes to match the renderer's camera convention
            cam_poses[:, :3, 1:3] *= -1

            # view, view-projection, and camera-position inputs for the gaussian rasterizer
            cam_view = torch.inverse(cam_poses).transpose(1, 2)
            cam_view_proj = cam_view @ proj_matrix
            cam_pos = -cam_poses[:, :3, 3]

            image = model.gs.render(gaussians, cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0), scale_modifier=1)['image']
            images.append((image.squeeze(1).permute(0, 2, 3, 1).contiguous().float().cpu().numpy() * 255).astype(np.uint8))

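    # stack the frames and encode the turntable video at 30 fps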
    images = np.concatenate(images, axis=0)
    imageio.mimwrite(output_video_path, images, fps=30)

    return mv_image_grid, output_video_path, output_ply_path

# gradio UI
_TITLE = '''LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content Creation'''

_DESCRIPTION = '''
<div>
<a style="display:inline-block" href="https://me.kiui.moe/lgm/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
<a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/LGM"><img src='https://img.shields.io/github/stars/3DTopia/LGM?style=social'/></a>
</div>

* Input can be text only, image only, or both image and text.
* Output is a `ply` file containing the 3D Gaussians; see our [repo](https://github.com/3DTopia/LGM/blob/main/readme.md) for visualization and mesh conversion.
* If you find the output unsatisfying, try using different seeds!
'''

block = gr.Blocks(title=_TITLE).queue()
with block:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown('# ' + _TITLE)
            gr.Markdown(_DESCRIPTION)

    with gr.Row(variant='panel'):
        with gr.Column(scale=1):
            # input image
            input_image = gr.Image(label="image", type='pil')
            # input prompt
            input_text = gr.Textbox(label="prompt")
            # negative prompt
            input_neg_text = gr.Textbox(label="negative prompt", value='ugly, blurry, pixelated, obscure, unnatural colors, poor lighting, dull, unclear, cropped, lowres, low quality, artifacts, duplicate')
            # elevation of the generated views
            input_elevation = gr.Slider(label="elevation", minimum=-90, maximum=90, step=1, value=0)
            # diffusion sampling steps
            input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=30)
            # random seed
            input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=0)
            # generate button
            button_gen = gr.Button("Generate")

        with gr.Column(scale=1):
            with gr.Tab("Video"):
                # rendered turntable video
                output_video = gr.Video(label="video")
                # ply file with the 3D Gaussians
                output_file = gr.File(label="3D Gaussians (ply format)")
            with gr.Tab("Multi-view Image"):
                # 2x2 grid of the generated views
                output_image = gr.Image(interactive=False, show_label=False)

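    # wire the button: inputs map to process() arguments, outputs to the three widgets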
    button_gen.click(process, inputs=[input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed], outputs=[output_image, output_video, output_file])

    gr.Examples(
        examples=[
            "data_test/frog_sweater.jpg",
            "data_test/bird.jpg",
            "data_test/boy.jpg",
            "data_test/cat_statue.jpg",
            "data_test/dragontoy.jpg",
            "data_test/gso_rabbit.jpg",
        ],
        inputs=[input_image],
        outputs=[output_image, output_video, output_file],
        fn=lambda x: process(input_image=x, prompt=''),
        cache_examples=True,
        label='Image-to-3D Examples',
    )

    gr.Examples(
        examples=[
            "teddy bear",
            "hamburger",
            "oldman's head sculpture",
            "headphone",
            "motorbike",
            "mech suit",
        ],
        inputs=[input_text],
        outputs=[output_image, output_video, output_file],
        fn=lambda x: process(input_image=None, prompt=x),
        cache_examples=True,
        label='Text-to-3D Examples',
    )

block.launch()