ReconViaGen

Paused

App Files Files Community

notenoughram committed on Dec 20, 2025

Commit

8f5dd0d

verified ·

1 Parent(s): 9faeb08

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -94

app.py CHANGED Viewed

@@ -1,68 +1,68 @@
 import gradio as gr
-import spaces
-# 유료 환경에서는 spaces import가 있어도 @spaces.GPU 데코레이터만 안 쓰면 됩니다.
 from gradio_litmodel3d import LitModel3D
-import os
-import shutil
 os.environ['SPCONV_ALGO'] = 'native'
 from typing import *
 import torch
-import numpy as np
 import imageio
 from easydict import EasyDict as edict
 from PIL import Image
 from trellis.pipelines import TrellisVGGTTo3DPipeline
 from trellis.representations import Gaussian, MeshExtractResult
 from trellis.utils import render_utils, postprocessing_utils
 from wheels.vggt.vggt.utils.load_fn import load_and_preprocess_images
 from wheels.vggt.vggt.utils.pose_enc import pose_encoding_to_extri_intri
-import open3d as o3d
-from torchvision import transforms as TF
-from PIL import Image
-import sys
-# sys.path.append("wheels") # 필요시 경로 수정
-import cv2 # cv2가 누락되어 있을 수 있어 추가했습니다.
 from wheels.mast3r.model import AsymmetricMASt3R
 from wheels.mast3r.fast_nn import fast_reciprocal_NNs
 from wheels.dust3r.dust3r.inference import inference
 from wheels.dust3r.dust3r.utils.image import load_images_new
-from trellis.utils.general_utils import *
-import copy
 MAX_SEED = np.iinfo(np.int32).max
 TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
-# TMP_DIR = "tmp/Trellis-demo"
-# os.environ['GRADIO_TEMP_DIR'] = 'tmp'
 os.makedirs(TMP_DIR, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 def start_session(req: gr.Request):
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(user_dir, exist_ok=True)
 def end_session(req: gr.Request):
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
-    # [수정] 폴더가 없으면 삭제하지 않도록 예외 처리 추가
     if os.path.exists(user_dir):
         shutil.rmtree(user_dir)
-# [수정] 유료 4 GPU 사용 시 충돌 방지를 위해 @spaces.GPU 제거
 def preprocess_image(image: Image.Image) -> Image.Image:
-    """
-    Preprocess the input image for 3D generation.
-    """
     processed_image = pipeline.preprocess_image(image)
     return processed_image
-# [수정] @spaces.GPU 제거
 def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
-    """
-    Preprocess the input video for multi-image 3D generation.
-    """
     vid = imageio.get_reader(video, 'ffmpeg')
     fps = vid.get_meta_data()['fps']
     images = []
@@ -76,16 +76,12 @@ def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
     processed_images = [pipeline.preprocess_image(image) for image in images]
     return processed_images
-# [수정] @spaces.GPU 제거
 def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
-    """
-    Preprocess a list of input images for multi-image 3D generation.
-    """
     images = [image[0] for image in images]
     processed_images = [pipeline.preprocess_image(image) for image in images]
     return processed_images
 def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
     return {
         'gaussian': {
@@ -101,8 +97,7 @@ def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
             'faces': mesh.faces.cpu().numpy(),
         },
     }
 def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
     gs = Gaussian(
         aabb=state['gaussian']['aabb'],
@@ -112,7 +107,7 @@ def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
         opacity_bias=state['gaussian']['opacity_bias'],
         scaling_activation=state['gaussian']['scaling_activation'],
     )
-    # [수정] 데이터를 로드할 때 메인 GPU(cuda:0)으로 보냄
     gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda:0')
     gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda:0')
     gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda:0')
@@ -123,18 +118,17 @@ def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
         vertices=torch.tensor(state['mesh']['vertices'], device='cuda:0'),
         faces=torch.tensor(state['mesh']['faces'], device='cuda:0'),
     )
     return gs, mesh
 def get_seed(randomize_seed: bool, seed: int) -> int:
-    """
-    Get the random seed for generation.
-    """
     return np.random.randint(0, MAX_SEED) if randomize_seed else seed
-def align_camera(num_frames, extrinsic, intrinsic, rend_extrinsics, rend_intrinsics):
     extrinsic_tmp = extrinsic.clone()
     camera_relative = torch.matmul(extrinsic_tmp[:num_frames,:3,:3].permute(0,2,1), extrinsic_tmp[num_frames:,:3,:3])
     camera_relative_angle = torch.acos(((camera_relative[:,0,0] + camera_relative[:,1,1] + camera_relative[:,2,2] - 1) / 2).clamp(-1, 1))
@@ -155,20 +149,17 @@ def align_camera(num_frames, extrinsic, intrinsic, rend_extrinsics, rend_intrins
 def refine_pose_mast3r(rend_image_pil, target_image_pil, original_size, fxy, target_extrinsic, rend_depth):
     images_mast3r = load_images_new([rend_image_pil, target_image_pil], size=512, square_ok=True)
     with torch.no_grad():
-        # [수정] mast3r_model 추론 시 cuda:0 명시 (또는 할당된 device)
         output = inference([tuple(images_mast3r)], mast3r_model, "cuda:0", batch_size=1, verbose=False)
     view1, pred1 = output['view1'], output['pred1']
     view2, pred2 = output['view2'], output['pred2']
     del output
     desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
-    # find 2D-2D matches between the two images
     matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
                                                 device="cuda:0", dist='dot', block_size=2**13)
-    # ignore small border around the edge
     H0, W0 = view1['true_shape'][0]
     valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
         matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
@@ -187,7 +178,7 @@ def refine_pose_mast3r(rend_image_pil, target_image_pil, original_size, fxy, tar
         pixel[0] *= scale_x
         pixel[1] *= scale_y
     depth_map = rend_depth[0]
-    fx, fy, cx, cy = fxy.item(), fxy.item(), original_size[1]/2, original_size[0]/2  # Example values for focal lengths and principal point
     K = np.array([
         [fx, 0, cx],
         [0, fy, cy],
@@ -242,13 +233,10 @@ def pointcloud_registration(rend_image_pil, target_image_pil, original_size,
     del output
     desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
-    # find 2D-2D matches between the two images
     matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
                                                 device="cuda:0", dist='dot', block_size=2**13)
-    # ignore small border around the edge
     H0, W0 = view1['true_shape'][0]
     valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
         matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
@@ -267,7 +255,7 @@ def pointcloud_registration(rend_image_pil, target_image_pil, original_size,
         pixel[0] *= scale_x
         pixel[1] *= scale_y
     depth_map = rend_depth[0]
-    fx, fy, cx, cy = fxy.item(), fxy.item(), original_size[1]/2, original_size[0]/2  # Example values for focal lengths and principal point
     K = np.array([
         [fx, 0, cx],
         [0, fy, cy],
@@ -308,8 +296,7 @@ def pointcloud_registration(rend_image_pil, target_image_pil, original_size,
     scale_1 = dist_1[dist_1 < np.percentile(dist_1, 99)].mean()
     dist_2 = np.linalg.norm(points_3D_at_pixels_2 - points_3D_at_pixels_2.mean(axis=0), axis=1)
     scale_2 = dist_2[dist_2 < np.percentile(dist_2, 99)].mean()
-    # scale_1 = np.linalg.norm(points_3D_at_pixels - points_3D_at_pixels.mean(axis=0), axis=1).mean()
-    # scale_2 = np.linalg.norm(points_3D_at_pixels_2 - points_3D_at_pixels_2.mean(axis=0), axis=1).mean()
     points_3D_at_pixels_2 = points_3D_at_pixels_2 * (scale_1 / scale_2)
     pcd_1 = o3d.geometry.PointCloud()
     pcd_1.points = o3d.utility.Vector3dVector(points_3D_at_pixels)
@@ -334,7 +321,7 @@ def pointcloud_registration(rend_image_pil, target_image_pil, original_size,
     )
     return transformation_matrix, evaluation.fitness
-# [수정] @spaces.GPU 제거
 def generate_and_extract_glb(
     multiimages: List[Tuple[Image.Image, str]],
     seed: int,
@@ -354,9 +341,7 @@ def generate_and_extract_glb(
     trellis_stage2_start_t: float,
     req: gr.Request,
 ) -> Tuple[dict, str, str, str]:
-    """
-    Convert an image to a 3D model and extract GLB file.
-    """
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     image_files = [image[0] for image in multiimages]
@@ -376,23 +361,41 @@ def generate_and_extract_glb(
         },
         mode=multiimage_algo,
     )
     if refine == "Yes":
         try:
             images, alphas = load_and_preprocess_images(multiimages)
-            images, alphas = images.to(device), alphas.to(device)
             with torch.no_grad():
                 with torch.cuda.amp.autocast(dtype=pipeline.VGGT_dtype):
                     images = images[None]
-                    # [수정] 분산 배치된 VGGT_model 접근
-                    vggt = pipeline.VGGT_model if not hasattr(pipeline.VGGT_model, 'module') else pipeline.VGGT_model.module
-                    aggregated_tokens_list, ps_idx = vggt.aggregator(images)
                 # Predict Cameras
                 pose_enc = vggt.camera_head(aggregated_tokens_list)[-1]
-                # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
                 extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])
                 # Predict Point Cloud
-                point_map, point_conf = vggt.point_head(aggregated_tokens_list, images, ps_idx)
-                del aggregated_tokens_list
                 mask = (alphas[:,0,...][...,None] > 0.8)
                 conf_threshold = np.percentile(point_conf.cpu().numpy(), 50)
                 confidence_mask = (point_conf[0] > conf_threshold) & (point_conf[0] > 1e-5)
@@ -442,21 +445,24 @@ def generate_and_extract_glb(
             scale = np.linalg.norm(distance, axis=1).max()
             voxel_size = 1/64*scale*2
             pcd = pcd.voxel_down_sample(voxel_size)
-            # pcd.points = o3d.utility.Vector3dVector((coords[:,1:].cpu().numpy() + 0.5) / 64 - 0.5)
             for k in range(len(image_files)):
                 images = torch.stack([TF.ToTensor()(render_image) for render_image in video['color']] + [TF.ToTensor()(image_files[k].convert("RGB"))], dim=0)
                 # if len(images) == 0:
                 with torch.no_grad():
                     with torch.cuda.amp.autocast(dtype=pipeline.VGGT_dtype):
-                        # predictions = vggt_model(images.cuda())
-                        vggt = pipeline.VGGT_model if not hasattr(pipeline.VGGT_model, 'module') else pipeline.VGGT_model.module
-                        aggregated_tokens_list, ps_idx = vggt.aggregator(images[None].cuda())
                     pose_enc = vggt.camera_head(aggregated_tokens_list)[-1]
                 extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])
                 extrinsic, intrinsic = extrinsic[0], intrinsic[0]
                 extrinsic = torch.cat([extrinsic, torch.tensor([0,0,0,1])[None,None].repeat(extrinsic.shape[0], 1, 1).to(extrinsic.device)], dim=1)
-                del aggregated_tokens_list, ps_idx
                 target_extrinsic, target_intrinsic = align_camera(registration_num_frames, extrinsic, intrinsic, rend_extrinsics, rend_intrinsics)
                 fxy = target_intrinsic[:,0,0]
@@ -483,9 +489,9 @@ def generate_and_extract_glb(
                     target_image = images[registration_num_frames:].to(target_extrinsic.device)[j]
                     original_size = (rend_image.shape[1], rend_image.shape[2])
-                    import torchvision
-                    torchvision.utils.save_image(rend_image, 'rend_image_{}.png'.format(k))
-                    torchvision.utils.save_image(target_image, 'target_image_{}.png'.format(k))
                     mask_rend = (rend_image.detach().cpu() > 0).any(dim=0)
                     mask_target = (target_image.detach().cpu() > 0).any(dim=0)
@@ -516,7 +522,10 @@ def generate_and_extract_glb(
             target_intrinsics = torch.cat(target_intrinsics, dim=0)
             target_fitnesses_filtered = [x for x in target_fitnesses if x <= 1]
-            idx = target_fitnesses.index(max(target_fitnesses_filtered))
             target_transform = target_transforms[idx]
             down_pcd_align = copy.deepcopy(down_pcd).transform(target_transform)
             # pcd = o3d.geometry.PointCloud()
@@ -526,7 +535,7 @@ def generate_and_extract_glb(
                 o3d.pipelines.registration.TransformationEstimationPointToPoint(with_scaling=True),
                 o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration = 10000))
             down_pcd_align_2 = copy.deepcopy(down_pcd_align).transform(reg_p2p.transformation)
-            input_points = torch.tensor(np.asarray(down_pcd_align_2.points)).to(extrinsic.device).float()
             input_points = ((input_points + 0.5).clip(0, 1) * 64 - 0.5).to(torch.int32)
             outputs = pipeline.run_refine(
@@ -555,12 +564,10 @@ def generate_and_extract_glb(
             )
         except Exception as e:
             print(f"Error during refinement: {e}")
     # Render video
-    # import uuid
-    # output_id = str(uuid.uuid4())
-    # os.makedirs(f"{TMP_DIR}/{output_id}", exist_ok=True)
-    # video_path = f"{TMP_DIR}/{output_id}/preview.mp4"
-    # glb_path = f"{TMP_DIR}/{output_id}/mesh.glb"
     video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
     video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
     video = [np.concatenate([video[i], video_geo[i]], axis=1) for i in range(len(video))]
@@ -581,11 +588,7 @@ def generate_and_extract_glb(
     return state, video_path, glb_path, glb_path
-# [수정] @spaces.GPU 제거
 def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
-    """
-    Extract a Gaussian splatting file from the generated 3D model.
-    """
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     gs, _ = unpack_state(state)
     gaussian_path = os.path.join(user_dir, 'sample.ply')
@@ -595,6 +598,7 @@ def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
 def prepare_multi_example() -> List[Image.Image]:
     multi_case = list(set([i.split('_')[0] for i in os.listdir("assets/example_multi_image")]))
     images = []
     for case in multi_case:
@@ -611,14 +615,12 @@ def prepare_multi_example() -> List[Image.Image]:
 def split_image(image: Image.Image) -> List[Image.Image]:
-    """
-    Split a multi-view image into separate view images.
-    """
     image_np = np.array(image)
-    # [안정성 추가] 채널 3개짜리(RGB) 이미지가 들어올 경우 에러 방지
     if image_np.shape[-1] < 4:
         return [preprocess_image(image)]
     alpha = image_np[..., 3]
     alpha = np.any(alpha>0, axis=0)
     start_pos = np.where(~alpha[:-1] & alpha[1:])[0].tolist()
@@ -775,29 +777,36 @@ with demo:
     )
-# Launch the Gradio app - VRAM 4개 분산 최적화 적용
 if __name__ == "__main__":
     pipeline = TrellisVGGTTo3DPipeline.from_pretrained("Stable-X/trellis-vggt-v0-2")
     num_gpus = torch.cuda.device_count()
     if num_gpus >= 4:
-        # [VRAM 분산 핵심] 각 모델을 물리적으로 다른 GPU에 로드하여 OOM 방지
-        pipeline.to("cuda:0") # 메인 파이프라인
         if hasattr(pipeline, 'VGGT_model'):
             pipeline.VGGT_model.to("cuda:1")
         if hasattr(pipeline, 'birefnet_model'):
             pipeline.birefnet_model.to("cuda:2")
-        # 가장 무거운 디코더들을 3번 GPU로 격리
         if hasattr(pipeline, 'slat_decoder'):
             pipeline.slat_decoder.to("cuda:3")
         if hasattr(pipeline, 'sparse_structure_decoder'):
             pipeline.sparse_structure_decoder.to("cuda:3")
-        # Mast3r 모델 로드 (cuda:0 사용)
         mast3r_model = AsymmetricMASt3R.from_pretrained("naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").to("cuda:0").eval()
-        print("Success: 4 GPU VRAM Sharding Activated.")
     else:
         pipeline.cuda()
         mast3r_model = AsymmetricMASt3R.from_pretrained("naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").cuda().eval()
     demo.launch()

+import os
+import sys
+import subprocess
+import shutil
+import numpy as np
+# [1] Install Open3D automatically if it is missing (fixes ModuleNotFoundError)
+try:
+    import open3d as o3d
+except ImportError:
+    print("Open3D not found. Installing...")
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "open3d"])
+    import open3d as o3d
 import gradio as gr
+# spaces is still imported, but the @spaces.GPU decorator is deliberately not used.
+import spaces
 from gradio_litmodel3d import LitModel3D
 os.environ['SPCONV_ALGO'] = 'native'
 from typing import *
 import torch
 import imageio
+import cv2
 from easydict import EasyDict as edict
 from PIL import Image
+from torchvision import transforms as TF
+import copy
+# Trellis library
 from trellis.pipelines import TrellisVGGTTo3DPipeline
 from trellis.representations import Gaussian, MeshExtractResult
 from trellis.utils import render_utils, postprocessing_utils
+from trellis.utils.general_utils import *
+# Custom wheel libraries (used by the original logic)
+sys.path.append("wheels")
 from wheels.vggt.vggt.utils.load_fn import load_and_preprocess_images
 from wheels.vggt.vggt.utils.pose_enc import pose_encoding_to_extri_intri
 from wheels.mast3r.model import AsymmetricMASt3R
 from wheels.mast3r.fast_nn import fast_reciprocal_NNs
 from wheels.dust3r.dust3r.inference import inference
 from wheels.dust3r.dust3r.utils.image import load_images_new
 MAX_SEED = np.iinfo(np.int32).max
 TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
 os.makedirs(TMP_DIR, exist_ok=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Session management ---
 def start_session(req: gr.Request):
     """Create the scratch directory for the session given by ``req.session_hash``.
     The directory is placed under ``TMP_DIR``; ``exist_ok=True`` makes a
     repeated call for the same session a no-op.
     """
     session_dir = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(session_dir, exist_ok=True)
 def end_session(req: gr.Request):
     """Delete the scratch directory of the session given by ``req.session_hash``.
     Does nothing when the directory under ``TMP_DIR`` does not exist.
     """
     session_dir = os.path.join(TMP_DIR, str(req.session_hash))
     if not os.path.exists(session_dir):
         # Nothing was created for this session, so there is nothing to clean up.
         return
     shutil.rmtree(session_dir)
+# --- Preprocessing functions (@spaces.GPU removed) ---
 def preprocess_image(image: Image.Image) -> Image.Image:
     """Return ``image`` after running it through ``pipeline.preprocess_image``."""
     # ``pipeline`` is the module-level global assigned in the __main__ block.
     return pipeline.preprocess_image(image)
 def preprocess_videos(video: str) -> List[Tuple[Image.Image, str]]:
     vid = imageio.get_reader(video, 'ffmpeg')
     fps = vid.get_meta_data()['fps']
     images = []
     processed_images = [pipeline.preprocess_image(image) for image in images]
     return processed_images
 def preprocess_images(images: List[Tuple[Image.Image, str]]) -> List[Image.Image]:
     """Preprocess the first element of every entry in ``images``.
     Each entry is indexed with ``[0]`` to obtain the image, which is then
     passed through ``pipeline.preprocess_image``; results keep input order.
     """
     # Extract all images first, then preprocess, so extraction errors surface
     # before any preprocessing work starts.
     pil_images = [entry[0] for entry in images]
     return [pipeline.preprocess_image(pil) for pil in pil_images]
+# --- State management ---
 def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
     return {
         'gaussian': {
             'faces': mesh.faces.cpu().numpy(),
         },
     }
 def unpack_state(state: dict) -> Tuple[Gaussian, edict, str]:
     gs = Gaussian(
         aabb=state['gaussian']['aabb'],
         opacity_bias=state['gaussian']['opacity_bias'],
         scaling_activation=state['gaussian']['scaling_activation'],
     )
+    # 로드 시 메인 GPU(cuda:0)로 복귀
     gs._xyz = torch.tensor(state['gaussian']['_xyz'], device='cuda:0')
     gs._features_dc = torch.tensor(state['gaussian']['_features_dc'], device='cuda:0')
     gs._scaling = torch.tensor(state['gaussian']['_scaling'], device='cuda:0')
         vertices=torch.tensor(state['mesh']['vertices'], device='cuda:0'),
         faces=torch.tensor(state['mesh']['faces'], device='cuda:0'),
     )
     return gs, mesh
 def get_seed(randomize_seed: bool, seed: int) -> int:
     """Return the seed to use for generation.
     When ``randomize_seed`` is falsy the caller's ``seed`` is returned
     unchanged; otherwise a value is drawn with
     ``np.random.randint(0, MAX_SEED)``.
     """
     if not randomize_seed:
         return seed
     return np.random.randint(0, MAX_SEED)
+# --- [Restored from original] Precise math / pose functions ---
+def perform_rodrigues_transformation(rvec):
+    """Pass ``rvec`` to ``cv2.Rodrigues`` and return the first element of its result.
+    NOTE(review): presumably ``rvec`` is a rotation vector and the returned
+    value a 3x3 rotation matrix -- confirm against the callers.
+    """
+    R, _ = cv2.Rodrigues(rvec)
+    return R
+def align_camera(num_frames, extrinsic, intrinsic, rend_extrinsics, rend_intrinsics):
     extrinsic_tmp = extrinsic.clone()
     camera_relative = torch.matmul(extrinsic_tmp[:num_frames,:3,:3].permute(0,2,1), extrinsic_tmp[num_frames:,:3,:3])
     camera_relative_angle = torch.acos(((camera_relative[:,0,0] + camera_relative[:,1,1] + camera_relative[:,2,2] - 1) / 2).clamp(-1, 1))
 def refine_pose_mast3r(rend_image_pil, target_image_pil, original_size, fxy, target_extrinsic, rend_depth):
     images_mast3r = load_images_new([rend_image_pil, target_image_pil], size=512, square_ok=True)
     with torch.no_grad():
+        # [GPU 수정] mast3r 모델 추론 시 cuda:0 명시
         output = inference([tuple(images_mast3r)], mast3r_model, "cuda:0", batch_size=1, verbose=False)
     view1, pred1 = output['view1'], output['pred1']
     view2, pred2 = output['view2'], output['pred2']
     del output
     desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
     matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
                                                 device="cuda:0", dist='dot', block_size=2**13)
     H0, W0 = view1['true_shape'][0]
     valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
         matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
         pixel[0] *= scale_x
         pixel[1] *= scale_y
     depth_map = rend_depth[0]
+    fx, fy, cx, cy = fxy.item(), fxy.item(), original_size[1]/2, original_size[0]/2
     K = np.array([
         [fx, 0, cx],
         [0, fy, cy],
     del output
     desc1, desc2 = pred1['desc'].squeeze(0).detach(), pred2['desc'].squeeze(0).detach()
     matches_im0, matches_im1 = fast_reciprocal_NNs(desc1, desc2, subsample_or_initxy1=8,
                                                 device="cuda:0", dist='dot', block_size=2**13)
     H0, W0 = view1['true_shape'][0]
     valid_matches_im0 = (matches_im0[:, 0] >= 3) & (matches_im0[:, 0] < int(W0) - 3) & (
         matches_im0[:, 1] >= 3) & (matches_im0[:, 1] < int(H0) - 3)
         pixel[0] *= scale_x
         pixel[1] *= scale_y
     depth_map = rend_depth[0]
+    fx, fy, cx, cy = fxy.item(), fxy.item(), original_size[1]/2, original_size[0]/2
     K = np.array([
         [fx, 0, cx],
         [0, fy, cy],
     scale_1 = dist_1[dist_1 < np.percentile(dist_1, 99)].mean()
     dist_2 = np.linalg.norm(points_3D_at_pixels_2 - points_3D_at_pixels_2.mean(axis=0), axis=1)
     scale_2 = dist_2[dist_2 < np.percentile(dist_2, 99)].mean()
     points_3D_at_pixels_2 = points_3D_at_pixels_2 * (scale_1 / scale_2)
     pcd_1 = o3d.geometry.PointCloud()
     pcd_1.points = o3d.utility.Vector3dVector(points_3D_at_pixels)
     )
     return transformation_matrix, evaluation.fitness
+# [Modified] Main generation function (refine logic fully restored + access to models distributed across GPUs)
 def generate_and_extract_glb(
     multiimages: List[Tuple[Image.Image, str]],
     seed: int,
     trellis_stage2_start_t: float,
     req: gr.Request,
 ) -> Tuple[dict, str, str, str]:
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     image_files = [image[0] for image in multiimages]
         },
         mode=multiimage_algo,
     )
     if refine == "Yes":
         try:
             images, alphas = load_and_preprocess_images(multiimages)
+            # 이미지를 cuda:0 (또는 사용 가능한 GPU)으로 이동
+            images, alphas = images.to("cuda:0"), alphas.to("cuda:0")
             with torch.no_grad():
                 with torch.cuda.amp.autocast(dtype=pipeline.VGGT_dtype):
                     images = images[None]
+                    # [VRAM 분산 대응] VGGT_model이 다른 GPU에 있어도 호출 가능하도록 처리
+                    if hasattr(pipeline.VGGT_model, 'module'):
+                        vggt = pipeline.VGGT_model.module
+                    else:
+                        vggt = pipeline.VGGT_model
+                    # 입력 이미지를 VGGT 모델이 있는 GPU로 임시 이동
+                    target_device = next(vggt.parameters()).device
+                    images_in = images.to(target_device)
+                    aggregated_tokens_list, ps_idx = vggt.aggregator(images_in)
                 # Predict Cameras
                 pose_enc = vggt.camera_head(aggregated_tokens_list)[-1]
                 extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])
                 # Predict Point Cloud
+                point_map, point_conf = vggt.point_head(aggregated_tokens_list, images_in, ps_idx)
+                # 결과물을 다시 CPU/메인 GPU로 가져와서 처리
+                point_map = point_map.to("cuda:0")
+                point_conf = point_conf.to("cuda:0")
+                extrinsic = extrinsic.to("cuda:0")
+                intrinsic = intrinsic.to("cuda:0")
+                del aggregated_tokens_list, images_in
                 mask = (alphas[:,0,...][...,None] > 0.8)
                 conf_threshold = np.percentile(point_conf.cpu().numpy(), 50)
                 confidence_mask = (point_conf[0] > conf_threshold) & (point_conf[0] > 1e-5)
             scale = np.linalg.norm(distance, axis=1).max()
             voxel_size = 1/64*scale*2
             pcd = pcd.voxel_down_sample(voxel_size)
             for k in range(len(image_files)):
                 images = torch.stack([TF.ToTensor()(render_image) for render_image in video['color']] + [TF.ToTensor()(image_files[k].convert("RGB"))], dim=0)
                 # if len(images) == 0:
                 with torch.no_grad():
                     with torch.cuda.amp.autocast(dtype=pipeline.VGGT_dtype):
+                        # [VRAM 분산 대응] VGGT_model 호출
+                        target_device = next(vggt.parameters()).device
+                        images_in = images[None].to(target_device)
+                        aggregated_tokens_list, ps_idx = vggt.aggregator(images_in)
                     pose_enc = vggt.camera_head(aggregated_tokens_list)[-1]
+                # 결과 회수
+                pose_enc = pose_enc.to("cuda:0")
                 extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])
                 extrinsic, intrinsic = extrinsic[0], intrinsic[0]
                 extrinsic = torch.cat([extrinsic, torch.tensor([0,0,0,1])[None,None].repeat(extrinsic.shape[0], 1, 1).to(extrinsic.device)], dim=1)
+                del aggregated_tokens_list, ps_idx, images_in
                 target_extrinsic, target_intrinsic = align_camera(registration_num_frames, extrinsic, intrinsic, rend_extrinsics, rend_intrinsics)
                 fxy = target_intrinsic[:,0,0]
                     target_image = images[registration_num_frames:].to(target_extrinsic.device)[j]
                     original_size = (rend_image.shape[1], rend_image.shape[2])
+                    # import torchvision
+                    # torchvision.utils.save_image(rend_image, 'rend_image_{}.png'.format(k))
+                    # torchvision.utils.save_image(target_image, 'target_image_{}.png'.format(k))
                     mask_rend = (rend_image.detach().cpu() > 0).any(dim=0)
                     mask_target = (target_image.detach().cpu() > 0).any(dim=0)
             target_intrinsics = torch.cat(target_intrinsics, dim=0)
             target_fitnesses_filtered = [x for x in target_fitnesses if x <= 1]
+            if len(target_fitnesses_filtered) > 0:
+                idx = target_fitnesses.index(max(target_fitnesses_filtered))
+            else:
+                idx = 0
             target_transform = target_transforms[idx]
             down_pcd_align = copy.deepcopy(down_pcd).transform(target_transform)
             # pcd = o3d.geometry.PointCloud()
                 o3d.pipelines.registration.TransformationEstimationPointToPoint(with_scaling=True),
                 o3d.pipelines.registration.ICPConvergenceCriteria(max_iteration = 10000))
             down_pcd_align_2 = copy.deepcopy(down_pcd_align).transform(reg_p2p.transformation)
+            input_points = torch.tensor(np.asarray(down_pcd_align_2.points)).to("cuda:0").float()
             input_points = ((input_points + 0.5).clip(0, 1) * 64 - 0.5).to(torch.int32)
             outputs = pipeline.run_refine(
             )
         except Exception as e:
             print(f"Error during refinement: {e}")
+            import traceback
+            traceback.print_exc()
     # Render video
     video = render_utils.render_video(outputs['gaussian'][0], num_frames=120)['color']
     video_geo = render_utils.render_video(outputs['mesh'][0], num_frames=120)['normal']
     video = [np.concatenate([video[i], video_geo[i]], axis=1) for i in range(len(video))]
     return state, video_path, glb_path, glb_path
 def extract_gaussian(state: dict, req: gr.Request) -> Tuple[str, str]:
     user_dir = os.path.join(TMP_DIR, str(req.session_hash))
     gs, _ = unpack_state(state)
     gaussian_path = os.path.join(user_dir, 'sample.ply')
 def prepare_multi_example() -> List[Image.Image]:
+    if not os.path.exists("assets/example_multi_image"): return []
     multi_case = list(set([i.split('_')[0] for i in os.listdir("assets/example_multi_image")]))
     images = []
     for case in multi_case:
 def split_image(image: Image.Image) -> List[Image.Image]:
     image_np = np.array(image)
+    # [수정] 채널 체크: RGBA(4)가 아닐 경우 단일 처리
     if image_np.shape[-1] < 4:
         return [preprocess_image(image)]
     alpha = image_np[..., 3]
     alpha = np.any(alpha>0, axis=0)
     start_pos = np.where(~alpha[:-1] & alpha[1:])[0].tolist()
     )
+# [Modified] Logic that distributes the models across the VRAM of 4 GPUs (main entry point)
 if __name__ == "__main__":
+    # 1. Load the pipeline (main GPU)
     pipeline = TrellisVGGTTo3DPipeline.from_pretrained("Stable-X/trellis-vggt-v0-2")
     num_gpus = torch.cuda.device_count()
+    print(f"시스템에서 감지된 GPU 개수: {num_gpus}")
     if num_gpus >= 4:
+        # [Key point] Manually spread the models over 4 GPUs to avoid VRAM OOM
+        pipeline.to("cuda:0") # the pipeline shell goes on GPU 0
+        # Move the sub-models
         if hasattr(pipeline, 'VGGT_model'):
             pipeline.VGGT_model.to("cuda:1")
         if hasattr(pipeline, 'birefnet_model'):
             pipeline.birefnet_model.to("cuda:2")
+        # Isolate the heaviest decoders on GPU 3
         if hasattr(pipeline, 'slat_decoder'):
             pipeline.slat_decoder.to("cuda:3")
         if hasattr(pipeline, 'sparse_structure_decoder'):
             pipeline.sparse_structure_decoder.to("cuda:3")
+        # The MASt3R model used for refinement goes on GPU 0 (or wherever memory is available)
         mast3r_model = AsymmetricMASt3R.from_pretrained("naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").to("cuda:0").eval()
+        print("--- 4 GPU VRAM 분산 배치 완료 ---")
     else:
+        # Fewer than 4 GPUs: load everything the ordinary way
         pipeline.cuda()
         mast3r_model = AsymmetricMASt3R.from_pretrained("naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric").cuda().eval()
     demo.launch()