Spaces:

opsiclear-admin
/

Trellis.2.multiview

Running on Zero

App Files Community

opsiclear-admin committed 14 days ago

Commit

a0ea9e7

verified ·

1 Parent(s): 1de8ccc

Fix multidiffusion CFG and add per-stage texture mode

Browse files

Files changed (1) hide show

trellis2/pipelines/trellis2_image_to_3d.py +37 -21

trellis2/pipelines/trellis2_image_to_3d.py CHANGED Viewed

@@ -637,23 +637,36 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         elif mode == 'multidiffusion':
             from .samplers import FlowEulerSampler
-            def _new_inference_model(self, model, x_t, t, cond, **kwargs):
-                # if cfg_interval[0] <= t <= cfg_interval[1]:
-                #     preds = []
-                #     for i in range(len(cond)):
-                #         preds.append(FlowEulerSampler._inference_model(self, model, x_t, t, cond[i:i + 1], **kwargs))
-                #     pred = sum(preds) / len(preds)
-                #     neg_pred = FlowEulerSampler._inference_model(self, model, x_t, t, neg_cond, **kwargs)
-                #     return (1 + cfg_strength) * pred - cfg_strength * neg_pred
-                # else:
-                # Filter out guidance-related kwargs that the base sampler doesn't handle
-                filtered_kwargs = {k: v for k, v in kwargs.items()
-                                   if k not in ('neg_cond', 'guidance_strength', 'guidance_interval', 'guidance_rescale')}
                 preds = []
                 for i in range(len(cond)):
-                    preds.append(FlowEulerSampler._inference_model(self, model, x_t, t, cond[i:i + 1], **filtered_kwargs))
-                pred = sum(preds) / len(preds)
                 return pred
         else:
@@ -679,7 +692,8 @@ class Trellis2ImageTo3DPipeline(Pipeline):
             return_latent: bool = False,
             pipeline_type: Optional[str] = None,
             max_num_tokens: int = 49152,
-            mode: Literal['stochastic', 'multidiffusion'] = 'multidiffusion',
     ) -> List[MeshWithVoxel]:
         """
         Run the multi-image pipeline.
@@ -695,8 +709,10 @@ class Trellis2ImageTo3DPipeline(Pipeline):
             return_latent (bool): Whether to return the latent codes.
             pipeline_type (str): The type of the pipeline. Options: '512', '1024', '1024_cascade', '1536_cascade'.
             max_num_tokens (int): The maximum number of tokens to use.
-            mode: The multi-image conditioning mode.
         """
         # Check pipeline type
         pipeline_type = pipeline_type or self.default_pipeline_type
         if pipeline_type == '512':
@@ -740,7 +756,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         if pipeline_type == '512':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
-                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=mode),
             ):
                 shape_slat = self.sample_shape_slat(
                     cond_512, self.models['shape_slat_flow_model_512'],
@@ -754,7 +770,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         elif pipeline_type == '1024':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
-                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=mode),
             ):
                 shape_slat = self.sample_shape_slat(
                     cond_1024, self.models['shape_slat_flow_model_1024'],
@@ -768,7 +784,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         elif pipeline_type == '1024_cascade':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
-                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=mode),
             ):
                 shape_slat, res = self.sample_shape_slat_cascade(
                     cond_512, cond_1024,
@@ -784,7 +800,7 @@ class Trellis2ImageTo3DPipeline(Pipeline):
         elif pipeline_type == '1536_cascade':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
-                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=mode),
             ):
                 shape_slat, res = self.sample_shape_slat_cascade(
                     cond_512, cond_1024,

         elif mode == 'multidiffusion':
             from .samplers import FlowEulerSampler
+            def _new_inference_model(self, model, x_t, t, cond,
+                                     neg_cond=None, guidance_strength=1,
+                                     guidance_interval=(0, 1), guidance_rescale=0.0,
+                                     **kwargs):
+                # Average per-image positive predictions
                 preds = []
                 for i in range(len(cond)):
+                    preds.append(FlowEulerSampler._inference_model(self, model, x_t, t, cond[i:i + 1], **kwargs))
+                avg_pred = sum(preds) / len(preds)
+                # Apply guidance interval
+                if not (guidance_interval[0] <= t <= guidance_interval[1]):
+                    guidance_strength = 1
+                # Apply CFG
+                if guidance_strength == 1 or neg_cond is None:
+                    return avg_pred
+                neg_pred = FlowEulerSampler._inference_model(self, model, x_t, t, neg_cond, **kwargs)
+                pred = guidance_strength * avg_pred + (1 - guidance_strength) * neg_pred
+                # Apply guidance rescale
+                if guidance_rescale > 0:
+                    x_0_pos = self._pred_to_xstart(x_t, t, avg_pred)
+                    x_0_cfg = self._pred_to_xstart(x_t, t, pred)
+                    std_pos = x_0_pos.std(dim=list(range(1, x_0_pos.ndim)), keepdim=True)
+                    std_cfg = x_0_cfg.std(dim=list(range(1, x_0_cfg.ndim)), keepdim=True)
+                    x_0_rescaled = x_0_cfg * (std_pos / std_cfg)
+                    x_0 = guidance_rescale * x_0_rescaled + (1 - guidance_rescale) * x_0_cfg
+                    pred = self._xstart_to_pred(x_t, t, x_0)
                 return pred
         else:
             return_latent: bool = False,
             pipeline_type: Optional[str] = None,
             max_num_tokens: int = 49152,
+            mode: Literal['stochastic', 'multidiffusion'] = 'stochastic',
+            tex_mode: Optional[Literal['stochastic', 'multidiffusion']] = None,
     ) -> List[MeshWithVoxel]:
         """
         Run the multi-image pipeline.
             return_latent (bool): Whether to return the latent codes.
             pipeline_type (str): The type of the pipeline. Options: '512', '1024', '1024_cascade', '1536_cascade'.
             max_num_tokens (int): The maximum number of tokens to use.
+            mode: The multi-image conditioning mode for structure and shape.
+            tex_mode: The multi-image conditioning mode for texture. If None, uses mode.
         """
+        tex_mode = tex_mode or mode
         # Check pipeline type
         pipeline_type = pipeline_type or self.default_pipeline_type
         if pipeline_type == '512':
         if pipeline_type == '512':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
+                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=tex_mode),
             ):
                 shape_slat = self.sample_shape_slat(
                     cond_512, self.models['shape_slat_flow_model_512'],
         elif pipeline_type == '1024':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
+                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=tex_mode),
             ):
                 shape_slat = self.sample_shape_slat(
                     cond_1024, self.models['shape_slat_flow_model_1024'],
         elif pipeline_type == '1024_cascade':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
+                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=tex_mode),
             ):
                 shape_slat, res = self.sample_shape_slat_cascade(
                     cond_512, cond_1024,
         elif pipeline_type == '1536_cascade':
             with (
                 self.inject_sampler_multi_image('shape_slat_sampler', len(images), shape_slat_steps, mode=mode),
+                self.inject_sampler_multi_image('tex_slat_sampler', len(images), tex_slat_steps, mode=tex_mode),
             ):
                 shape_slat, res = self.sample_shape_slat_cascade(
                     cond_512, cond_1024,