pipeline update

pipeline.py  CHANGED  (+81 -176)

@@ -6938,177 +6938,8 @@ class InterleaveInferencer:
         elif isinstance(i, str):
             output_dict['text'] = i
         return output_dict
-
-
-# """
-# A “naive” Bagel wrapper that replicates your notebook exactly.
-# """
-
-# model_cpu_offload_seq = "bagel_model"
-
-# def __init__(
-#     self,
-#     torch_dtype: torch.dtype = torch.bfloat16,
-# ):
-#     super().__init__()
-#     self._dtype = torch_dtype
-#     self._built = False
-#     self._inferencer = None
-#     self.new_token_ids: List[int] = []
-#     # Hard-code default weights path; overridden by from_pretrained
-#     self.weights_root: Optional[str] = None
-#     self.register_to_config(weights_root=self.weights_root, torch_dtype=torch_dtype)
-#     repo_id = "ByteDance-Seed/BAGEL-7B-MoT"
-#     model_path = snapshot_download(repo_id=repo_id)
-#     print("loaded from ", model_path)
-#     # LLM config preparing
-#     llm_config = Qwen2Config.from_json_file(os.path.join(model_path, "llm_config.json"))
-#     llm_config.qk_norm = True
-#     llm_config.tie_word_embeddings = False
-#     llm_config.layer_module = "Qwen2MoTDecoderLayer"
-
-#     # ViT config preparing
-#     vit_config = SiglipVisionConfig.from_json_file(os.path.join(model_path, "vit_config.json"))
-#     vit_config.rope = False
-#     vit_config.num_hidden_layers = vit_config.num_hidden_layers - 1
-
-#     # VAE loading
-#     vae_model, vae_config = load_ae(local_path=os.path.join(model_path, "ae.safetensors"))
-
-#     # Bagel config preparing
-#     config = BagelConfig(
-#         visual_gen=True,
-#         visual_und=True,
-#         llm_config=llm_config,
-#         vit_config=vit_config,
-#         vae_config=vae_config,
-#         vit_max_num_patch_per_side=70,
-#         connector_act='gelu_pytorch_tanh',
-#         latent_patch_size=2,
-#         max_latent_size=64,
-#     )
-
-#     with init_empty_weights():
-#         language_model = Qwen2ForCausalLM(llm_config)
-#         vit_model = SiglipVisionModel(vit_config)
-#         model = Bagel(language_model, vit_model, config)
-#         model.vit_model.vision_model.embeddings.convert_conv2d_to_linear(vit_config, meta=True)
-
-#     # Tokenizer Preparing
-#     tokenizer = Qwen2Tokenizer.from_pretrained(model_path)
-#     tokenizer, new_token_ids, _ = add_special_tokens(tokenizer)
-
-#     # Image Transform Preparing
-#     vae_transform = ImageTransform(1024, 512, 16)
-#     vit_transform = ImageTransform(980, 224, 14)
-
-#     # set cuda device to 4
-
-#     max_mem_per_gpu = "40GiB"  # Modify it according to your GPU setting. On an A100, 80 GiB is sufficient to load on a single GPU.
-
-#     device_map = infer_auto_device_map(
-#         model,
-#         max_memory={i: max_mem_per_gpu for i in range(torch.cuda.device_count())},
-#         no_split_module_classes=["Bagel", "Qwen2MoTDecoderLayer"],
-#     )
-#     print(device_map)
-
-#     same_device_modules = [
-#         'language_model.model.embed_tokens',
-#         'time_embedder',
-#         'latent_pos_embed',
-#         'vae2llm',
-#         'llm2vae',
-#         'connector',
-#         'vit_pos_embed'
-#     ]
-
-#     if torch.cuda.device_count() == 1:
-#         first_device = device_map.get(same_device_modules[0], "cuda:0")
-#         for k in same_device_modules:
-#             if k in device_map:
-#                 device_map[k] = first_device
-#             else:
-#                 device_map[k] = "cuda:0"
-#     else:
-#         first_device = device_map.get(same_device_modules[0])
-#         for k in same_device_modules:
-#             if k in device_map:
-#                 device_map[k] = first_device
-
-#     # Thanks @onion-liu: https://github.com/ByteDance-Seed/Bagel/pull/8
-#     model = load_checkpoint_and_dispatch(
-#         model,
-#         checkpoint=os.path.join(model_path, "ema.safetensors"),
-#         device_map=device_map,
-#         offload_buffers=True,
-#         dtype=torch.bfloat16,
-#         force_hooks=True,
-#         offload_folder="/tmp/offload"
-#     )
-
-#     model = model.eval()
-#     print('Model loaded')
-
-#     self._inferencer = InterleaveInferencer(
-#         model=model,
-#         vae_model=vae_model,
-#         tokenizer=tokenizer,
-#         vae_transform=vae_transform,
-#         vit_transform=vit_transform,
-#         new_token_ids=new_token_ids
-#     )
-
-#     seed = 42
-#     random.seed(seed)
-#     np.random.seed(seed)
-#     torch.manual_seed(seed)
-#     if torch.cuda.is_available():
-#         torch.cuda.manual_seed(seed)
-#         torch.cuda.manual_seed_all(seed)
-#         torch.backends.cudnn.deterministic = True
-#         torch.backends.cudnn.benchmark = False
-
-
-# @torch.no_grad()
-# def __call__(
-#     self,
-#     prompt: str,
-#     think=False,
-#     cfg_text_scale: float = 4.0,
-#     cfg_img_scale: float = 1.0,
-#     cfg_interval=(0.4, 1.0),
-#     timestep_shift: float = 3.0,
-#     num_timesteps: int = 50,
-#     cfg_renorm_min: float = 0.0,
-#     cfg_renorm_type: str = "global",
-#     seed: Optional[int] = None,
-#     output_type: str = "pil",
-#     return_dict: bool = True,
-#     **unused,
-# ):
-
-#     if seed is not None:
-#         torch.manual_seed(seed)
-#         if torch.cuda.is_available():
-#             torch.cuda.manual_seed_all(seed)
-
-#     inference_kwargs = dict(
-#         text=prompt,
-#         think=think,
-#         cfg_text_scale=cfg_text_scale,
-#         cfg_img_scale=cfg_img_scale,
-#         cfg_interval=list(cfg_interval),
-#         timestep_shift=timestep_shift,
-#         num_timesteps=num_timesteps,
-#         cfg_renorm_min=cfg_renorm_min,
-#         cfg_renorm_type=cfg_renorm_type,
-#     )
-#     result = self._inferencer(**inference_kwargs)
-#     image = result["image"] if isinstance(result, dict) else result
-#     if return_dict:
-#         return {"images": [image]}
-#     return [image]
+
+from diffusers import DiffusionPipeline, PipelineOutput
 
 class BagelPipeline(DiffusionPipeline):
     model_cpu_offload_seq = "bagel_model"
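Note on the added import: current diffusers releases do not export a top-level `PipelineOutput`; the library's return objects are built on `diffusers.utils.BaseOutput` (concrete classes such as `ImagePipelineOutput`). If the import above fails, a minimal stand-in with the two fields this pipeline actually returns could look like the sketch below; the class name `BagelPipelineOutput` and its field defaults are assumptions, not part of the commit.

# Hypothetical stand-in for the `PipelineOutput` imported above, built on
# diffusers' BaseOutput. BaseOutput subclasses are dataclasses with both
# dict-style and attribute-style access, which matches how __call__ builds
# its return value from `images` / `text` keyword arguments.
from dataclasses import dataclass
from typing import List, Optional

from diffusers.utils import BaseOutput
from PIL import Image


@dataclass
class BagelPipelineOutput(BaseOutput):
    images: Optional[List[Image.Image]] = None  # generated or edited images, if any
    text: Optional[str] = None                  # "think" trace or understanding answer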
@@ -7130,11 +6961,85 @@ class BagelPipeline(DiffusionPipeline):
             new_token_ids=new_token_ids,
         )
 
-    def __call__(
-
-
-
-
+    def __call__(
+        self,
+        *,
+        image: Optional[Image.Image] = None,
+        text: Optional[str] = None,
+        think: bool = False,
+        understanding_output: bool = False,
+        **infer_kwargs
+    ) -> PipelineOutput:
+        """
+        Supports:
+          - text→image (pass text=…)
+          - text→image + think (+ think=True)
+          - image→image edit (pass image=…, text=…)
+          - image→image + think (+ think=True)
+          - image→understanding (+ understanding_output=True)
+        Any other kwargs (cfg_text_scale, num_timesteps, etc.) override the defaults below.
+        """
+
+        if text is not None and image is None:
+            defaults: Dict[str, Any] = {
+                "cfg_text_scale": 4.0,
+                "cfg_img_scale": 1.0,
+                "cfg_interval": (0.4, 1.0),
+                "timestep_shift": 3.0,
+                "num_timesteps": 50,
+                "cfg_renorm_min": 0.0,
+                "cfg_renorm_type": "global",
+            }
+            if think:
+                defaults.update({
+                    "max_think_token_n": 1000,
+                    "do_sample": False,
+                    "text_temperature": 0.3,
+                })
+
+        elif image is not None and text is not None and not understanding_output:
+            defaults = {
+                "cfg_text_scale": 4.0,
+                "cfg_img_scale": 2.0,
+                "cfg_interval": (0.0, 1.0),
+                "timestep_shift": 3.0,
+                "num_timesteps": 50,
+                "cfg_renorm_min": 0.0,
+                "cfg_renorm_type": "text_channel",
+            }
+            if think:
+                defaults.update({
+                    "max_think_token_n": 1000,
+                    "do_sample": False,
+                    "text_temperature": 0.3,
+                })
+
+        elif image is not None and understanding_output:
+            defaults = {
+                "max_think_token_n": 1000,
+                "do_sample": False,
+            }
+
+        else:
+            defaults = {}
+
+        for k, v in defaults.items():
+            infer_kwargs.setdefault(k, v)
+
+        result: Dict[str, Any] = self._inferencer(
+            image=image,
+            text=text,
+            think=think,
+            understanding_output=understanding_output,
+            **infer_kwargs,
+        )
+
+        out_kwargs: Dict[str, Any] = {}
+        if result.get("image") is not None:
+            out_kwargs["images"] = [result["image"]]
+        if result.get("text") is not None:
+            out_kwargs["text"] = result["text"]
+        return PipelineOutput(**out_kwargs)
 
     def to(self, device):
         super().to(device)  # moves registered modules
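For reference, a minimal usage sketch of the reworked `__call__` (assumptions: an already-constructed pipeline instance named `pipe`, e.g. from `BagelPipeline.from_pretrained(...)`; an output object exposing `images`/`text` attributes as BaseOutput-style outputs do; file names and prompts are illustrative):

from PIL import Image

# text -> image: keyword-only call, picks the "global" cfg_renorm defaults
out = pipe(text="a photo of a bagel on a wooden table")
generated = out.images[0]

# image -> image edit: switches to cfg_img_scale=2.0 and "text_channel" renorm
edited = pipe(image=Image.open("input.png"), text="make it rainy").images[0]

# image -> understanding: returns text only, no image
answer = pipe(image=generated, text="What is shown here?", understanding_output=True).text

# any per-mode default can be overridden per call via **infer_kwargs
fast = pipe(text="a bagel", num_timesteps=30, cfg_text_scale=6.0)

Because every mode routes through `infer_kwargs.setdefault`, explicit keyword arguments always take precedence over the per-mode defaults.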