jiuhai committed on
Commit 6858cdd · verified · 1 Parent(s): 6b139fc

Upload 59 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. blip3o/__init__.py +0 -0
  3. blip3o/__pycache__/__init__.cpython-310.pyc +0 -0
  4. blip3o/__pycache__/__init__.cpython-311.pyc +0 -0
  5. blip3o/__pycache__/constants.cpython-310.pyc +0 -0
  6. blip3o/__pycache__/constants.cpython-311.pyc +0 -0
  7. blip3o/__pycache__/utils.cpython-310.pyc +0 -0
  8. blip3o/__pycache__/utils.cpython-311.pyc +0 -0
  9. blip3o/constants.py +7 -0
  10. blip3o/conversation.py +296 -0
  11. blip3o/data/__init__.py +1 -0
  12. blip3o/data/__pycache__/__init__.cpython-310.pyc +0 -0
  13. blip3o/data/__pycache__/__init__.cpython-311.pyc +0 -0
  14. blip3o/data/__pycache__/dataset.cpython-310.pyc +0 -0
  15. blip3o/data/__pycache__/dataset.cpython-311.pyc +0 -0
  16. blip3o/data/dataset.py +371 -0
  17. blip3o/mm_utils.py +65 -0
  18. blip3o/model/__init__.py +3 -0
  19. blip3o/model/__pycache__/__init__.cpython-310.pyc +0 -0
  20. blip3o/model/__pycache__/__init__.cpython-311.pyc +0 -0
  21. blip3o/model/__pycache__/blip3o_arch.cpython-310.pyc +0 -0
  22. blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc +0 -0
  23. blip3o/model/__pycache__/llava_arch.cpython-310.pyc +0 -0
  24. blip3o/model/__pycache__/llava_arch.cpython-311.pyc +0 -0
  25. blip3o/model/blip3o_arch.py +400 -0
  26. blip3o/model/builder.py +44 -0
  27. blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-310.pyc +0 -0
  28. blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc +0 -0
  29. blip3o/model/language_model/__pycache__/blip3o_qwen_grpo.cpython-310.pyc +0 -0
  30. blip3o/model/language_model/__pycache__/blip3o_qwen_grpo.cpython-311.pyc +0 -0
  31. blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-310.pyc +0 -0
  32. blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc +0 -0
  33. blip3o/model/language_model/__pycache__/llava_qwen.cpython-310.pyc +0 -0
  34. blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc +0 -0
  35. blip3o/model/language_model/__pycache__/llava_qwen_grpo.cpython-310.pyc +0 -0
  36. blip3o/model/language_model/__pycache__/llava_qwen_grpo.cpython-311.pyc +0 -0
  37. blip3o/model/language_model/__pycache__/llava_qwen_inference.cpython-310.pyc +0 -0
  38. blip3o/model/language_model/__pycache__/llava_qwen_inference.cpython-311.pyc +0 -0
  39. blip3o/model/language_model/blip3o_qwen.py +215 -0
  40. blip3o/model/language_model/blip3o_qwen_grpo.py +255 -0
  41. blip3o/model/language_model/blip3o_qwen_inference.py +241 -0
  42. blip3o/model/multimodal_decoder/__pycache__/builder.cpython-310.pyc +0 -0
  43. blip3o/model/multimodal_decoder/__pycache__/builder.cpython-311.pyc +0 -0
  44. blip3o/model/multimodal_decoder/__pycache__/ta_tok_encoder.cpython-310.pyc +0 -0
  45. blip3o/model/multimodal_decoder/__pycache__/ta_tok_encoder.cpython-311.pyc +0 -0
  46. blip3o/model/multimodal_decoder/builder.py +14 -0
  47. blip3o/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  48. blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc +0 -0
  49. blip3o/model/multimodal_encoder/__pycache__/ta_tok_encoder.cpython-310.pyc +0 -0
  50. blip3o/model/multimodal_encoder/__pycache__/ta_tok_encoder.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+blip3o/train/__pycache__/grpo_trainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
blip3o/__init__.py ADDED
File without changes
blip3o/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (145 Bytes)
blip3o/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (156 Bytes)
blip3o/__pycache__/constants.cpython-310.pyc ADDED
Binary file (357 Bytes)
blip3o/__pycache__/constants.cpython-311.pyc ADDED
Binary file (389 Bytes)
blip3o/__pycache__/utils.cpython-310.pyc ADDED
Binary file (610 Bytes)
blip3o/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.02 kB)
blip3o/constants.py ADDED
@@ -0,0 +1,7 @@
+# Model Constants
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
blip3o/conversation.py ADDED
@@ -0,0 +1,296 @@
+import base64
+import dataclasses
+import re
+from enum import Enum, auto
+from io import BytesIO
+from typing import Any, Dict, List, Tuple, Union
+
+from PIL import Image
+
+
+class SeparatorStyle(Enum):
+    """Different separator style."""
+
+    SINGLE = auto()
+    TWO = auto()
+    PLAIN = auto()
+    CHATML = auto()
+    QWEN = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+
+    tokenizer_id: str = ""
+    tokenizer: Any = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str]] = None
+    # Stops generation if meeting any token in this list
+    stop_token_ids: List[int] = None
+
+    skip_next: bool = False
+
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0]
+            if "mmtag" in self.version:
+                init_msg = init_msg.replace("<image>", "").strip()
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            elif not init_msg.startswith("<image>"):
+                init_msg = init_msg.replace("<image>", "").strip()
+                messages[0] = (init_role, "<image>\n" + init_msg)
+            else:
+                messages[0] = (init_role, init_msg)
+
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if self.system == "" else self.system + self.sep + "\n"
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, images, _ = message
+                        message = "<image>" * len(images) + message
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+        return ret
+
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+
+    def process_image(self, image, image_process_mode, return_pil=False, image_format="PNG"):
+        if image_process_mode == "Pad":
+
+            def expand2square(pil_img, background_color=(122, 116, 104)):
+                width, height = pil_img.size
+                if width == height:
+                    return pil_img
+                elif width > height:
+                    result = Image.new(pil_img.mode, (width, width), background_color)
+                    result.paste(pil_img, (0, (width - height) // 2))
+                    return result
+                else:
+                    result = Image.new(pil_img.mode, (height, height), background_color)
+                    result.paste(pil_img, ((height - width) // 2, 0))
+                    return result
+
+            image = expand2square(image)
+        elif image_process_mode in ["Default", "Crop"]:
+            pass
+        elif image_process_mode == "Resize":
+            image = image.resize((336, 336))
+        else:
+            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+
+        if type(image) is not Image.Image:
+            image = Image.open(image).convert("RGB")
+
+        max_hw, min_hw = max(image.size), min(image.size)
+        aspect_ratio = max_hw / min_hw
+        max_len, min_len = 672, 448
+        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+        longest_edge = int(shortest_edge * aspect_ratio)
+        W, H = image.size
+        if H > W:
+            H, W = longest_edge, shortest_edge
+        else:
+            H, W = shortest_edge, longest_edge
+        image = image.resize((W, H))
+        if return_pil:
+            return image
+        else:
+            buffered = BytesIO()
+            image.save(buffered, format=image_format)
+            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+            return img_b64_str
+
+    def get_images(self, return_pil=False, return_path=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    if type(image) != list:
+                        image = [image]
+                    for img in image:
+                        if not return_path and self.is_image_file(img):
+                            img = self.process_image(img, image_process_mode, return_pil=return_pil)
+                        else:
+                            images.append(img)
+        return images
+
+    def is_image_file(self, filename):
+        image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"]
+        return any(filename.lower().endswith(ext) for ext in image_extensions)
+
+    def is_video_file(self, filename):
+        video_extensions = [".mp4", ".mov", ".avi", ".mkv", ".wmv", ".flv", ".mpeg", ".mpg"]
+        return any(filename.lower().endswith(ext) for ext in video_extensions)
+
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    if type(image) != list:
+                        image = [image]
+                    if len(image) == 1:
+                        msg = "<image>\n" + msg.replace("<image>", "").strip()
+                    else:
+                        msg = re.sub(r"(<image>)\n(?=<image>)", r"\1 ", msg)
+
+                    img_str_list = []
+                    for img in image:
+                        if self.is_image_file(img):
+                            img_b64_str = self.process_image(img, "Default", return_pil=False, image_format="JPEG")
+                            img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" style="max-width: 256px; max-height: 256px; width: auto; height: auto; object-fit: contain;"/>'
+                            img_str_list.append(img_str)
+                        elif self.is_video_file(img):
+                            ret.append(((img,), None))
+
+                    msg = msg.strip()
+                    img_place_holder = ""
+                    for img_str in img_str_list:
+                        img_place_holder += f"{img_str}\n\n"
+
+                    if len(img_str_list) > 0:
+                        msg = f"{img_place_holder}\n\n{msg}"
+
+                    if len(msg) > 0:
+                        ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def copy(self):
+        return Conversation(system=self.system, roles=self.roles, messages=[[x, y] for x, y in self.messages], offset=self.offset, sep_style=self.sep_style, sep=self.sep, sep2=self.sep2, version=self.version)
+
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+
+
+conv_vicuna_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=[
+        ["Human", "What are the key differences between renewable and non-renewable energy sources?"],
+        [
+            "Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
+        ],
+    ],
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+
+conv_qwen = Conversation(
+    system="""<|im_start|>system
+You are a helpful assistant.""",
+    roles=("<|im_start|>user", "<|im_start|>assistant"),
+    version="qwen",
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.CHATML,
+    sep="<|im_end|>",
+)
+
+
+default_conversation = conv_vicuna_v0
+conv_templates = {
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "qwen_1_5": conv_qwen,
+    "qwen_2": conv_qwen,
+}
+
+
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())
blip3o/data/__init__.py ADDED
@@ -0,0 +1 @@
+from .dataset import *
blip3o/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (174 Bytes)
blip3o/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes)
blip3o/data/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (11.4 kB)
blip3o/data/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (20.9 kB)
blip3o/data/dataset.py ADDED
@@ -0,0 +1,371 @@
+import copy
+import glob
+import io
+import json
+import math
+import os
+import random
+import re
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Sequence
+import pyarrow.parquet as pq
+import torch
+import transformers
+import yaml
+from PIL import Image, ImageFile
+from torch.utils.data import Dataset
+from torchvision.transforms import v2
+from torchvision import transforms
+from datasets import load_dataset, concatenate_datasets
+from blip3o.constants import (
+    DEFAULT_IM_END_TOKEN,
+    DEFAULT_IM_START_TOKEN,
+    DEFAULT_IMAGE_TOKEN,
+    IGNORE_INDEX,
+    IMAGE_TOKEN_INDEX,
+)
+from blip3o.utils import rank0_print
+
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+## target transform for sana
+target_transform = v2.Compose(
+    [
+        v2.Resize(1024),
+        v2.CenterCrop(1024),
+        v2.ToImage(),
+        v2.ToDtype(torch.float32, scale=True),
+        v2.Normalize([0.5], [0.5]),
+    ]
+)
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
+def preprocess_multimodal(sources: Sequence[str], data_args) -> Dict:
+    is_multimodal = data_args.is_multimodal
+    if not is_multimodal:
+        return sources
+
+    for source in sources:
+        for sentence in source:
+            replace_token = DEFAULT_IMAGE_TOKEN
+            # NOTE: only add im_start_end when image generation
+            if data_args.mm_use_im_start_end and sentence['from'] == 'gpt':
+                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
+
+            # For videoInstruct-100k noisy_data. TODO: Ask Yuanhan to clean the data instead of leaving the noise code here.
+            sentence["value"] = sentence["value"].replace("QA_GT_caption_based_noisy", "")
+
+    return sources
+
+
+def preprocess_qwen(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> Dict:
+    # roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
+    roles = {"human": "user", "gpt": "assistant"}
+
+    # tokenizer = copy.deepcopy(tokenizer)
+    # When there is actually an image, we add the image tokens as a special token
+    if 'image_token_index' not in globals():
+        tokenizer.add_tokens(["<image>"], special_tokens=True)
+        global image_token_index
+        image_token_index = tokenizer.convert_tokens_to_ids("<image>")
+    # if has_image:
+    #     tokenizer.add_tokens(["<image>"], special_tokens=True)
+
+    # image_token_index = tokenizer.convert_tokens_to_ids("<image>")
+    im_start, im_end = tokenizer.additional_special_tokens_ids[:2]
+    # unmask_tokens = ["<|im_start|>", "<|im_start|>", "\n"]
+    unmask_tokens_idx = [198, im_start, im_end]
+    # nl_tokens = tokenizer("\n").input_ids
+
+    # Reset Qwen chat templates so that it won't include system message every time we apply
+    chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+    tokenizer.chat_template = chat_template
+
+    # _system = tokenizer("system").input_ids + nl_tokens
+    # _user = tokenizer("user").input_ids + nl_tokens
+    # _assistant = tokenizer("assistant").input_ids + nl_tokens
+
+    # Apply prompt templates
+    input_ids, targets = [], []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != roles["human"]:
+            source = source[1:]
+
+        input_id, target = [], []
+
+        # New version, use apply chat template
+        # Build system message for each sentence
+        input_id += tokenizer.apply_chat_template([{"role" : "system", "content" : system_message}])
+
+        # target += [IGNORE_INDEX] * len(input_id)
+        target += input_id
+
+        for conv in source:
+            # Make sure blip3o data can load
+            try:
+                role = conv["role"]
+                content = conv["content"]
+            except:
+                role = conv["from"]
+                content = conv["value"]
+
+            role = roles.get(role, role)
+
+            conv = [{"role" : role, "content" : content}]
+            encode_id = tokenizer.apply_chat_template(conv)
+            input_id += encode_id
+            if role in ["user", "system"]:
+                # target += [IGNORE_INDEX] * len(encode_id)
+                target += encode_id
+            else:
+                target += encode_id
+
+        assert len(input_id) == len(target), f"{len(input_id)} != {len(target)}"
+        for idx, encode_id in enumerate(input_id):
+            if encode_id in unmask_tokens_idx:
+                target[idx] = encode_id
+            if encode_id == image_token_index:
+                input_id[idx] = IMAGE_TOKEN_INDEX
+        input_ids.append(input_id)
+        targets.append(target)
+    input_ids = torch.tensor(input_ids, dtype=torch.long)
+    targets = torch.tensor(targets, dtype=torch.long)
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+    )
+
+
+class LazySupervisedMixDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizer,
+        data_path: str,
+        data_args
+    ):
+        super(LazySupervisedMixDataset, self).__init__()
+
+        self.data_args = data_args
+        list_data_dict = []
+
+        data_files = glob.glob('/fsx/sfr/data/jiuhai/hub/datasets--BLIP3o--BLIP3o-60k/snapshots/f7316b0aa446338ee1707484924aa59457b4bbf3/*.tar')
+        data_files.sort()
+        train_dataset = load_dataset("webdataset", data_files=data_files, split="train", num_proc=1, cache_dir='/fsx/sfr/data/jiuhai/webdataset')
+        train_dataset = train_dataset.rename_column("jpg", "image")
+        train_dataset = train_dataset.add_column('type', len(train_dataset) * ['T2I'])
+        train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if not col in (
+            ["image", "txt", "type"])])
+        print(f"finish loading image {len(train_dataset)}")
+        list_data_dict.append(train_dataset)
+
+        if len(list_data_dict) > 1:
+            list_data_dict = concatenate_datasets(list_data_dict)
+        else:
+            list_data_dict = list_data_dict[0]
+        list_data_dict = list_data_dict.shuffle(seed=42)
+
+        rank0_print(f"Total number of training instances: {len(list_data_dict)}")
+        self.tokenizer = tokenizer
+        self.list_data_dict = list_data_dict
+        self.modality = torch.tensor(0)  # 0 is for und task, 1 is for gen task
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+    def process_image(self, image):
+        processor = self.data_args.image_processor
+        image_size = image.size
+        image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
+        return image, image_size, self.modality
+
+    def process_target_image(self, image):
+        image = target_transform(image)
+        return image
+
+    @property
+    def lengths(self):
+        length_list = []
+        for sample in self.list_data_dict:
+            img_tokens = 128 if "image" in sample else 0
+            length_list.append(sum(len(conv["value"].split()) for conv in sample["conversations"]) + img_tokens)
+        return length_list
+
+    @property
+    def modality_lengths(self):
+        length_list = []
+        for sample in self.list_data_dict:
+            cur_len = sum(len(conv["value"].split()) for conv in sample["conversations"])
+            cur_len = cur_len if "image" in sample else -cur_len
+            length_list.append(cur_len)
+        return length_list
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+
+        while True:
+            sources = self.list_data_dict[i]
+
+            if sources["type"] == "T2I":
+                sources["conversations"] = [
+                    {"from": "human", "value": f"Please generate image based on the following caption: {sources['txt']}"},
+                    {"from": "gpt", "value": "<image>"},
+                ]
+            elif sources["type"] == "I2I":
+                sources["conversations"] = [
+                    {
+                        "from": "human",
+                        "value": f"<image>\nPlease reconstruct the given image.",
+                    },
+                    {"from": "gpt", "value": ""},
+                ]
+            else:
+                raise ValueError("Unknown source type. Please check the 'type' in 'sources'.")
+
+            if "image" in sources:
+                if sources["type"] == "T2I" or sources["type"] == "I2I":
+                    image_files = self.list_data_dict[i]["image"]
+
+                    if not isinstance(image_files, list):
+                        image_files = [image_files]
+
+                    images = []
+
+                    for img in image_files:
+                        try:
+                            if sources["type"] == "T2I" or sources["type"] == "I2I":
+                                img = img.convert("RGB")
+                            else:
+                                raise ValueError("Unknown source type. Please check the 'type' in 'sources'.")
+                            images.append(img)
+                        except Exception as e:
+                            print(f"Error opening image {img}: {e}")
+                            images = None
+                            break  # Skip to the next image if there's an error
+
+                ## test if can apply img_process
+                if images is not None:
+                    try:
+                        process_images = [self.process_image(f) for f in images]
+                    except Exception as e:
+                        print(f"Error wrong number of channels: {e}")
+                        images = None
+
+                # If no valid images were found, randomly pick another item
+                if images is None:
+                    print(sources)
+                    print(f"warning false image!!!!!!")
+                    i = random.randint(0, len(self.list_data_dict) - 1)
+                    continue
+
+                sources = preprocess_multimodal(copy.deepcopy([sources["conversations"]]), self.data_args)
+            else:
+                sources = copy.deepcopy([sources["conversations"]])
+
+            data_dict = preprocess_qwen(sources, self.tokenizer, has_image=("image" in self.list_data_dict[i]))
+            if isinstance(i, int):
+                data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+
+            # image exist in the data
+            if "image" in self.list_data_dict[i]:
+                data_dict["image"] = process_images
+                data_dict["target_image"] = [self.process_target_image(f) for f in images]
+
+            data_dict["ids"] = self.list_data_dict[i]["id"] if "id" in self.list_data_dict[i] else "unk"
+            return data_dict
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """Collate examples for supervised fine-tuning."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def pad_sequence(self, input_ids, batch_first, padding_value):
+        if self.tokenizer.padding_side == "left":
+            input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
+        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
+        if self.tokenizer.padding_side == "left":
+            input_ids = torch.flip(input_ids, [1])
+        return input_ids
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
+        input_ids = [_input_ids[: self.tokenizer.model_max_length] for _input_ids in input_ids]
+        labels = [_labels[: self.tokenizer.model_max_length] for _labels in labels]
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = 0  # This gets the best result. Don't know why.
+        input_ids = self.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        labels = self.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
+        batch = dict(input_ids=input_ids, labels=labels.long() if labels.dtype == torch.int32 else labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id))
+        if "image" in instances[0]:
+            images = [instance["image"] for instance in instances]
+
+            batch["image_sizes"] = [im[1] for im_list in images for im in im_list]
+            batch["modalities"] = [im[2] for im_list in images for im in im_list]
+            images = [im[0] for im_list in images for im in im_list]
+
+            batch["images"] = images
+
+            target_images = [instance["target_image"][0] for instance in instances]
+            target_images = torch.stack(target_images, dim=0) if target_images else None
+            batch["target_images"] = target_images
+
+        if "prompt" in instances[0]:
+            batch["prompts"] = [instance["prompt"] for instance in instances]
+        return batch
+
+
+def get_dataset_cls(name):
+    if name == 'mix':
+        dataset_cls = LazySupervisedMixDataset
+    else:
+        raise ValueError(f'Unknown dataset class {name}')
+    return dataset_cls
+
+
+def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
+    """Make dataset and collator for supervised fine-tuning."""
+    dataset_cls = get_dataset_cls(data_args.dataset_cls)
+    train_dataset = dataset_cls(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args)
+    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
+    return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
blip3o/mm_utils.py ADDED
@@ -0,0 +1,65 @@
+import torch
+from transformers import StoppingCriteria
+
+from blip3o.constants import IMAGE_TOKEN_INDEX
+
+
+def process_images(images, image_processor, model_cfg):
+    return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
+
+
+def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+    input_ids = []
+    offset = 0
+    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+
+    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+        input_ids.extend(x[offset:])
+
+    if return_tensors is not None:
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return input_ids
+
+
+def get_model_name_from_path(model_path):
+    model_path = model_path.strip("/")
+    model_paths = model_path.split("/")
+    if model_paths[-1].startswith("checkpoint-"):
+        return model_paths[-2] + "_" + model_paths[-1]
+    else:
+        return model_paths[-1]
+
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+    def __init__(self, keywords, tokenizer, input_ids):
+        self.keywords = keywords
+        self.keyword_ids = []
+        for keyword in keywords:
+            cur_keyword_ids = tokenizer(keyword).input_ids
+            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+                cur_keyword_ids = cur_keyword_ids[1:]
+            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+        self.tokenizer = tokenizer
+        self.start_len = input_ids.shape[1]
+
+    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
+        offset = min(output_ids.shape[1] - self.start_len, 3)
+        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+        for keyword_id in self.keyword_ids:
+            # compare the trailing tokens element-wise; a bare tensor comparison would raise for multi-token keywords
+            if (output_ids[0, -keyword_id.shape[0] :] == keyword_id).all():
+                return True
+        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+        for keyword in self.keywords:
+            if keyword in outputs:
+                return True
+        return False
blip3o/model/__init__.py ADDED
@@ -0,0 +1,3 @@
+from blip3o.model.language_model.blip3o_qwen import blip3oQwenConfig, blip3oQwenForCausalLM
+from blip3o.model.language_model.blip3o_qwen_inference import blip3oQwenForInferenceLM
+from blip3o.model.language_model.blip3o_qwen_grpo import blip3oQwenForGRPOLM
blip3o/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (449 Bytes)
blip3o/model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (512 Bytes)
blip3o/model/__pycache__/blip3o_arch.cpython-310.pyc ADDED
Binary file (11.5 kB)
blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc ADDED
Binary file (26.1 kB)
blip3o/model/__pycache__/llava_arch.cpython-310.pyc ADDED
Binary file (11.5 kB)
blip3o/model/__pycache__/llava_arch.cpython-311.pyc ADDED
Binary file (26 kB)
blip3o/model/blip3o_arch.py ADDED
@@ -0,0 +1,400 @@
+import os
+import random
+from abc import ABC, abstractmethod
+
+import torch
+import torch.nn as nn
+
+from blip3o.constants import (
+    DEFAULT_IM_END_TOKEN,
+    DEFAULT_IM_START_TOKEN,
+    IGNORE_INDEX,
+    IMAGE_TOKEN_INDEX,
+)
+from blip3o.utils import rank0_print
+from .multimodal_encoder.builder import build_vision_tower
+from .multimodal_decoder.builder import build_sana, build_vae
+from diffusers.models.normalization import RMSNorm
+from diffusers import AutoencoderDC, FlowMatchEulerDiscreteScheduler, SanaTransformer2DModel
+import math
+
+
+class blip3oMetaModel:
+
+    def __init__(self, config):
+        super(blip3oMetaModel, self).__init__(config)
+
+        if hasattr(config, "mm_vision_tower"):
+            delay_load = getattr(config, "delay_load", False)
+            self.vision_tower = build_vision_tower(config, delay_load=delay_load)
+
+            self.sana = build_sana(config)
+            self.sana_vae = build_vae(config)
+            norm = RMSNorm(2304, eps=1e-5, elementwise_affine=True)
+
+            with torch.no_grad():
+                norm.weight.fill_(math.sqrt(5.5))
+            self.diffusion_connector = nn.Sequential(
+                nn.Linear(config.hidden_size, 2304),
+                nn.GELU(approximate="tanh"),
+                nn.Linear(2304, 2304),
+                norm,
+            )
+            self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(config.diffusion_name_or_path, subfolder="scheduler")
+
+            self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(config.diffusion_name_or_path, subfolder="scheduler")
+
+    def get_vision_tower(self):
+        vision_tower = getattr(self, "vision_tower", None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+
+    def get_sana(self):
+        sana = getattr(self, 'sana', None)
+        if type(sana) is list:
+            sana = sana[0]
+        if sana is not None:
+            sana.to(self.device)
+        return sana
+
+    def get_sana_vae(self):
+        sana_vae = getattr(self, 'sana_vae', None)
+        if type(sana_vae) is list:
+            sana_vae = sana_vae[0]
+        if sana_vae is not None:
+            sana_vae.to(self.device)
+        return sana_vae
+
+    def initialize_vision_modules(self, model_args, fsdp=None):
+        vision_tower = model_args.vision_tower
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        mm_patch_merge_type = model_args.mm_patch_merge_type
+
+        self.config.mm_vision_tower = vision_tower
+        self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "")
+
+        if self.get_vision_tower() is None:
+            vision_tower = build_vision_tower(model_args)
+
+            if fsdp is not None and len(fsdp) > 0:
+                self.vision_tower = [vision_tower]
+            else:
+                self.vision_tower = vision_tower
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                vision_tower = self.vision_tower[0]
+            else:
+                vision_tower = self.vision_tower
+            vision_tower.load_model()
+
+        if self.get_sana() is None:
+            sana = build_sana(model_args)
+            self.noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_args.diffusion_name_or_path, subfolder="scheduler")
+            self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_args.diffusion_name_or_path, subfolder="scheduler")
+
+            if fsdp is not None and len(fsdp) > 0:
+                self.sana = [sana]
+            else:
+                self.sana = sana
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                sana = self.sana[0]
+            else:
+                sana = self.sana
+
+        if self.get_sana_vae() is None:
+            sana_vae = build_vae(model_args)
+
+            if fsdp is not None and len(fsdp) > 0:
+                self.sana_vae = [sana_vae]
+            else:
+                self.sana_vae = sana_vae
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                sana_vae = self.sana_vae[0]
+            else:
+                sana_vae = self.sana_vae
+
+        if getattr(self, 'diffusion_connector', None) is None:
+            norm = RMSNorm(2304, eps=1e-5, elementwise_affine=True)
+            with torch.no_grad():
+                norm.weight.fill_(math.sqrt(5.5))
+            self.diffusion_connector = nn.Sequential(
+                nn.Linear(self.config.hidden_size, 2304),
+                nn.GELU(approximate="tanh"),
+                nn.Linear(2304, 2304),
+                norm,
+            )
+        else:
+            for p in self.diffusion_connector.parameters():
+                p.requires_grad = True
+
+        self.config.use_mm_proj = True
+        self.config.mm_hidden_size = vision_tower.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+        self.config.mm_patch_merge_type = mm_patch_merge_type
+
+
+class blip3oMetaForCausalLM(ABC):
+
+    @abstractmethod
+    def get_model(self):
+        pass
+
+    def get_vision_tower(self):
+        return self.get_model().get_vision_tower()
+
+    def encode_images(self, images, modalities, pool_scale=None):
+        image_features = self.get_model().get_vision_tower()(images, pool_scale=pool_scale)
+
+        assert 'tokens' in image_features
+        image_tokens = image_features['tokens']
+
+        # discrete features for gen related tasks
+        image_tokens = image_tokens + self.config.image_start_token_id
+        image_features = self.get_model().embed_tokens(image_tokens)
+
+        return {'image_features': image_features, 'image_tokens': image_tokens}
+
+    def prepare_inputs_labels_for_multimodal(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=None, image_sizes=None):
+        vision_tower = self.get_vision_tower()
+
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+
+        if not isinstance(modalities, list):
+            modalities = [modalities]
+
+        # random scale for training, but scale 1 for understanding evaluation
+        if self.training:
+            pool_scale = random.choice(vision_tower.pool_scales)
+        else:
+            pool_scale = 1
+
+        if type(images) is list or images.ndim == 5:
+            if type(images) is list:
+                images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
+
+            images_list = []
+            for image in images:
+                if image.ndim == 4:
+                    images_list.append(image)
+                else:
+                    images_list.append(image.unsqueeze(0))
+
+            concat_images = torch.cat([image for image in images_list], dim=0)
+            split_sizes = [image.shape[0] for image in images_list]
+            encoded_image_features = self.encode_images(concat_images, modalities, pool_scale=pool_scale)
+            image_tokens = encoded_image_features['image_tokens']
+            encoded_image_features = encoded_image_features['image_features']
+
+            # This is a list, each element is [num_images, patch * patch, dim]
+            encoded_image_features = torch.split(encoded_image_features, split_sizes)
+            if image_tokens is not None:
+                image_tokens = torch.split(image_tokens, split_sizes)
+            image_features = []
+            for idx, image_feat in enumerate(encoded_image_features):
+                image_features.append(image_feat)
+
+            mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+
+            if mm_patch_merge_type == "flat":
+                image_features = [x.flatten(0, 1) for x in image_features]
+                if image_tokens is not None:
+                    image_tokens = [x.flatten(0, 1) for x in image_tokens]
+            else:
+                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
+        else:
+            image_features = self.encode_images(images, modalities, pool_scale=pool_scale)
+            image_tokens = image_features['image_tokens']
+            image_features = image_features['image_features']
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- FIXME
+        _input_ids = input_ids
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        # rank_print("Inserting Images embedding")
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            # rank0_print(num_images)
+            if num_images == 0:
+                # cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
+                # cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_input_embeds_1[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    try:
+                        cur_image_features = image_features[cur_image_idx]
+                    except IndexError:
+                        rank0_print("Error image_features[cur_image_idx]!")
+                        break
+                    # [Assistant\n<start_image><image><end_image>]
+                    if self.config.image_start_tag_id == cur_labels_noim[i][-1] and image_tokens is not None:
+                        cur_image_tokens = image_tokens[cur_image_idx]
+                        if pool_scale is not None:
+                            pool_token = self.config.scale_start_token_id + pool_scale - 1
+                            pool_token = torch.tensor([pool_token], dtype=torch.long, device=cur_image_tokens.device)
+                            cur_image_tokens = torch.cat([pool_token, cur_image_tokens])
+                            pool_embed = self.get_model().embed_tokens(pool_token)
+                            cur_image_features = torch.cat([pool_embed, cur_image_features])
+                    else:
+                        cur_image_tokens = torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype)
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(cur_image_tokens)
+            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+
+        new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
+        new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+        if getattr(self.config, "use_pos_skipping", False) and self.training:
+            position_ids = torch.arange(new_input_embeds.size(1), device=new_input_embeds.device).unsqueeze(0).to(new_input_embeds.device)
+            split_position = random.randint(0, new_input_embeds.size(1))
+            left_add = random.randint(0, self.config.pos_skipping_range)
+            right_add = random.randint(left_add, self.config.pos_skipping_range)
+            position_ids[:, :split_position] += left_add
+            position_ids[:, split_position:] += right_add
+
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
+
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        total_num_new_tokens = 0
+        vocab_size = len(tokenizer)
+        if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+            self.config.image_start_tag_id = tokenizer.convert_tokens_to_ids(DEFAULT_IM_START_TOKEN)
+            self.config.image_end_tag_id = tokenizer.convert_tokens_to_ids(DEFAULT_IM_END_TOKEN)
+            total_num_new_tokens += num_new_tokens
+            self.resize_token_embeddings(vocab_size + total_num_new_tokens)
+
+        if model_args.num_scale_tokens > 0:
+            scale_tokens = [model_args.scale_token_format.format(str(i)) for i in range(model_args.num_scale_tokens)]
+            num_new_tokens = tokenizer.add_tokens(scale_tokens, special_tokens=False)
+            self.config.scale_start_token_id = tokenizer.convert_tokens_to_ids(scale_tokens[0])
+            self.config.scale_end_token_id = tokenizer.convert_tokens_to_ids(scale_tokens[-1])
+            self.config.num_scale_tokens = model_args.num_scale_tokens
+            total_num_new_tokens += num_new_tokens
+            self.resize_token_embeddings(vocab_size + total_num_new_tokens)
+
+        if model_args.num_image_tokens > 0:
+            image_tokens = [model_args.image_token_format.format(str(i)) for i in range(model_args.num_image_tokens)]
+            num_new_tokens = tokenizer.add_tokens(image_tokens, special_tokens=False)
+            self.config.image_start_token_id = tokenizer.convert_tokens_to_ids(image_tokens[0])
+            self.config.image_end_token_id = tokenizer.convert_tokens_to_ids(image_tokens[-1])
+            self.config.num_image_tokens = model_args.num_image_tokens
+
+            total_num_new_tokens += num_new_tokens
+            self.resize_token_embeddings(vocab_size + total_num_new_tokens)
+            if num_new_tokens > 0:
+                self.config.num_new_tokens = num_new_tokens
+                input_embeddings = self.get_input_embeddings().weight.data
+                output_embeddings = self.get_output_embeddings().weight.data
+
+                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+                input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+                vision_tower = self.get_vision_tower()
+                if model_args.load_embeddings_from_vision and vision_tower is not None:
+                    vision_embeddings = vision_tower.get_embedding()
+                    if model_args.num_image_tokens == vision_embeddings.shape[0] and input_embeddings.shape[1] == vision_embeddings.shape[1]:
+                        rank0_print("Load vision embeddings from vision tower.")
+                        input_embeddings[self.config.image_start_token_id:self.config.image_end_token_id+1] = vision_embeddings
blip3o/model/builder.py ADDED
@@ -0,0 +1,44 @@
+import torch
+from transformers import AutoTokenizer
+
+from blip3o.model import blip3oQwenForCausalLM
+from blip3o.utils import rank0_print
+
+
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", torch_dtype="float16", attn_implementation="flash_attention_2", customized_config=None, overwrite_config=None, **kwargs):
+    kwargs["device_map"] = device_map
+    kwargs.pop("multimodal")
+
+    if customized_config is not None:
+        kwargs["config"] = customized_config
+
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    from blip3o.model.language_model.blip3o_qwen import blip3oQwenConfig
+
+    if overwrite_config is not None:
+        blip3o_cfg = blip3oQwenConfig.from_pretrained(model_path)
+        rank0_print(f"Overwriting config with {overwrite_config}")
+        for k, v in overwrite_config.items():
+            setattr(blip3o_cfg, k, v)
+        model = blip3oQwenForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, config=blip3o_cfg, **kwargs)
+    else:
+        model = blip3oQwenForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, attn_implementation=attn_implementation, **kwargs)
+
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model(device_map=device_map)
+    if device_map != "auto":
+        vision_tower.to(device="cuda", dtype=torch.float16)
+    image_processor = vision_tower.image_processor
+
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    elif hasattr(model.config, "max_position_embeddings"):
+        context_len = model.config.max_position_embeddings
+    elif hasattr(model.config, "tokenizer_model_max_length"):
+        context_len = model.config.tokenizer_model_max_length
+    else:
+        context_len = 2048
+
+    return tokenizer, model, image_processor, context_len
blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-310.pyc ADDED
Binary file (6.85 kB)
blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc ADDED
Binary file (13.3 kB)
blip3o/model/language_model/__pycache__/blip3o_qwen_grpo.cpython-310.pyc ADDED
Binary file (7.82 kB)
blip3o/model/language_model/__pycache__/blip3o_qwen_grpo.cpython-311.pyc ADDED
Binary file (15.3 kB)
blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-310.pyc ADDED
Binary file (7.12 kB)
blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc ADDED
Binary file (13.9 kB)
blip3o/model/language_model/__pycache__/llava_qwen.cpython-310.pyc ADDED
Binary file (6.82 kB)
blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc ADDED
Binary file (13.2 kB)
blip3o/model/language_model/__pycache__/llava_qwen_grpo.cpython-310.pyc ADDED
Binary file (7.79 kB)
blip3o/model/language_model/__pycache__/llava_qwen_grpo.cpython-311.pyc ADDED
Binary file (15.2 kB)
blip3o/model/language_model/__pycache__/llava_qwen_inference.cpython-310.pyc ADDED
Binary file (7.09 kB)
blip3o/model/language_model/__pycache__/llava_qwen_inference.cpython-311.pyc ADDED
Binary file (13.9 kB)
blip3o/model/language_model/blip3o_qwen.py ADDED
@@ -0,0 +1,215 @@
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     Qwen3Config,
+     Qwen3ForCausalLM,
+     Qwen3Model,
+ )
+ from transformers.generation.utils import GenerateOutput
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from blip3o.model.blip3o_arch import blip3oMetaForCausalLM, blip3oMetaModel
+ from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
+ from blip3o.utils import rank0_print
+
+
+ class blip3oQwenConfig(Qwen3Config):
+     model_type = "blip3o_qwen"
+
+
+ class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config: Qwen3Config):
+         super(blip3oQwenModel, self).__init__(config)
+
+
+ class blip3oQwenForCausalLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config):
+         Qwen3ForCausalLM.__init__(self, config)
+         config.model_type = "blip3o_qwen"
+         config.rope_scaling = None
+
+         self.model = blip3oQwenModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_model(self):
+         return self.model
+
+     def get_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
+         sigmas = self.model.noise_scheduler.sigmas.to(device=device, dtype=dtype)
+         schedule_timesteps = self.model.noise_scheduler.timesteps.to(device)
+         timesteps = timesteps.to(device)
+         step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+         sigma = sigmas[step_indices].flatten()
+         while len(sigma.shape) < n_dim:
+             sigma = sigma.unsqueeze(-1)
+         return sigma
+
+     def mask_drop(self, latents, drop_prob=0.1):
+         # Randomly zero out conditioning for a fraction of the batch (classifier-free guidance training).
+         if drop_prob <= 0:
+             return latents
+         mask = torch.bernoulli(torch.zeros(latents.shape[0], device=latents.device, dtype=latents.dtype) + drop_prob)
+         while len(mask.shape) < len(latents.shape):
+             mask = mask.unsqueeze(-1)
+         mask = 1 - mask  # need to flip 0 <-> 1
+         return latents * mask
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         images: Optional[torch.FloatTensor] = None,
+         target_images: Optional[torch.FloatTensor] = None,
+         image_sizes: Optional[List[List[int]]] = None,
+         return_dict: Optional[bool] = None,
+         modalities: Optional[List[str]] = ["image"],
+         dpo_forward: Optional[bool] = False,
+         cache_position=None,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         if inputs_embeds is None:
+             (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = self.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
+
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         logits = self.lm_head(hidden_states)
+
+         loss = None
+         if labels is not None:
+             # Standard next-token cross-entropy over the shifted sequence.
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss_fct = torch.nn.CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+         if target_images is not None:
+             # Flow-matching objective on the SANA VAE latents of the target image.
+             vae = self.model.get_sana_vae()
+             latents = vae.encode(target_images).latent
+             if "shift_factor" in vae.config and vae.config.shift_factor is not None:
+                 latents = latents - vae.config.shift_factor
+             latents = latents * vae.config.scaling_factor
+             noise = torch.randn_like(latents, device=latents.device)
+             weighting_scheme = "uniform"
+             u = compute_density_for_timestep_sampling(
+                 weighting_scheme=weighting_scheme,
+                 batch_size=latents.shape[0],
+                 logit_mean=0.0,
+                 logit_std=1.0,
+                 mode_scale=1.29,
+             )
+             indices = (u * self.model.noise_scheduler.config.num_train_timesteps).long()
+             timesteps = self.model.noise_scheduler.timesteps[indices].to(device=latents.device)
+             sigmas = self.get_sigmas(timesteps, latents.device, n_dim=latents.ndim, dtype=latents.dtype)
+             noisy_latents = (1.0 - sigmas) * latents + sigmas * noise
+
+             sana = self.model.get_sana()
+
+             # Hidden states between the image start/end tags condition the diffusion decoder.
+             start_pos = (labels == self.config.image_start_tag_id).float().argmax(dim=1)
+             end_pos = (labels == self.config.image_end_tag_id).float().argmax(dim=1)
+
+             selected_hidden_states = []
+             for b in range(hidden_states.size(0)):
+                 start = start_pos[b].item() + 1
+                 end = end_pos[b].item()
+                 hidden_states_filter = hidden_states[b, start:end, :]
+                 if hidden_states_filter.size(0) != 730:
+                     # Fall back to the last 730 positions if the tagged span has an unexpected length.
+                     hidden_states_filter = hidden_states[b, -730:, :]
+                 selected_hidden_states.append(hidden_states_filter)
+
+             selected_hidden_states = torch.stack(selected_hidden_states, dim=0)
+             diffusion_pred = sana(
+                 hidden_states=noisy_latents,
+                 timestep=timesteps,
+                 encoder_hidden_states=self.model.diffusion_connector(self.mask_drop(selected_hidden_states)),
+                 encoder_attention_mask=None,
+             ).sample
+
+             target = noise - latents
+             weighting = compute_loss_weighting_for_sd3(weighting_scheme=weighting_scheme, sigmas=sigmas)
+             diff_loss = torch.mean(
+                 (weighting.float() * (diffusion_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                 1,
+             )
+             diff_loss = diff_loss.mean()
+             rank0_print(f" Cross-entropy loss {loss}, Diffusion loss {diff_loss} ")
+             loss = diff_loss if loss is None else loss + diff_loss
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         modalities: Optional[List[str]] = ["image"],
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+
+         if images is not None:
+             (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes)
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+         return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
+
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+         images = kwargs.pop("images", None)
+         image_sizes = kwargs.pop("image_sizes", None)
+         inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs)
+         if images is not None:
+             inputs["images"] = images
+         if image_sizes is not None:
+             inputs["image_sizes"] = image_sizes
+         return inputs
+
+
+ AutoConfig.register("blip3o_qwen", blip3oQwenConfig)
+ AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForCausalLM)
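
The two `register` calls above hook the custom config and model classes into the `transformers` Auto classes, so a checkpoint whose `config.json` carries `model_type: "blip3o_qwen"` can be loaded through the standard API. A hedged sketch of what that enables; the checkpoint path is a placeholder:

from transformers import AutoModelForCausalLM
import blip3o.model.language_model.blip3o_qwen  # noqa: F401  (import side effect: runs the Auto* registration)

# Dispatches to blip3oQwenForCausalLM because of the registered model_type.
model = AutoModelForCausalLM.from_pretrained("path/to/blip3o-checkpoint")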
blip3o/model/language_model/blip3o_qwen_grpo.py ADDED
@@ -0,0 +1,255 @@
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     Qwen3Config,
+     Qwen3ForCausalLM,
+     Qwen3Model,
+ )
+ from transformers.generation.utils import GenerateOutput
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from blip3o.model.blip3o_arch import blip3oMetaForCausalLM, blip3oMetaModel
+ from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.schedulers import DDPMScheduler, DDIMScheduler, LCMScheduler, FlowMatchEulerDiscreteScheduler, DPMSolverMultistepScheduler
+ import numpy as np
+ from tqdm import tqdm
+ import PIL
+ from blip3o.utils import rank0_print
+
+
+ def numpy_to_pil(images: np.ndarray):
+     """
+     Convert a NumPy array of shape (batch, height, width, channels) to a list of PIL Images.
+     """
+     pil_images = []
+     for img in images:
+         img_uint8 = (img * 255).round().astype("uint8")
+         if img_uint8.shape[2] == 1:
+             img_uint8 = img_uint8[..., 0]
+         pil_images.append(PIL.Image.fromarray(img_uint8))
+     return pil_images
+
+
+ class blip3oQwenConfig(Qwen3Config):
+     model_type = "blip3o_qwen_grpo"
+
+
+ class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config: Qwen3Config):
+         super(blip3oQwenModel, self).__init__(config)
+
+
+ class blip3oQwenForGRPOLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config):
+         Qwen3ForCausalLM.__init__(self, config)
+         config.model_type = "blip3o_qwen"
+         config.rope_scaling = None
+
+         self.model = blip3oQwenModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_model(self):
+         return self.model
+
+     def get_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
+         sigmas = self.model.noise_scheduler.sigmas.to(device=device, dtype=dtype)
+         schedule_timesteps = self.model.noise_scheduler.timesteps.to(device)
+         timesteps = timesteps.to(device)
+         step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+         sigma = sigmas[step_indices].flatten()
+         while len(sigma.shape) < n_dim:
+             sigma = sigma.unsqueeze(-1)
+         return sigma
+
+     def mask_drop(self, latents, drop_prob=0.1):
+         if drop_prob <= 0:
+             return latents
+         mask = torch.bernoulli(torch.zeros(latents.shape[0], device=latents.device, dtype=latents.dtype) + drop_prob)
+         while len(mask.shape) < len(latents.shape):
+             mask = mask.unsqueeze(-1)
+         mask = 1 - mask  # need to flip 0 <-> 1
+         return latents * mask
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         modalities: Optional[List[str]] = ["image"],
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+
+         if images is not None:
+             (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes)
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+         return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
+
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+         images = kwargs.pop("images", None)
+         image_sizes = kwargs.pop("image_sizes", None)
+         inputs = super().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs)
+         if images is not None:
+             inputs["images"] = images
+         if image_sizes is not None:
+             inputs["image_sizes"] = image_sizes
+         return inputs
+
+     @torch.no_grad()
+     def decode_latents(self, latents, normalize=True, return_tensor=False):
+         if self.model.sana_vae is not None:
+             latents = latents / self.model.sana_vae.config.scaling_factor
+             if "shift_factor" in self.model.sana_vae.config and self.model.sana_vae.config.shift_factor is not None:
+                 latents = latents + self.model.sana_vae.config.shift_factor
+             samples = self.model.sana_vae.decode(latents).sample
+         else:
+             samples = latents
+         if normalize:
+             samples = (samples / 2 + 0.5).clamp(0, 1)
+         else:
+             samples = samples.clamp(-1, 1)
+         if return_tensor:
+             return samples
+         samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
+         samples = numpy_to_pil(samples)
+         return samples
+
+     @torch.no_grad()
+     def generate_images(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         max_new_tokens: Optional[torch.Tensor] = None,
+         temperature: Optional[torch.Tensor] = None,
+         top_p: Optional[torch.Tensor] = None,
+         top_k: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         modalities: Optional[List[str]] = ["image"],
+         guidance_scale: float = 2.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         num_inference_steps: int = 30,
+         num_images_per_prompt: int = 1,
+         return_tensor=False,
+         enable_progress_bar=False,
+         **kwargs,
+     ):
+         position_ids = kwargs.pop("position_ids", None)
+
+         # Sample rollouts at a fixed temperature of 1.0 for GRPO exploration.
+         gen_ids = super(blip3oQwenForGRPOLM, self).generate(
+             input_ids,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=1.0,
+             attention_mask=attention_mask,
+         )
+
+         with torch.no_grad():
+             outs = self.model(
+                 input_ids=gen_ids,
+                 output_hidden_states=True,
+                 return_dict=True,
+             )
+         hidden_states = outs.hidden_states[-1]
+
+         # Hidden states after the image start tag condition the diffusion decoder.
+         start_pos = (gen_ids == self.config.image_start_tag_id).float().argmax(dim=1)
+         end_pos = (gen_ids == self.config.image_end_tag_id).float().argmax(dim=1)
+
+         selected_hidden_states = []
+         for b in range(hidden_states.size(0)):
+             start = start_pos[b].item() + 1
+             selected_hidden_states.append(hidden_states[b, start:, :])
+         pred_latent = torch.stack(selected_hidden_states, dim=0)
+
+         # Prepend a zeroed copy of the conditioning for the unconditional branch of classifier-free guidance.
+         img_hidden_states_null = torch.zeros_like(pred_latent)
+         pred_latent = torch.cat([img_hidden_states_null, pred_latent], 0)
+
+         ## sample images from here
+         device = next(self.parameters()).device
+
+         bsz = len(pred_latent) // 2
+         # latent_size = self.config.input_size
+         latent_size = 32
+         latent_channels = self.model.sana.config.in_channels
+
+         latents = randn_tensor(
+             shape=(bsz * num_images_per_prompt, latent_channels, latent_size, latent_size),
+             generator=None,
+             device=device,
+             dtype=torch.bfloat16,
+         )
+
+         # set step values
+         if isinstance(self.model.noise_scheduler, FlowMatchEulerDiscreteScheduler):
+             sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+             self.model.noise_scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)
+         else:
+             self.model.noise_scheduler.set_timesteps(num_inference_steps)
+
+         for t in tqdm(self.model.noise_scheduler.timesteps, desc="Sampling images", disable=not enable_progress_bar):
+             latent_model_input = torch.cat([latents] * 2)
+             latent_model_input = latent_model_input.to(pred_latent.dtype)
+
+             if hasattr(self.model.noise_scheduler, "scale_model_input"):
+                 latent_model_input = self.model.noise_scheduler.scale_model_input(latent_model_input, t)
+             # predict noise model_output
+             noise_pred = self.model.sana(
+                 hidden_states=latent_model_input,
+                 encoder_hidden_states=self.model.diffusion_connector(pred_latent),
+                 timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latents.device),
+                 encoder_attention_mask=None,
+             ).sample
+
+             noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+             # compute previous image: x_t -> x_t-1
+             latents = self.model.noise_scheduler.step(noise_pred, t, latents).prev_sample
+
+         samples = self.decode_latents(latents.to(self.model.sana_vae.dtype) if self.model.sana_vae is not None else latents, return_tensor=return_tensor)
+
+         return gen_ids, samples
+
+
+ AutoConfig.register("blip3o_qwen_grpo", blip3oQwenConfig)
+ AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForGRPOLM)
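
The sampling loop in `generate_images` duplicates the latents so a single SANA call produces both the unconditional branch (zeroed conditioning) and the conditional branch, then recombines them with standard classifier-free guidance. A minimal standalone sketch of that recombination step (function and tensor names here are illustrative, not part of the repository):

import torch

def apply_cfg(noise_pred: torch.Tensor, guidance_scale: float) -> torch.Tensor:
    # noise_pred stacks [unconditional, conditional] predictions along the batch axis,
    # mirroring torch.cat([latents] * 2) in generate_images above.
    uncond, cond = noise_pred.chunk(2)
    return uncond + guidance_scale * (cond - uncond)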
blip3o/model/language_model/blip3o_qwen_inference.py ADDED
@@ -0,0 +1,241 @@
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     Qwen3Config,
+     Qwen3ForCausalLM,
+     Qwen3Model,
+ )
+ from transformers.generation.utils import GenerateOutput
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from blip3o.model.blip3o_arch import blip3oMetaForCausalLM, blip3oMetaModel
+ from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.schedulers import DDPMScheduler, DDIMScheduler, LCMScheduler, FlowMatchEulerDiscreteScheduler, DPMSolverMultistepScheduler
+ import numpy as np
+ from tqdm import tqdm
+ import PIL
+
+
+ def numpy_to_pil(images: np.ndarray):
+     """
+     Convert a NumPy array of shape (batch, height, width, channels) to a list of PIL Images.
+     """
+     pil_images = []
+     for img in images:
+         img_uint8 = (img * 255).round().astype("uint8")
+         if img_uint8.shape[2] == 1:
+             img_uint8 = img_uint8[..., 0]
+         pil_images.append(PIL.Image.fromarray(img_uint8))
+     return pil_images
+
+
+ class blip3oQwenConfig(Qwen3Config):
+     model_type = "blip3o_qwen_inference"
+
+
+ class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config: Qwen3Config):
+         super(blip3oQwenModel, self).__init__(config)
+
+
+ class blip3oQwenForInferenceLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config):
+         Qwen3ForCausalLM.__init__(self, config)
+         config.model_type = "blip3o_qwen"
+         config.rope_scaling = None
+
+         self.model = blip3oQwenModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_model(self):
+         return self.model
+
+     def get_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
+         sigmas = self.model.noise_scheduler.sigmas.to(device=device, dtype=dtype)
+         schedule_timesteps = self.model.noise_scheduler.timesteps.to(device)
+         timesteps = timesteps.to(device)
+         step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+         sigma = sigmas[step_indices].flatten()
+         while len(sigma.shape) < n_dim:
+             sigma = sigma.unsqueeze(-1)
+         return sigma
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         modalities: Optional[List[str]] = ["image"],
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+
+         if images is not None:
+             (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(inputs, position_ids, attention_mask, None, None, images, modalities, image_sizes=image_sizes)
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+         return super().generate(position_ids=position_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs)
+
+     @torch.no_grad()
+     def decode_latents(self, latents, normalize=True, return_tensor=False):
+         if self.model.sana_vae is not None:
+             latents = latents / self.model.sana_vae.config.scaling_factor
+             if "shift_factor" in self.model.sana_vae.config and self.model.sana_vae.config.shift_factor is not None:
+                 latents = latents + self.model.sana_vae.config.shift_factor
+             samples = self.model.sana_vae.decode(latents).sample
+         else:
+             samples = latents
+         if normalize:
+             samples = (samples / 2 + 0.5).clamp(0, 1)
+         else:
+             samples = samples.clamp(-1, 1)
+         if return_tensor:
+             return samples
+         samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
+         samples = numpy_to_pil(samples)
+         return samples
+
+     @torch.no_grad()
+     def generate_images(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         max_new_tokens: Optional[torch.Tensor] = None,
+         temperature: Optional[torch.Tensor] = None,
+         top_p: Optional[torch.Tensor] = None,
+         top_k: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         modalities: Optional[List[str]] = ["image"],
+         guidance_scale: float = 2.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         num_inference_steps: int = 30,
+         num_images_per_prompt: int = 1,
+         return_tensor=False,
+         enable_progress_bar=False,
+         **kwargs,
+     ):
+         position_ids = kwargs.pop("position_ids", None)
+
+         gen_ids = super(blip3oQwenForInferenceLM, self).generate(
+             inputs,
+             max_new_tokens=max_new_tokens,
+             do_sample=True,
+             temperature=temperature,
+             attention_mask=attention_mask,
+             top_p=top_p,
+             top_k=top_k,
+         )
+
+         with torch.no_grad():
+             outs = self.model(
+                 input_ids=gen_ids,
+                 output_hidden_states=True,
+                 return_dict=True,
+             )
+         hidden_states = outs.hidden_states[-1]
+
+         # Hidden states after the image start tag condition the diffusion decoder.
+         start_pos = (gen_ids == self.config.image_start_tag_id).float().argmax(dim=1)
+         end_pos = (gen_ids == self.config.image_end_tag_id).float().argmax(dim=1)
+
+         selected_hidden_states = []
+         for b in range(hidden_states.size(0)):
+             start = start_pos[b].item() + 1
+             selected_hidden_states.append(hidden_states[b, start:, :])
+         pred_latent = torch.stack(selected_hidden_states, dim=0)
+
+         # Prepend a zeroed copy of the conditioning for the unconditional branch of classifier-free guidance.
+         img_hidden_states_null = torch.zeros_like(pred_latent)
+         pred_latent = torch.cat([img_hidden_states_null, pred_latent], 0)
+
+         ## sample images from here
+         device = next(self.parameters()).device
+
+         bsz = len(pred_latent) // 2
+         # latent_size = self.config.input_size
+         latent_size = 32
+         latent_channels = self.model.sana.config.in_channels
+
+         latents = randn_tensor(
+             shape=(bsz * num_images_per_prompt, latent_channels, latent_size, latent_size),
+             generator=None,
+             device=device,
+             dtype=torch.bfloat16,
+         )
+
+         # set step values
+         if isinstance(self.model.noise_scheduler, FlowMatchEulerDiscreteScheduler):
+             sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+             self.model.noise_scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)
+         else:
+             self.model.noise_scheduler.set_timesteps(num_inference_steps)
+
+         for t in tqdm(self.model.noise_scheduler.timesteps, desc="Sampling images", disable=not enable_progress_bar):
+             latent_model_input = torch.cat([latents] * 2)
+             latent_model_input = latent_model_input.to(pred_latent.dtype)
+
+             if hasattr(self.model.noise_scheduler, "scale_model_input"):
+                 latent_model_input = self.model.noise_scheduler.scale_model_input(latent_model_input, t)
+             # predict noise model_output
+             noise_pred = self.model.sana(
+                 hidden_states=latent_model_input,
+                 encoder_hidden_states=self.model.diffusion_connector(pred_latent),
+                 timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latents.device),
+                 encoder_attention_mask=None,
+             ).sample
+
+             noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+             # compute previous image: x_t -> x_t-1
+             latents = self.model.noise_scheduler.step(noise_pred, t, latents).prev_sample
+
+         samples = self.decode_latents(latents.to(self.model.sana_vae.dtype) if self.model.sana_vae is not None else latents, return_tensor=return_tensor)
+
+         return gen_ids, samples
+
+
+ AutoConfig.register("blip3o_qwen_inference", blip3oQwenConfig)
+ AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForInferenceLM)
+
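
A hedged end-to-end sketch of driving `blip3oQwenForInferenceLM.generate_images`. The checkpoint path, prompt text, and sampling values are placeholders; the real prompt format (including any image-generation tags expected by the tokenizer) follows the repository's own conversation templates, which are not reproduced here.

import torch
from transformers import AutoTokenizer
from blip3o.model.language_model.blip3o_qwen_inference import blip3oQwenForInferenceLM

model_path = "path/to/blip3o-checkpoint"   # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = blip3oQwenForInferenceLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda().eval()

prompt = "A photo of a corgi surfing a wave"   # placeholder prompt text
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()

gen_ids, images = model.generate_images(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    temperature=1.0,
    top_p=0.95,
    top_k=50,
    guidance_scale=2.0,
    num_inference_steps=30,
)
images[0].save("sample.png")   # `images` is a list of PIL images from decode_latents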
blip3o/model/multimodal_decoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (661 Bytes). View file
 
blip3o/model/multimodal_decoder/__pycache__/builder.cpython-311.pyc ADDED
Binary file (954 Bytes). View file
 
blip3o/model/multimodal_decoder/__pycache__/ta_tok_encoder.cpython-310.pyc ADDED
Binary file (3.71 kB). View file
 
blip3o/model/multimodal_decoder/__pycache__/ta_tok_encoder.cpython-311.pyc ADDED
Binary file (6.73 kB). View file
 
blip3o/model/multimodal_decoder/builder.py ADDED
@@ -0,0 +1,14 @@
+ from diffusers import AutoencoderDC, SanaTransformer2DModel
+ import torch
+
+
+ def build_sana(vision_tower_cfg, **kwargs):
+     sana = SanaTransformer2DModel.from_pretrained(vision_tower_cfg.diffusion_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16)
+     return sana
+
+
+ def build_vae(vision_tower_cfg, **kwargs):
+     vae = AutoencoderDC.from_pretrained(vision_tower_cfg.diffusion_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16)
+     return vae
+
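
A small illustrative sketch of how these builders are called. `SimpleNamespace` stands in for the real model config object, and the path is a placeholder for a SANA diffusers checkpoint that exposes `transformer/` and `vae/` subfolders:

from types import SimpleNamespace
from blip3o.model.multimodal_decoder.builder import build_sana, build_vae

cfg = SimpleNamespace(diffusion_name_or_path="path/to/sana-diffusers-checkpoint")  # placeholder
sana = build_sana(cfg)   # SanaTransformer2DModel used as the latent image decoder
vae = build_vae(cfg)     # AutoencoderDC used to encode/decode image latents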
blip3o/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (472 Bytes). View file
 
blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc ADDED
Binary file (639 Bytes). View file
 
blip3o/model/multimodal_encoder/__pycache__/ta_tok_encoder.cpython-310.pyc ADDED
Binary file (3.72 kB). View file
 
blip3o/model/multimodal_encoder/__pycache__/ta_tok_encoder.cpython-311.pyc ADDED
Binary file (6.74 kB). View file