1f commited on
Commit
313b9b3
·
verified ·
1 Parent(s): e2cfb48

Add files using upload-large-folder tool

Browse files
Files changed (20) hide show
  1. r1-a/response_generation/minicpm/MiniCPM-o/assets/modelscope_logo.png +0 -0
  2. r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/__init__.py +1 -0
  3. r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/omnilmm.py +457 -0
  4. r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/resampler.py +171 -0
  5. r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/utils.py +555 -0
  6. r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/train/train_utils.py +153 -0
  7. r1-a/response_generation/minicpm/MiniCPM-o/quantize/bnb_quantize.py +81 -0
  8. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/chatbot_web_demo_o2.6.py +552 -0
  9. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/model_server.py +936 -0
  10. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/vad_utils.py +301 -0
  11. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.env.development +0 -0
  12. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.env.production +0 -0
  13. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.eslintrc-auto-import.json +359 -0
  14. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.eslintrc.cjs +26 -0
  15. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo.py +264 -0
  16. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_2.5.py +256 -0
  17. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_2.6.py +557 -0
  18. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit-2_5.py +109 -0
  19. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit-minicpmv2_6.py +271 -0
  20. r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit.py +99 -0
r1-a/response_generation/minicpm/MiniCPM-o/assets/modelscope_logo.png ADDED
r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .omnilmm import OmniLMMForCausalLM
r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/omnilmm.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gc
3
+ import math
4
+ import timm
5
+ import torch
6
+ from torch import Tensor
7
+ import torch.nn as nn
8
+ from torch.nn import CrossEntropyLoss
9
+ from typing import List, Optional, Tuple, Union
10
+
11
+ from transformers import AutoConfig, AutoModelForCausalLM
12
+ from transformers import MistralForCausalLM, MistralModel, MistralConfig
13
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
14
+
15
+ from omnilmm.model.utils import build_transform
16
+ from omnilmm.model.resampler import Resampler
17
+
18
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
19
+ DEFAULT_IM_START_TOKEN = "<im_start>"
20
+ DEFAULT_IM_END_TOKEN = "<im_end>"
21
+
22
+
23
+ class OmniLMMConfig(MistralConfig):
24
+ model_type = "omnilmm"
25
+
26
+
27
+ class Identity(torch.nn.Identity):
28
+ def forward(self, input: Tensor, **kwargs) -> Tensor:
29
+ return super().forward(input)
30
+
31
+
32
+ def create_vision_module(config):
33
+ vision_tower = timm.create_model('eva02_enormous_patch14_clip_224.laion2b_plus',
34
+ pretrained=False,
35
+ num_classes=0,
36
+ dynamic_img_size=True,
37
+ dynamic_img_pad=True)
38
+
39
+ if isinstance(vision_tower, timm.models.VisionTransformer):
40
+ if vision_tower.attn_pool is not None:
41
+ vision_tower.attn_pool = Identity()
42
+
43
+ # use 2nd last layer's output
44
+ vision_tower.blocks[-1] = Identity()
45
+
46
+ embed_dim = config.hidden_size
47
+ resampler = Resampler(
48
+ grid_size=int(math.sqrt(config.num_query)),
49
+ embed_dim=embed_dim,
50
+ num_heads=embed_dim // 128,
51
+ kv_dim=vision_tower.embed_dim,
52
+ )
53
+ return vision_tower, resampler
54
+
55
+
56
+ class OmniLMMModel(MistralModel):
57
+ config_class = OmniLMMConfig
58
+
59
+ def __init__(self, config: OmniLMMConfig, mm_vision_tower=None, mm_hidden_size=None, tune_clip=True):
60
+ super(OmniLMMModel, self).__init__(config)
61
+
62
+ if hasattr(config, "mm_vision_tower"):
63
+ vision_tower, resampler = create_vision_module(config)
64
+
65
+ # print(__file__, 'skip loading vision tower weights')
66
+
67
+ # HACK: for FSDP
68
+ self.vision_tower = [vision_tower]
69
+ self.resampler = resampler
70
+ if tune_clip:
71
+ self.vision_tower = self.vision_tower[0]
72
+
73
+ self.vision_config = lambda x: None
74
+
75
+ def initialize_vision_modules(self, vision_tower, no_randaug, num_query, image_size, tune_clip=False):
76
+ self.config.mm_vision_tower = vision_tower
77
+ self.config.use_mm_proj = True
78
+ self.config.num_query = num_query
79
+ self.config.image_size = image_size
80
+
81
+ if not hasattr(self, 'vision_tower'):
82
+ vision_tower, resampler = create_vision_module(self.config)
83
+ state_dict = torch.load(
84
+ '/tt/data/public/multimodal/multimodal_model_ckpts/timm/eva02_enormous_patch14_clip_224.laion2b_plus.pt')
85
+ vision_tower.load_state_dict(state_dict, strict=False)
86
+ del state_dict
87
+ gc.collect()
88
+ else:
89
+ if isinstance(self.vision_tower, list):
90
+ vision_tower = self.vision_tower[0]
91
+ else:
92
+ vision_tower = self.vision_tower
93
+ resampler = self.resampler
94
+ self.vision_tower = vision_tower if tune_clip else [vision_tower]
95
+ self.resampler = resampler
96
+
97
+ train_img_transform = build_transform(
98
+ is_train=True, randaug=not no_randaug, input_size=self.config.image_size, std_mode='OPENAI_CLIP')
99
+ eval_img_transform = build_transform(
100
+ is_train=False, input_size=self.config.image_size, std_mode='OPENAI_CLIP')
101
+
102
+ return dict(
103
+ image_processor=(train_img_transform, eval_img_transform),
104
+ image_token_len=num_query,
105
+ vision_config=self.vision_config
106
+ )
107
+
108
+ def get_vision_embedding(self, pixel_values):
109
+ if isinstance(self.vision_tower, list):
110
+ vision_tower = self.vision_tower[0] # HACK: for FSDP
111
+ else:
112
+ vision_tower = self.vision_tower
113
+
114
+ dtype = vision_tower.pos_embed.data.dtype
115
+ vision_embedding = vision_tower.forward_features(
116
+ pixel_values.type(dtype))
117
+ if hasattr(vision_tower, 'num_prefix_tokens') and vision_tower.num_prefix_tokens > 0:
118
+ vision_embedding = vision_embedding[:,
119
+ vision_tower.num_prefix_tokens:]
120
+ res = self.resampler(vision_embedding)
121
+ return res
122
+
123
+ def get_vllm_embedding(self, data):
124
+
125
+ if 'vision_hidden_states' not in data:
126
+ pixel_values_list = data['pixel_values']
127
+ vision_hidden_states = []
128
+ for pixel_values in pixel_values_list:
129
+ if len(pixel_values) > 0:
130
+ vision_hidden_states.append(self.get_vision_embedding(pixel_values.unsqueeze(0))[0])
131
+ else:
132
+ vision_hidden_states.append([])
133
+ else:
134
+ vision_hidden_states = data['vision_hidden_states']
135
+
136
+ #vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
137
+ inputs_embeds = self.embed_tokens(data['input_ids'])
138
+ vision_hidden_states = [i.type(inputs_embeds.dtype)
139
+ if isinstance(i, torch.Tensor) else i for i in vision_hidden_states
140
+ ]
141
+
142
+
143
+ # HACK: replace back original embeddings for LLaVA pretraining
144
+ orig_embeds_params = getattr(self, 'orig_embeds_params', None)
145
+
146
+ new_input_embeds = []
147
+ cur_image_idx = 0
148
+ for cur_input_ids, cur_input_embeds in zip(data['input_ids'], inputs_embeds):
149
+ if (cur_input_ids == self.vision_config.im_patch_token).sum() == 0:
150
+ # multimodal LLM, but the current sample is not multimodal
151
+ cur_input_embeds = cur_input_embeds + (0. * dummy_image_features).sum()
152
+ new_input_embeds.append(cur_input_embeds)
153
+ continue
154
+
155
+ if self.vision_config.use_im_start_end:
156
+ cur_image_features = vision_hidden_states[cur_image_idx]
157
+ num_patches = cur_image_features.shape[0]
158
+ if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum():
159
+ raise ValueError(
160
+ "The number of image start tokens and image end tokens should be the same.")
161
+ image_start_tokens = torch.where(
162
+ cur_input_ids == self.vision_config.im_start_token)[0]
163
+ for image_start_token_pos in image_start_tokens:
164
+ cur_image_features = vision_hidden_states[cur_image_idx].to(
165
+ device=cur_input_embeds.device)
166
+ num_patches = cur_image_features.shape[0]
167
+ if cur_input_ids[image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token:
168
+ raise ValueError(
169
+ "The image end token should follow the image start token.")
170
+ if orig_embeds_params is not None:
171
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features,
172
+ cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
173
+ else:
174
+ cur_new_input_embeds = torch.cat(
175
+ (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
176
+ cur_image_idx += 1
177
+ new_input_embeds.append(cur_new_input_embeds)
178
+ else:
179
+ raise NotImplementedError
180
+ inputs_embeds = torch.stack(new_input_embeds, dim=0)
181
+
182
+ return inputs_embeds, vision_hidden_states
183
+
184
+ def forward(
185
+ self,
186
+ input_ids: torch.LongTensor = None,
187
+ attention_mask: Optional[torch.Tensor] = None,
188
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
189
+ inputs_embeds: Optional[torch.FloatTensor] = None,
190
+ use_cache: Optional[bool] = None,
191
+ output_attentions: Optional[bool] = None,
192
+ output_hidden_states: Optional[bool] = None,
193
+ images: Optional[torch.FloatTensor] = None,
194
+ return_dict: Optional[bool] = None,
195
+ **kwargs
196
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
197
+
198
+ # HACK: replace back original embeddings for LLaVA pretraining
199
+ orig_embeds_params = getattr(self, 'orig_embeds_params', None)
200
+
201
+ if inputs_embeds is None and past_key_values is None:
202
+ inputs_embeds = self.embed_tokens(input_ids)
203
+
204
+ vision_tower = getattr(self, 'vision_tower', None)
205
+ if vision_tower is not None and (input_ids.shape[1] != 1 or self.training) and images is not None:
206
+
207
+ if type(images) is list:
208
+ image_features = []
209
+ for image in images:
210
+ image_forward_out = self.get_vision_embedding(image.unsqueeze(0))[
211
+ 0]
212
+ image_features.append(image_forward_out)
213
+ else:
214
+ image_features = self.get_vision_embedding(images)
215
+
216
+ dummy_image_features = torch.zeros(
217
+ self.config.num_query,
218
+ self.config.hidden_size,
219
+ device=inputs_embeds.device,
220
+ dtype=inputs_embeds.dtype)
221
+
222
+ new_input_embeds = []
223
+ cur_image_idx = 0
224
+ for cur_input_ids, cur_input_embeds in zip(input_ids, inputs_embeds):
225
+ if (cur_input_ids == self.vision_config.im_patch_token).sum() == 0:
226
+ # multimodal LLM, but the current sample is not multimodal
227
+ cur_input_embeds = cur_input_embeds + \
228
+ (0. * dummy_image_features).sum()
229
+ new_input_embeds.append(cur_input_embeds)
230
+ continue
231
+
232
+ if self.vision_config.use_im_start_end:
233
+ cur_image_features = image_features[cur_image_idx]
234
+ num_patches = cur_image_features.shape[0]
235
+ if (cur_input_ids == self.vision_config.im_start_token).sum() != (cur_input_ids == self.vision_config.im_end_token).sum():
236
+ raise ValueError(
237
+ "The number of image start tokens and image end tokens should be the same.")
238
+ image_start_tokens = torch.where(
239
+ cur_input_ids == self.vision_config.im_start_token)[0]
240
+ for image_start_token_pos in image_start_tokens:
241
+ cur_image_features = image_features[cur_image_idx].to(
242
+ device=cur_input_embeds.device)
243
+ num_patches = cur_image_features.shape[0]
244
+ if cur_input_ids[image_start_token_pos + num_patches + 1] != self.vision_config.im_end_token:
245
+ raise ValueError(
246
+ "The image end token should follow the image start token.")
247
+ if orig_embeds_params is not None:
248
+ cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos].detach(), cur_input_embeds[image_start_token_pos:image_start_token_pos+1], cur_image_features,
249
+ cur_input_embeds[image_start_token_pos + num_patches + 1:image_start_token_pos + num_patches + 2], cur_input_embeds[image_start_token_pos + num_patches + 2:].detach()), dim=0)
250
+ else:
251
+ cur_new_input_embeds = torch.cat(
252
+ (cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
253
+ cur_image_idx += 1
254
+ new_input_embeds.append(cur_new_input_embeds)
255
+ else:
256
+ raise NotImplementedError
257
+ inputs_embeds = torch.stack(new_input_embeds, dim=0)
258
+ input_ids = None
259
+
260
+ return super(OmniLMMModel, self).forward(
261
+ input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values,
262
+ inputs_embeds=inputs_embeds, use_cache=use_cache,
263
+ output_attentions=output_attentions, output_hidden_states=output_hidden_states,
264
+ return_dict=return_dict,
265
+ **kwargs
266
+ )
267
+
268
+
269
+ class OmniLMMForCausalLM(MistralForCausalLM):
270
+ config_class = OmniLMMConfig
271
+
272
+ def __init__(self, config, mm_vision_tower=None, tune_clip=True):
273
+ super(MistralForCausalLM, self).__init__(config)
274
+ self.model = OmniLMMModel(
275
+ config, mm_vision_tower=mm_vision_tower, tune_clip=tune_clip)
276
+
277
+ self.lm_head = nn.Linear(
278
+ config.hidden_size, config.vocab_size, bias=False)
279
+
280
+ # Initialize weights and apply final processing
281
+ self.post_init()
282
+
283
+ def forward(
284
+ self,
285
+ input_ids: torch.LongTensor = None,
286
+ attention_mask: Optional[torch.Tensor] = None,
287
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
288
+ inputs_embeds: Optional[torch.FloatTensor] = None,
289
+ labels: Optional[torch.LongTensor] = None,
290
+ use_cache: Optional[bool] = None,
291
+ output_attentions: Optional[bool] = None,
292
+ output_hidden_states: Optional[bool] = None,
293
+ images: Optional[torch.FloatTensor] = None,
294
+ return_dict: Optional[bool] = None,
295
+ **kwargs
296
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
297
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
298
+ output_hidden_states = (
299
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
300
+ )
301
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
302
+
303
+ # print(f'@@@ At forward, labels: {labels.shape}-{labels}', flush=True)
304
+ # print(f'@@@ At forward, input_ids: {input_ids.shape}-{input_ids}', flush=True)
305
+ # print(f'@@@ At forward, input_ids: {attention_mask.shape}-{attention_mask}', flush=True)
306
+
307
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
308
+ outputs = self.model(
309
+ input_ids=input_ids,
310
+ attention_mask=attention_mask,
311
+ past_key_values=past_key_values,
312
+ inputs_embeds=inputs_embeds,
313
+ use_cache=use_cache,
314
+ output_attentions=output_attentions,
315
+ output_hidden_states=output_hidden_states,
316
+ return_dict=return_dict,
317
+ images=images,
318
+ **kwargs
319
+ )
320
+
321
+ hidden_states = outputs[0]
322
+ logits = self.lm_head(hidden_states)
323
+
324
+ loss = None
325
+ if labels is not None:
326
+ # Shift so that tokens < n predict n
327
+ shift_logits = logits[..., :-1, :].contiguous()
328
+ shift_labels = labels[..., 1:].contiguous()
329
+ # Flatten the tokens
330
+ loss_fct = CrossEntropyLoss()
331
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
332
+ shift_labels = shift_labels.view(-1)
333
+ # Enable model/pipeline parallelism
334
+ shift_labels = shift_labels.to(shift_logits.device)
335
+ loss = loss_fct(shift_logits, shift_labels)
336
+
337
+ if not return_dict:
338
+ output = (logits,) + outputs[1:]
339
+ return (loss,) + output if loss is not None else output
340
+
341
+ return CausalLMOutputWithPast(
342
+ loss=loss,
343
+ logits=logits,
344
+ past_key_values=outputs.past_key_values,
345
+ hidden_states=outputs.hidden_states,
346
+ attentions=outputs.attentions,
347
+ )
348
+
349
+ # TODO could be removed for generate_vllm()
350
+ def prepare_inputs_for_generation(
351
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
352
+ ):
353
+ if past_key_values:
354
+ input_ids = input_ids[:, -1:]
355
+
356
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
357
+ if inputs_embeds is not None and past_key_values is None:
358
+ model_inputs = {"inputs_embeds": inputs_embeds}
359
+ else:
360
+ model_inputs = {"input_ids": input_ids}
361
+
362
+ model_inputs.update(
363
+ {
364
+ "past_key_values": past_key_values,
365
+ "use_cache": kwargs.get("use_cache"),
366
+ "attention_mask": attention_mask,
367
+ "images": kwargs.get("images", None),
368
+ }
369
+ )
370
+ return model_inputs
371
+
372
+ def generate_vllm(
373
+ self,
374
+ input_ids: torch.LongTensor = None,
375
+ images: Optional[torch.FloatTensor] = None,
376
+ vision_hidden_states=None,
377
+ return_vision_hidden_states=False,
378
+ **kwargs
379
+ ):
380
+ model_inputs = {'input_ids': input_ids}
381
+ if vision_hidden_states is None:
382
+ model_inputs['pixel_values'] = images
383
+ else:
384
+ model_inputs['vision_hidden_states'] = vision_hidden_states
385
+
386
+ with torch.inference_mode():
387
+ inputs_embeds, vision_hidden_states = self.model.get_vllm_embedding(model_inputs)
388
+
389
+ result = self.generate(
390
+ inputs_embeds=inputs_embeds,
391
+ **kwargs
392
+ )
393
+
394
+ if return_vision_hidden_states:
395
+ return result, vision_hidden_states
396
+
397
+ return result
398
+
399
+
400
+ def initialize_vision_tokenizer(self, mm_use_im_start_end, tokenizer, device,
401
+ tune_mm_mlp_adapter=False):
402
+ self.model.vision_config.use_im_start_end = mm_use_im_start_end
403
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
404
+ self.resize_token_embeddings(len(tokenizer))
405
+
406
+ if mm_use_im_start_end:
407
+ num_new_tokens = tokenizer.add_tokens(
408
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
409
+ self.resize_token_embeddings(len(tokenizer))
410
+ self.model.vision_config.im_start_token, self.model.vision_config.im_end_token = tokenizer.convert_tokens_to_ids(
411
+ [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
412
+
413
+ if num_new_tokens > 0:
414
+ input_embeddings = self.get_input_embeddings().weight.data
415
+ output_embeddings = self.get_output_embeddings().weight.data
416
+
417
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
418
+ dim=0, keepdim=True)
419
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
420
+ dim=0, keepdim=True)
421
+
422
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
423
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
424
+
425
+ # for new sft data
426
+ num_new_tokens = tokenizer.add_tokens(
427
+ ['<box>', '</box>', '<ref>', '</ref>', '<quad>', '</quad>'], special_tokens=True)
428
+ self.resize_token_embeddings(len(tokenizer))
429
+
430
+ if num_new_tokens > 0:
431
+ input_embeddings = self.get_input_embeddings().weight.data
432
+ output_embeddings = self.get_output_embeddings().weight.data
433
+
434
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
435
+ dim=0, keepdim=True)
436
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
437
+ dim=0, keepdim=True)
438
+
439
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
440
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
441
+
442
+ if tune_mm_mlp_adapter:
443
+ self.model.orig_embeds_params = [
444
+ self.get_input_embeddings().weight.data.clone().to(device=device)]
445
+ for p in self.get_input_embeddings().parameters():
446
+ p.requires_grad = True
447
+ for p in self.get_output_embeddings().parameters():
448
+ p.requires_grad = False
449
+
450
+ self.model.vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
451
+ [DEFAULT_IMAGE_PATCH_TOKEN])[0]
452
+ print(f'Tokenizer: {tokenizer}\n patch_token_id: {self.model.vision_config.im_patch_token}, visoin_config: {self.model.vision_config}', flush=True)
453
+ # exit()
454
+
455
+
456
+ AutoConfig.register("omnilmm", OmniLMMConfig)
457
+ AutoModelForCausalLM.register(OmniLMMConfig, OmniLMMForCausalLM)
r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/resampler.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List, Union
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import trunc_normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+
42
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
43
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
44
+ """
45
+ grid_size: int of the grid height and width
46
+ return:
47
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
48
+ """
49
+ grid_h = np.arange(grid_size, dtype=np.float32)
50
+ grid_w = np.arange(grid_size, dtype=np.float32)
51
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
52
+ grid = np.stack(grid, axis=0)
53
+
54
+ grid = grid.reshape([2, 1, grid_size, grid_size])
55
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
56
+ if cls_token:
57
+ pos_embed = np.concatenate(
58
+ [np.zeros([1, embed_dim]), pos_embed], axis=0)
59
+ return pos_embed
60
+
61
+
62
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
63
+ assert embed_dim % 2 == 0
64
+
65
+ # use half of dimensions to encode grid_h
66
+ emb_h = get_1d_sincos_pos_embed_from_grid(
67
+ embed_dim // 2, grid[0]) # (H*W, D/2)
68
+ emb_w = get_1d_sincos_pos_embed_from_grid(
69
+ embed_dim // 2, grid[1]) # (H*W, D/2)
70
+
71
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
72
+ return emb
73
+
74
+
75
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
76
+ """
77
+ embed_dim: output dimension for each position
78
+ pos: a list of positions to be encoded: size (M,)
79
+ out: (M, D)
80
+ """
81
+ assert embed_dim % 2 == 0
82
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
83
+ omega /= embed_dim / 2.
84
+ omega = 1. / 10000 ** omega # (D/2,)
85
+
86
+ pos = pos.reshape(-1) # (M,)
87
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
88
+
89
+ emb_sin = np.sin(out) # (M, D/2)
90
+ emb_cos = np.cos(out) # (M, D/2)
91
+
92
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
93
+ return emb
94
+
95
+
96
+ class Resampler(nn.Module):
97
+ """
98
+ A 2D perceiver-resampler network with one cross attention layers by
99
+ (grid_size**2) learnable queries and 2d sincos pos_emb
100
+ Outputs:
101
+ A tensor with the shape of (grid_size**2, embed_dim)
102
+ """
103
+
104
+ def __init__(
105
+ self,
106
+ grid_size,
107
+ embed_dim,
108
+ num_heads,
109
+ kv_dim=None,
110
+ norm_layer=partial(nn.LayerNorm, eps=1e-6)
111
+ ):
112
+ super().__init__()
113
+ self.num_queries = grid_size ** 2
114
+ self.embed_dim = embed_dim
115
+ self.num_heads = num_heads
116
+
117
+ self.pos_embed = nn.Parameter(
118
+ torch.from_numpy(get_2d_sincos_pos_embed(
119
+ embed_dim, grid_size)).float()
120
+ ).requires_grad_(False)
121
+
122
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
123
+ trunc_normal_(self.query, std=.02)
124
+
125
+ if kv_dim is not None and kv_dim != embed_dim:
126
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
127
+ else:
128
+ self.kv_proj = nn.Identity()
129
+
130
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads)
131
+ self.ln_q = norm_layer(embed_dim)
132
+ self.ln_kv = norm_layer(embed_dim)
133
+
134
+ self.ln_post = norm_layer(embed_dim)
135
+ self.proj = nn.Parameter(
136
+ (embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim))
137
+
138
+ self.apply(self._init_weights)
139
+
140
+ def _init_weights(self, m):
141
+ if isinstance(m, nn.Linear):
142
+ trunc_normal_(m.weight, std=.02)
143
+ if isinstance(m, nn.Linear) and m.bias is not None:
144
+ nn.init.constant_(m.bias, 0)
145
+ elif isinstance(m, nn.LayerNorm):
146
+ nn.init.constant_(m.bias, 0)
147
+ nn.init.constant_(m.weight, 1.0)
148
+
149
+ def forward(self, x, attn_mask=None):
150
+
151
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
152
+
153
+ x = self.kv_proj(x)
154
+ x = self.ln_kv(x).permute(1, 0, 2)
155
+
156
+ N = x.shape[1]
157
+ q = self.ln_q(self.query)
158
+ # print((self._repeat(q, N) + self.pos_embed.unsqueeze(1)).dtype, (x + pos_embed.unsqueeze(1)).dtype, x.dtype)
159
+ out = self.attn(
160
+ self._repeat(q, N) + self.pos_embed.unsqueeze(1),
161
+ x + pos_embed.unsqueeze(1),
162
+ x,
163
+ attn_mask=attn_mask)[0]
164
+ x = out.permute(1, 0, 2)
165
+
166
+ x = self.ln_post(x)
167
+ x = x @ self.proj
168
+ return x
169
+
170
+ def _repeat(self, query, N: int):
171
+ return query.unsqueeze(1).repeat(1, N, 1)
r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/model/utils.py ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchvision import transforms
2
+ from timm.data.transforms import RandomResizedCropAndInterpolation
3
+ from timm.data.constants import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
4
+ from transformers import AutoConfig
5
+ from PIL import Image
6
+ from io import BytesIO
7
+ import torch.distributed as dist
8
+ import numpy as np
9
+ import pickle
10
+ import base64
11
+ import cv2
12
+ import os
13
+ import torch
14
+ from transformers import AutoConfig, StoppingCriteria
15
+
16
+ try:
17
+ from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
18
+ except ImportError:
19
+ OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
20
+ OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
21
+
22
+
23
+ def auto_upgrade(config):
24
+ cfg = AutoConfig.from_pretrained(config)
25
+ if 'llava' in config and cfg.model_type != 'llava':
26
+ print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
27
+ print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
28
+ confirm = input(
29
+ "Please confirm that you want to upgrade the checkpoint. [Y/N]")
30
+ if confirm.lower() in ["y", "yes"]:
31
+ print("Upgrading checkpoint...")
32
+ assert len(cfg.architectures) == 1
33
+ setattr(cfg.__class__, "model_type", "llava")
34
+ cfg.architectures[0] = 'LlavaLlamaForCausalLM'
35
+ cfg.save_pretrained(config)
36
+ print("Checkpoint upgraded.")
37
+ else:
38
+ print("Checkpoint upgrade aborted.")
39
+ exit(1)
40
+
41
+
42
+ class KeywordsStoppingCriteria(StoppingCriteria):
43
+ def __init__(self, keywords, tokenizer, input_ids):
44
+ self.keywords = keywords
45
+ self.tokenizer = tokenizer
46
+ self.start_len = None
47
+ self.input_ids = input_ids
48
+
49
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
50
+ if self.start_len is None:
51
+ self.start_len = self.input_ids.shape[1]
52
+ else:
53
+ outputs = self.tokenizer.batch_decode(
54
+ output_ids[:, self.start_len:], skip_special_tokens=True)[0]
55
+ for keyword in self.keywords:
56
+ if keyword in outputs:
57
+ return True
58
+ return False
59
+
60
+
61
+ def auto_upgrade(config):
62
+ cfg = AutoConfig.from_pretrained(config)
63
+ if 'llava' in config and cfg.model_type != 'llava':
64
+ print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
65
+ print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
66
+ confirm = input(
67
+ "Please confirm that you want to upgrade the checkpoint. [Y/N]")
68
+ if confirm.lower() in ["y", "yes"]:
69
+ print("Upgrading checkpoint...")
70
+ assert len(cfg.architectures) == 1
71
+ setattr(cfg.__class__, "model_type", "llava")
72
+ cfg.architectures[0] = 'LlavaLlamaForCausalLM'
73
+ cfg.save_pretrained(config)
74
+ print("Checkpoint upgraded.")
75
+ else:
76
+ print("Checkpoint upgrade aborted.")
77
+ exit(1)
78
+
79
+ # aug functions
80
+
81
+
82
+ def identity_func(img):
83
+ return img
84
+
85
+
86
+ def autocontrast_func(img, cutoff=0):
87
+ '''
88
+ same output as PIL.ImageOps.autocontrast
89
+ '''
90
+ n_bins = 256
91
+
92
+ def tune_channel(ch):
93
+ n = ch.size
94
+ cut = cutoff * n // 100
95
+ if cut == 0:
96
+ high, low = ch.max(), ch.min()
97
+ else:
98
+ hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
99
+ low = np.argwhere(np.cumsum(hist) > cut)
100
+ low = 0 if low.shape[0] == 0 else low[0]
101
+ high = np.argwhere(np.cumsum(hist[::-1]) > cut)
102
+ high = n_bins - 1 if high.shape[0] == 0 else n_bins - 1 - high[0]
103
+ if high <= low:
104
+ table = np.arange(n_bins)
105
+ else:
106
+ scale = (n_bins - 1) / (high - low)
107
+ table = np.arange(n_bins) * scale - low * scale
108
+ table[table < 0] = 0
109
+ table[table > n_bins - 1] = n_bins - 1
110
+ table = table.clip(0, 255).astype(np.uint8)
111
+ return table[ch]
112
+
113
+ channels = [tune_channel(ch) for ch in cv2.split(img)]
114
+ out = cv2.merge(channels)
115
+ return out
116
+
117
+
118
+ def equalize_func(img):
119
+ '''
120
+ same output as PIL.ImageOps.equalize
121
+ PIL's implementation is different from cv2.equalize
122
+ '''
123
+ n_bins = 256
124
+
125
+ def tune_channel(ch):
126
+ hist = cv2.calcHist([ch], [0], None, [n_bins], [0, n_bins])
127
+ non_zero_hist = hist[hist != 0].reshape(-1)
128
+ step = np.sum(non_zero_hist[:-1]) // (n_bins - 1)
129
+ if step == 0:
130
+ return ch
131
+ n = np.empty_like(hist)
132
+ n[0] = step // 2
133
+ n[1:] = hist[:-1]
134
+ table = (np.cumsum(n) // step).clip(0, 255).astype(np.uint8)
135
+ return table[ch]
136
+
137
+ channels = [tune_channel(ch) for ch in cv2.split(img)]
138
+ out = cv2.merge(channels)
139
+ return out
140
+
141
+
142
+ def rotate_func(img, degree, fill=(0, 0, 0)):
143
+ '''
144
+ like PIL, rotate by degree, not radians
145
+ '''
146
+ H, W = img.shape[0], img.shape[1]
147
+ center = W / 2, H / 2
148
+ M = cv2.getRotationMatrix2D(center, degree, 1)
149
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill)
150
+ return out
151
+
152
+
153
+ def solarize_func(img, thresh=128):
154
+ '''
155
+ same output as PIL.ImageOps.posterize
156
+ '''
157
+ table = np.array([el if el < thresh else 255 - el for el in range(256)])
158
+ table = table.clip(0, 255).astype(np.uint8)
159
+ out = table[img]
160
+ return out
161
+
162
+
163
+ def color_func(img, factor):
164
+ '''
165
+ same output as PIL.ImageEnhance.Color
166
+ '''
167
+ # implementation according to PIL definition, quite slow
168
+ # degenerate = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)[:, :, np.newaxis]
169
+ # out = blend(degenerate, img, factor)
170
+ # M = (
171
+ # np.eye(3) * factor
172
+ # + np.float32([0.114, 0.587, 0.299]).reshape(3, 1) * (1. - factor)
173
+ # )[np.newaxis, np.newaxis, :]
174
+ M = (
175
+ np.float32([
176
+ [0.886, -0.114, -0.114],
177
+ [-0.587, 0.413, -0.587],
178
+ [-0.299, -0.299, 0.701]]) * factor
179
+ + np.float32([[0.114], [0.587], [0.299]])
180
+ )
181
+ out = np.matmul(img, M).clip(0, 255).astype(np.uint8)
182
+ return out
183
+
184
+
185
+ def contrast_func(img, factor):
186
+ """
187
+ same output as PIL.ImageEnhance.Contrast
188
+ """
189
+ mean = np.sum(np.mean(img, axis=(0, 1)) * np.array([0.114, 0.587, 0.299]))
190
+ table = np.array([(
191
+ el - mean) * factor + mean
192
+ for el in range(256)
193
+ ]).clip(0, 255).astype(np.uint8)
194
+ out = table[img]
195
+ return out
196
+
197
+
198
+ def brightness_func(img, factor):
199
+ '''
200
+ same output as PIL.ImageEnhance.Contrast
201
+ '''
202
+ table = (np.arange(256, dtype=np.float32) *
203
+ factor).clip(0, 255).astype(np.uint8)
204
+ out = table[img]
205
+ return out
206
+
207
+
208
+ def sharpness_func(img, factor):
209
+ '''
210
+ The differences the this result and PIL are all on the 4 boundaries, the center
211
+ areas are same
212
+ '''
213
+ kernel = np.ones((3, 3), dtype=np.float32)
214
+ kernel[1][1] = 5
215
+ kernel /= 13
216
+ degenerate = cv2.filter2D(img, -1, kernel)
217
+ if factor == 0.0:
218
+ out = degenerate
219
+ elif factor == 1.0:
220
+ out = img
221
+ else:
222
+ out = img.astype(np.float32)
223
+ degenerate = degenerate.astype(np.float32)[1:-1, 1:-1, :]
224
+ out[1:-1, 1:-1, :] = degenerate + factor * \
225
+ (out[1:-1, 1:-1, :] - degenerate)
226
+ out = out.astype(np.uint8)
227
+ return out
228
+
229
+
230
+ def shear_x_func(img, factor, fill=(0, 0, 0)):
231
+ H, W = img.shape[0], img.shape[1]
232
+ M = np.float32([[1, factor, 0], [0, 1, 0]])
233
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill,
234
+ flags=cv2.INTER_LINEAR).astype(np.uint8)
235
+ return out
236
+
237
+
238
+ def translate_x_func(img, offset, fill=(0, 0, 0)):
239
+ '''
240
+ same output as PIL.Image.transform
241
+ '''
242
+ H, W = img.shape[0], img.shape[1]
243
+ M = np.float32([[1, 0, -offset], [0, 1, 0]])
244
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill,
245
+ flags=cv2.INTER_LINEAR).astype(np.uint8)
246
+ return out
247
+
248
+
249
+ def translate_y_func(img, offset, fill=(0, 0, 0)):
250
+ '''
251
+ same output as PIL.Image.transform
252
+ '''
253
+ H, W = img.shape[0], img.shape[1]
254
+ M = np.float32([[1, 0, 0], [0, 1, -offset]])
255
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill,
256
+ flags=cv2.INTER_LINEAR).astype(np.uint8)
257
+ return out
258
+
259
+
260
+ def posterize_func(img, bits):
261
+ '''
262
+ same output as PIL.ImageOps.posterize
263
+ '''
264
+ out = np.bitwise_and(img, np.uint8(255 << (8 - bits)))
265
+ return out
266
+
267
+
268
+ def shear_y_func(img, factor, fill=(0, 0, 0)):
269
+ H, W = img.shape[0], img.shape[1]
270
+ M = np.float32([[1, 0, 0], [factor, 1, 0]])
271
+ out = cv2.warpAffine(img, M, (W, H), borderValue=fill,
272
+ flags=cv2.INTER_LINEAR).astype(np.uint8)
273
+ return out
274
+
275
+
276
+ def cutout_func(img, pad_size, replace=(0, 0, 0)):
277
+ replace = np.array(replace, dtype=np.uint8)
278
+ H, W = img.shape[0], img.shape[1]
279
+ rh, rw = np.random.random(2)
280
+ pad_size = pad_size // 2
281
+ ch, cw = int(rh * H), int(rw * W)
282
+ x1, x2 = max(ch - pad_size, 0), min(ch + pad_size, H)
283
+ y1, y2 = max(cw - pad_size, 0), min(cw + pad_size, W)
284
+ out = img.copy()
285
+ out[x1:x2, y1:y2, :] = replace
286
+ return out
287
+
288
+
289
+ # level to args
290
+ def enhance_level_to_args(MAX_LEVEL):
291
+ def level_to_args(level):
292
+ return ((level / MAX_LEVEL) * 1.8 + 0.1,)
293
+ return level_to_args
294
+
295
+
296
+ def shear_level_to_args(MAX_LEVEL, replace_value):
297
+ def level_to_args(level):
298
+ level = (level / MAX_LEVEL) * 0.3
299
+ if np.random.random() > 0.5:
300
+ level = -level
301
+ return (level, replace_value)
302
+
303
+ return level_to_args
304
+
305
+
306
+ def translate_level_to_args(translate_const, MAX_LEVEL, replace_value):
307
+ def level_to_args(level):
308
+ level = (level / MAX_LEVEL) * float(translate_const)
309
+ if np.random.random() > 0.5:
310
+ level = -level
311
+ return (level, replace_value)
312
+
313
+ return level_to_args
314
+
315
+
316
+ def cutout_level_to_args(cutout_const, MAX_LEVEL, replace_value):
317
+ def level_to_args(level):
318
+ level = int((level / MAX_LEVEL) * cutout_const)
319
+ return (level, replace_value)
320
+
321
+ return level_to_args
322
+
323
+
324
+ def solarize_level_to_args(MAX_LEVEL):
325
+ def level_to_args(level):
326
+ level = int((level / MAX_LEVEL) * 256)
327
+ return (level, )
328
+ return level_to_args
329
+
330
+
331
+ def none_level_to_args(level):
332
+ return ()
333
+
334
+
335
+ def posterize_level_to_args(MAX_LEVEL):
336
+ def level_to_args(level):
337
+ level = int((level / MAX_LEVEL) * 4)
338
+ return (level, )
339
+ return level_to_args
340
+
341
+
342
+ def rotate_level_to_args(MAX_LEVEL, replace_value):
343
+ def level_to_args(level):
344
+ level = (level / MAX_LEVEL) * 30
345
+ if np.random.random() < 0.5:
346
+ level = -level
347
+ return (level, replace_value)
348
+
349
+ return level_to_args
350
+
351
+
352
+ func_dict = {
353
+ 'Identity': identity_func,
354
+ 'AutoContrast': autocontrast_func,
355
+ 'Equalize': equalize_func,
356
+ 'Rotate': rotate_func,
357
+ 'Solarize': solarize_func,
358
+ 'Color': color_func,
359
+ 'Contrast': contrast_func,
360
+ 'Brightness': brightness_func,
361
+ 'Sharpness': sharpness_func,
362
+ 'ShearX': shear_x_func,
363
+ 'TranslateX': translate_x_func,
364
+ 'TranslateY': translate_y_func,
365
+ 'Posterize': posterize_func,
366
+ 'ShearY': shear_y_func,
367
+ }
368
+
369
+ translate_const = 10
370
+ MAX_LEVEL = 10
371
+ replace_value = (128, 128, 128)
372
+ arg_dict = {
373
+ 'Identity': none_level_to_args,
374
+ 'AutoContrast': none_level_to_args,
375
+ 'Equalize': none_level_to_args,
376
+ 'Rotate': rotate_level_to_args(MAX_LEVEL, replace_value),
377
+ 'Solarize': solarize_level_to_args(MAX_LEVEL),
378
+ 'Color': enhance_level_to_args(MAX_LEVEL),
379
+ 'Contrast': enhance_level_to_args(MAX_LEVEL),
380
+ 'Brightness': enhance_level_to_args(MAX_LEVEL),
381
+ 'Sharpness': enhance_level_to_args(MAX_LEVEL),
382
+ 'ShearX': shear_level_to_args(MAX_LEVEL, replace_value),
383
+ 'TranslateX': translate_level_to_args(
384
+ translate_const, MAX_LEVEL, replace_value
385
+ ),
386
+ 'TranslateY': translate_level_to_args(
387
+ translate_const, MAX_LEVEL, replace_value
388
+ ),
389
+ 'Posterize': posterize_level_to_args(MAX_LEVEL),
390
+ 'ShearY': shear_level_to_args(MAX_LEVEL, replace_value),
391
+ }
392
+
393
+
394
+ class RandomAugment(object):
395
+
396
+ def __init__(self, N=2, M=10, isPIL=False, augs=[]):
397
+ self.N = N
398
+ self.M = M
399
+ self.isPIL = isPIL
400
+ if augs:
401
+ self.augs = augs
402
+ else:
403
+ self.augs = list(arg_dict.keys())
404
+
405
+ def get_random_ops(self):
406
+ sampled_ops = np.random.choice(self.augs, self.N)
407
+ return [(op, 0.5, self.M) for op in sampled_ops]
408
+
409
+ def __call__(self, img):
410
+ if self.isPIL:
411
+ img = np.array(img)
412
+ ops = self.get_random_ops()
413
+ for name, prob, level in ops:
414
+ if np.random.random() > prob:
415
+ continue
416
+ args = arg_dict[name](level)
417
+ img = func_dict[name](img, *args)
418
+ return img
419
+
420
+
421
+ def build_transform(is_train, randaug=True, input_size=224, interpolation='bicubic', std_mode='IMAGENET_INCEPTION'):
422
+ if std_mode == 'IMAGENET_INCEPTION':
423
+ mean = IMAGENET_INCEPTION_MEAN
424
+ std = IMAGENET_INCEPTION_STD
425
+ elif std_mode == 'OPENAI_CLIP':
426
+ mean = OPENAI_CLIP_MEAN
427
+ std = OPENAI_CLIP_STD
428
+ else:
429
+ raise NotImplementedError
430
+
431
+ if is_train:
432
+ crop_scale = float(os.environ.get('TRAIN_CROP_SCALE', 0.9999))
433
+ t = [
434
+ RandomResizedCropAndInterpolation(
435
+ input_size, scale=(crop_scale, 1.0), interpolation='bicubic'),
436
+ # transforms.RandomHorizontalFlip(),
437
+ ]
438
+ if randaug and os.environ.get('TRAIN_DO_AUG', 'False') == 'True':
439
+ print(f'@@@@@ Do random aug during training', flush=True)
440
+ t.append(
441
+ RandomAugment(
442
+ 2, 7, isPIL=True,
443
+ augs=[
444
+ 'Identity', 'AutoContrast', 'Equalize', 'Brightness', 'Sharpness',
445
+ 'ShearX', 'ShearY', 'TranslateX', 'TranslateY', 'Rotate',
446
+ ]))
447
+ else:
448
+ print(f'@@@@@ Skip random aug during training', flush=True)
449
+ t += [
450
+ transforms.ToTensor(),
451
+ transforms.Normalize(mean=mean, std=std),
452
+ ]
453
+ t = transforms.Compose(t)
454
+ else:
455
+ t = transforms.Compose([
456
+ transforms.Resize((input_size, input_size),
457
+ interpolation=transforms.InterpolationMode.BICUBIC),
458
+ transforms.ToTensor(),
459
+ transforms.Normalize(mean=mean, std=std)
460
+ ])
461
+
462
+ return t
463
+
464
+
465
+ def img2b64(img_path):
466
+ img = Image.open(img_path) # path to file
467
+ img_buffer = BytesIO()
468
+ img.save(img_buffer, format=img.format)
469
+ byte_data = img_buffer.getvalue()
470
+ base64_str = base64.b64encode(byte_data) # bytes
471
+ base64_str = base64_str.decode("utf-8") # str
472
+ return base64_str
473
+
474
+
475
+ def str2b64(str):
476
+ return base64.b64encode(str.encode('utf-8')).decode('utf-8')
477
+
478
+
479
+ def b642str(b64):
480
+ return base64.b64decode(b64).decode('utf-8')
481
+
482
+
483
+ def is_dist_avail_and_initialized():
484
+ if not dist.is_available():
485
+ return False
486
+ if not dist.is_initialized():
487
+ return False
488
+ return True
489
+
490
+
491
+ def get_world_size():
492
+ if not is_dist_avail_and_initialized():
493
+ return 1
494
+ return dist.get_world_size()
495
+
496
+
497
+ def get_rank():
498
+ if not is_dist_avail_and_initialized():
499
+ return 0
500
+ return dist.get_rank()
501
+
502
+
503
+ def all_gather(data):
504
+ """
505
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
506
+ Args:
507
+ data: any picklable object
508
+ Returns:
509
+ list[data]: list of data gathered from each rank
510
+ """
511
+ world_size = get_world_size()
512
+ if world_size == 1:
513
+ return [data]
514
+
515
+ # serialized to a Tensor
516
+ buffer = pickle.dumps(data)
517
+ storage = torch.ByteStorage.from_buffer(buffer)
518
+ tensor = torch.ByteTensor(storage).to("cuda")
519
+
520
+ # obtain Tensor size of each rank
521
+ local_size = torch.LongTensor([tensor.numel()]).to("cuda")
522
+ size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
523
+ dist.all_gather(size_list, local_size)
524
+ size_list = [int(size.item()) for size in size_list]
525
+ max_size = max(size_list)
526
+
527
+ # receiving Tensor from all ranks
528
+ # we pad the tensor because torch all_gather does not support
529
+ # gathering tensors of different shapes
530
+ tensor_list = []
531
+ for _ in size_list:
532
+ tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
533
+ if local_size != max_size:
534
+ padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
535
+ tensor = torch.cat((tensor, padding), dim=0)
536
+ dist.all_gather(tensor_list, tensor)
537
+
538
+ data_list = []
539
+ for size, tensor in zip(size_list, tensor_list):
540
+ buffer = tensor.cpu().numpy().tobytes()[:size]
541
+ data_list.append(pickle.loads(buffer))
542
+
543
+ return data_list
544
+
545
+
546
+ def mean(lst):
547
+ return sum(lst) / len(lst)
548
+
549
+
550
+ def stop_gradient_by_name(name: str):
551
+ def apply_fn(module):
552
+ if hasattr(module, name):
553
+ getattr(module, name).requires_grad_(False)
554
+
555
+ return apply_fn
r1-a/response_generation/minicpm/MiniCPM-o/omnilmm/train/train_utils.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import copy
4
+ import time
5
+
6
+ import torch
7
+ import warnings
8
+ import transformers
9
+
10
+ import numpy as np
11
+
12
+ from typing import Dict, Optional, Sequence
13
+ from omnilmm import conversation as conversation_lib
14
+
15
+ IGNORE_INDEX = -100
16
+ DEFAULT_IMAGE_TOKEN = "<image>"
17
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
18
+ DEFAULT_IM_START_TOKEN = "<im_start>"
19
+ DEFAULT_IM_END_TOKEN = "<im_end>"
20
+
21
+
22
+ def _tokenize_fn(strings: Sequence[str],
23
+ tokenizer: transformers.PreTrainedTokenizer) -> Dict:
24
+ """Tokenize a list of strings."""
25
+ tokenized_list = [
26
+ tokenizer(
27
+ text,
28
+ return_tensors="pt",
29
+ padding="longest",
30
+ max_length=tokenizer.model_max_length,
31
+ truncation=True,
32
+ ) for text in strings
33
+ ]
34
+ input_ids = labels = [
35
+ tokenized.input_ids[0] for tokenized in tokenized_list
36
+ ]
37
+ input_ids_lens = labels_lens = [
38
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
39
+ for tokenized in tokenized_list
40
+ ]
41
+ return dict(
42
+ input_ids=input_ids,
43
+ labels=labels,
44
+ input_ids_lens=input_ids_lens,
45
+ labels_lens=labels_lens,
46
+ )
47
+
48
+
49
+
50
+ def omni_preprocess(sources,
51
+ tokenizer: transformers.PreTrainedTokenizer,
52
+ generation=False):
53
+ system_content = 'You are an artificial intelligence assistant, which gives helpful, detailed, and polite answers to the human\'s questions.'
54
+ ignore_index = -100
55
+
56
+ response_template = '\n<|assistant|>\n'
57
+ instruction_template = '\n<|user|>\n'
58
+ response_token_ids = tokenizer.encode(
59
+ response_template, add_special_tokens=False)
60
+ instruction_token_ids = tokenizer.encode(
61
+ instruction_template, add_special_tokens=False)
62
+
63
+ batch_input_ids = []
64
+ batch_labels = []
65
+ for i in range(len(sources)):
66
+ new_source = []
67
+ prev_role = 'unexpect'
68
+ for conv_turn in sources[i]:
69
+ role = conv_turn['from'] if 'from' in conv_turn else conv_turn['role']
70
+ content = conv_turn['value'] if 'value' in conv_turn else conv_turn['content']
71
+
72
+ role = 'user' if role == 'human' else role
73
+ role = 'assistant' if role == 'gpt' else role
74
+
75
+ assert role in ['user', 'assistant']
76
+ assert role != prev_role, f'role={role}, prev_role={prev_role}'
77
+ prev_role = role
78
+
79
+ new_turn = {
80
+ 'role': role,
81
+ 'content': content
82
+ }
83
+ new_source.append(new_turn)
84
+ if new_source[0]['role'] != 'system':
85
+ new_source.insert(0, {'role': 'system', 'content': system_content})
86
+
87
+ # TODO: this automatically add '\n' to the end
88
+ res_text = tokenizer.apply_chat_template(
89
+ new_source, tokenize=False, add_generation_prompt=generation)
90
+ if not generation:
91
+ res_text = res_text.strip()
92
+
93
+ conversations_tokenized = _tokenize_fn([res_text], tokenizer)
94
+ res_input_ids = conversations_tokenized["input_ids"][0]
95
+
96
+ # since labels and input_ids are reference towards the same object
97
+ res_labels = copy.deepcopy(conversations_tokenized["labels"][0])
98
+
99
+ response_token_ids_idxs = []
100
+ human_token_ids_idxs = []
101
+
102
+ for assistant_idx in np.where(res_labels == response_token_ids[0])[0]:
103
+ # find the indexes of the start of a response.
104
+ if (response_token_ids == res_labels[assistant_idx: assistant_idx + len(
105
+ response_token_ids)].tolist()
106
+ ):
107
+ response_token_ids_idxs.append(
108
+ assistant_idx + len(response_token_ids))
109
+
110
+ if len(response_token_ids_idxs) == 0:
111
+ warnings.warn(
112
+ f"Could not find response key `{response_template}` in the "
113
+ f'following instance: @===>{tokenizer.decode(res_input_ids)}<===@ '
114
+ f'Raw text is @===>{res_text}<===@'
115
+ f'Raw source is @===>{new_source}<===@'
116
+ f"This instance will be ignored in loss calculation. "
117
+ f"Note, if this happens often, consider increasing the `max_seq_length`."
118
+ )
119
+ res_labels[:] = ignore_index
120
+
121
+ human_token_ids = instruction_token_ids
122
+ for human_idx in np.where(res_labels == human_token_ids[0])[0]:
123
+ # find the indexes of the start of a human answer.
124
+ if human_token_ids == res_labels[human_idx: human_idx + len(human_token_ids)].tolist():
125
+ human_token_ids_idxs.append(human_idx)
126
+
127
+ if len(human_token_ids_idxs) == 0:
128
+ warnings.warn(
129
+ f"Could not find instruction key `{instruction_template}` in the "
130
+ f'following instance: @===>{tokenizer.decode(res_input_ids)}<===@ '
131
+ f'Raw text is @===>{res_text}<===@'
132
+ f'Raw source is @===>{new_source}<===@'
133
+ f"This instance will be ignored in loss calculation. "
134
+ f"Note, if this happens often, consider increasing the `max_seq_length`."
135
+ )
136
+ res_labels[:] = ignore_index
137
+
138
+ for idx, (start, end) in enumerate(zip(human_token_ids_idxs, response_token_ids_idxs)):
139
+ # Make pytorch loss function ignore all non response tokens
140
+ if idx != 0:
141
+ res_labels[start:end] = ignore_index
142
+ else:
143
+ res_labels[:end] = ignore_index
144
+
145
+ if len(response_token_ids_idxs) < len(human_token_ids_idxs):
146
+ res_labels[human_token_ids_idxs[-1]:] = ignore_index
147
+
148
+ batch_input_ids.append(res_input_ids)
149
+ batch_labels.append(res_labels)
150
+
151
+ return dict(input_ids=batch_input_ids, labels=batch_labels)
152
+
153
+
r1-a/response_generation/minicpm/MiniCPM-o/quantize/bnb_quantize.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ the script will use bitandbytes to quantize the MiniCPM-Llama3-V-2_5 model.
3
+ the be quantized model can be finetuned by MiniCPM-Llama3-V-2_5 or not.
4
+ you only need to set the model_path 、save_path and run bash code
5
+
6
+ cd MiniCPM-V
7
+ python quantize/bnb_quantize.py
8
+
9
+ you will get the quantized model in save_path、quantized_model test time and gpu usage
10
+ """
11
+
12
+
13
+ import torch
14
+ from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
15
+ from PIL import Image
16
+ import time
17
+ import torch
18
+ import GPUtil
19
+ import os
20
+
21
+ assert torch.cuda.is_available(),"CUDA is not available, but this code requires a GPU."
22
+
23
+ device = 'cuda' # Select GPU to use
24
+ model_path = '/root/ld/ld_model_pretrained/MiniCPM-Llama3-V-2_5' # Model download path
25
+ save_path = '/root/ld/ld_model_pretrain/MiniCPM-Llama3-V-2_5_int4' # Quantized model save path
26
+ image_path = './assets/airplane.jpeg'
27
+
28
+
29
+ # Create a configuration object to specify quantization parameters
30
+ quantization_config = BitsAndBytesConfig(
31
+ load_in_4bit=True, # Whether to perform 4-bit quantization
32
+ load_in_8bit=False, # Whether to perform 8-bit quantization
33
+ bnb_4bit_compute_dtype=torch.float16, # Computation precision setting
34
+ bnb_4bit_quant_storage=torch.uint8, # Storage format for quantized weights
35
+ bnb_4bit_quant_type="nf4", # Quantization format, here using normally distributed int4
36
+ bnb_4bit_use_double_quant=True, # Whether to use double quantization, i.e., quantizing zeropoint and scaling parameters
37
+ llm_int8_enable_fp32_cpu_offload=False, # Whether LLM uses int8, with fp32 parameters stored on the CPU
38
+ llm_int8_has_fp16_weight=False, # Whether mixed precision is enabled
39
+ llm_int8_skip_modules=["out_proj", "kv_proj", "lm_head"], # Modules not to be quantized
40
+ llm_int8_threshold=6.0 # Outlier value in the llm.int8() algorithm, distinguishing whether to perform quantization based on this value
41
+ )
42
+
43
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
44
+ model = AutoModel.from_pretrained(
45
+ model_path,
46
+ device_map=device, # Allocate model to device
47
+ quantization_config=quantization_config,
48
+ trust_remote_code=True
49
+ )
50
+
51
+ gpu_usage = GPUtil.getGPUs()[0].memoryUsed
52
+ start=time.time()
53
+ response = model.chat(
54
+ image=Image.open(image_path).convert("RGB"),
55
+ msgs=[
56
+ {
57
+ "role": "user",
58
+ "content": "What is in this picture?"
59
+ }
60
+ ],
61
+ tokenizer=tokenizer
62
+ ) # 模型推理
63
+ print('Output after quantization:',response)
64
+ print('Inference time after quantization:',time.time()-start)
65
+ print(f"GPU memory usage after quantization: {round(gpu_usage/1024,2)}GB")
66
+
67
+ """
68
+ Expected output:
69
+
70
+ Output after quantization: This picture contains specific parts of an airplane, including wings, engines, and tail sections. These components are key parts of large commercial aircraft.
71
+ The wings support lift during flight, while the engines provide thrust to move the plane forward. The tail section is typically used for stabilizing flight and plays a role in airline branding.
72
+ The design and color of the airplane indicate that it belongs to Air China, likely a passenger aircraft due to its large size and twin-engine configuration.
73
+ There are no markings or insignia on the airplane indicating the specific model or registration number; such information may require additional context or a clearer perspective to discern.
74
+ Inference time after quantization: 8.583992719650269 seconds
75
+ GPU memory usage after quantization: 6.41 GB
76
+ """
77
+
78
+ # Save the model and tokenizer
79
+ os.makedirs(save_path, exist_ok=True)
80
+ model.save_pretrained(save_path, safe_serialization=True)
81
+ tokenizer.save_pretrained(save_path)
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/chatbot_web_demo_o2.6.py ADDED
@@ -0,0 +1,552 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ import torch
4
+ import argparse
5
+ from transformers import AutoModel, AutoTokenizer
6
+ import gradio as gr
7
+ from PIL import Image
8
+ from decord import VideoReader, cpu
9
+ import io
10
+ import os
11
+ import copy
12
+ import requests
13
+ import base64
14
+ import json
15
+ import traceback
16
+ import re
17
+ import modelscope_studio as mgr
18
+
19
+
20
+ # README: how to run this demo on different devices
21
+
22
+ # For Nvidia GPUs.
23
+ # python chatbot_web_demo_o2.6.py
24
+
25
+
26
+ # Argparser
27
+ parser = argparse.ArgumentParser(description='demo')
28
+ parser.add_argument('--model', type=str , default="openbmb/MiniCPM-o-2_6", help="huggingface model name or local path")
29
+ parser.add_argument('--multi-gpus', action='store_true', default=False, help='use multi-gpus')
30
+ args = parser.parse_args()
31
+ device = "cuda"
32
+ model_name = 'MiniCPM-o 2.6'
33
+
34
+ # Load model
35
+ model_path = args.model
36
+ if args.multi_gpus:
37
+ from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
38
+ with init_empty_weights():
39
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16,
40
+ init_audio=False, init_tts=False)
41
+ device_map = infer_auto_device_map(model, max_memory={0: "10GB", 1: "10GB"},
42
+ no_split_module_classes=['SiglipVisionTransformer', 'Qwen2DecoderLayer'])
43
+ device_id = device_map["llm.model.embed_tokens"]
44
+ device_map["llm.lm_head"] = device_id # firtt and last layer should be in same device
45
+ device_map["vpm"] = device_id
46
+ device_map["resampler"] = device_id
47
+ device_id2 = device_map["llm.model.layers.26"]
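+ # The reassignments below manually rebalance the auto-inferred device map: middle decoder
+ # layers 8-16 are pinned to the same GPU as layer 26 so the two 10GB budgets are filled
+ # more evenly (the exact layer indices are an empirical choice, not a hard requirement).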
48
+ device_map["llm.model.layers.8"] = device_id2
49
+ device_map["llm.model.layers.9"] = device_id2
50
+ device_map["llm.model.layers.10"] = device_id2
51
+ device_map["llm.model.layers.11"] = device_id2
52
+ device_map["llm.model.layers.12"] = device_id2
53
+ device_map["llm.model.layers.13"] = device_id2
54
+ device_map["llm.model.layers.14"] = device_id2
55
+ device_map["llm.model.layers.15"] = device_id2
56
+ device_map["llm.model.layers.16"] = device_id2
57
+ #print(device_map)
58
+
59
+ model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
60
+ else:
61
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, init_audio=False, init_tts=False)
62
+ model = model.to(device=device)
63
+
64
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
65
+ model.eval()
66
+
67
+
68
+
69
+
70
+ ERROR_MSG = "Error, please retry"
71
+ MAX_NUM_FRAMES = 64
72
+ IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
73
+ VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
74
+
75
+ def get_file_extension(filename):
76
+ return os.path.splitext(filename)[1].lower()
77
+
78
+ def is_image(filename):
79
+ return get_file_extension(filename) in IMAGE_EXTENSIONS
80
+
81
+ def is_video(filename):
82
+ return get_file_extension(filename) in VIDEO_EXTENSIONS
83
+
84
+
85
+ form_radio = {
86
+ 'choices': ['Beam Search', 'Sampling'],
87
+ #'value': 'Beam Search',
88
+ 'value': 'Sampling',
89
+ 'interactive': True,
90
+ 'label': 'Decode Type'
91
+ }
92
+
93
+
94
+ def create_component(params, comp='Slider'):
95
+ if comp == 'Slider':
96
+ return gr.Slider(
97
+ minimum=params['minimum'],
98
+ maximum=params['maximum'],
99
+ value=params['value'],
100
+ step=params['step'],
101
+ interactive=params['interactive'],
102
+ label=params['label']
103
+ )
104
+ elif comp == 'Radio':
105
+ return gr.Radio(
106
+ choices=params['choices'],
107
+ value=params['value'],
108
+ interactive=params['interactive'],
109
+ label=params['label']
110
+ )
111
+ elif comp == 'Button':
112
+ return gr.Button(
113
+ value=params['value'],
114
+ interactive=True
115
+ )
116
+
117
+
118
+ def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
119
+ return mgr.MultimodalInput(
120
+ upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
121
+ upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
122
+ submit_button_props={'label': 'Submit'}
123
+ )
124
+
125
+
126
+ def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
127
+ try:
128
+ print('msgs:', msgs)
129
+ answer = model.chat(
130
+ image=None,
131
+ msgs=msgs,
132
+ tokenizer=tokenizer,
133
+ **params
134
+ )
135
+ res = re.sub(r'(<box>.*</box>)', '', answer)
136
+ res = res.replace('<ref>', '')
137
+ res = res.replace('</ref>', '')
138
+ res = res.replace('<box>', '')
139
+ answer = res.replace('</box>', '')
140
+ print('answer:', answer)
141
+ return 0, answer, None, None
142
+ except Exception as e:
143
+ print(e)
144
+ traceback.print_exc()
145
+ return -1, ERROR_MSG, None, None
146
+
147
+
148
+ def encode_image(image):
149
+ if not isinstance(image, Image.Image):
150
+ if hasattr(image, 'path'):
151
+ image = Image.open(image.path).convert("RGB")
152
+ else:
153
+ image = Image.open(image.file.path).convert("RGB")
154
+ # resize to max_size
155
+ max_size = 448*16
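+ # cap the longer side at 448 * 16 = 7168 px (presumably 16 tiles of the model's 448-px image slice size)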
156
+ if max(image.size) > max_size:
157
+ w,h = image.size
158
+ if w > h:
159
+ new_w = max_size
160
+ new_h = int(h * max_size / w)
161
+ else:
162
+ new_h = max_size
163
+ new_w = int(w * max_size / h)
164
+ image = image.resize((new_w, new_h), resample=Image.BICUBIC)
165
+ return image
166
+ ## save by BytesIO and convert to base64
167
+ #buffered = io.BytesIO()
168
+ #image.save(buffered, format="png")
169
+ #im_b64 = base64.b64encode(buffered.getvalue()).decode()
170
+ #return {"type": "image", "pairs": im_b64}
171
+
172
+
173
+ def encode_video(video):
174
+ def uniform_sample(l, n):
175
+ gap = len(l) / n
176
+ idxs = [int(i * gap + gap / 2) for i in range(n)]
177
+ return [l[i] for i in idxs]
178
+
179
+ if hasattr(video, 'path'):
180
+ vr = VideoReader(video.path, ctx=cpu(0))
181
+ else:
182
+ vr = VideoReader(video.file.path, ctx=cpu(0))
183
+ sample_fps = round(vr.get_avg_fps() / 1) # frame stride for sampling roughly 1 frame per second
184
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
185
+ if len(frame_idx)>MAX_NUM_FRAMES:
186
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
187
+ video = vr.get_batch(frame_idx).asnumpy()
188
+ video = [Image.fromarray(v.astype('uint8')) for v in video]
189
+ video = [encode_image(v) for v in video]
190
+ print('video frames:', len(video))
191
+ return video
192
+
193
+
194
+ def check_mm_type(mm_file):
195
+ if hasattr(mm_file, 'path'):
196
+ path = mm_file.path
197
+ else:
198
+ path = mm_file.file.path
199
+ if is_image(path):
200
+ return "image"
201
+ if is_video(path):
202
+ return "video"
203
+ return None
204
+
205
+
206
+ def encode_mm_file(mm_file):
207
+ if check_mm_type(mm_file) == 'image':
208
+ return [encode_image(mm_file)]
209
+ if check_mm_type(mm_file) == 'video':
210
+ return encode_video(mm_file)
211
+ return None
212
+
213
+ def make_text(text):
214
+ #return {"type": "text", "pairs": text} # # For remote call
215
+ return text
216
+
217
+ def encode_message(_question):
218
+ files = _question.files
219
+ question = _question.text
220
+ pattern = r"\[mm_media\]\d+\[/mm_media\]"
221
+ matches = re.split(pattern, question)
222
+ message = []
223
+ if len(matches) != len(files) + 1:
224
+ gr.Warning("Number of Images not match the placeholder in text, please refresh the page to restart!")
225
+ assert len(matches) == len(files) + 1
226
+
227
+ text = matches[0].strip()
228
+ if text:
229
+ message.append(make_text(text))
230
+ for i in range(len(files)):
231
+ message += encode_mm_file(files[i])
232
+ text = matches[i + 1].strip()
233
+ if text:
234
+ message.append(make_text(text))
235
+ return message
236
+
237
+
238
+ def check_has_videos(_question):
239
+ images_cnt = 0
240
+ videos_cnt = 0
241
+ for file in _question.files:
242
+ if check_mm_type(file) == "image":
243
+ images_cnt += 1
244
+ else:
245
+ videos_cnt += 1
246
+ return images_cnt, videos_cnt
247
+
248
+
249
+ def count_video_frames(_context):
250
+ num_frames = 0
251
+ for message in _context:
252
+ for item in message["content"]:
253
+ #if item["type"] == "image": # For remote call
254
+ if isinstance(item, Image.Image):
255
+ num_frames += 1
256
+ return num_frames
257
+
258
+
259
+ def respond(_question, _chat_bot, _app_cfg, params_form):
260
+ _context = _app_cfg['ctx'].copy()
261
+ _context.append({'role': 'user', 'content': encode_message(_question)})
262
+
263
+ images_cnt = _app_cfg['images_cnt']
264
+ videos_cnt = _app_cfg['videos_cnt']
265
+ files_cnts = check_has_videos(_question)
266
+ if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
267
+ gr.Warning("Only supports single video file input right now!")
268
+ return _question, _chat_bot, _app_cfg
269
+
270
+ if params_form == 'Beam Search':
271
+ params = {
272
+ 'sampling': False,
273
+ 'num_beams': 3,
274
+ 'repetition_penalty': 1.2,
275
+ "max_new_tokens": 2048
276
+ }
277
+ else:
278
+ params = {
279
+ 'sampling': True,
280
+ 'top_p': 0.8,
281
+ 'top_k': 100,
282
+ 'temperature': 0.7,
283
+ 'repetition_penalty': 1.05,
284
+ "max_new_tokens": 2048
285
+ }
286
+
287
+ if files_cnts[1] + videos_cnt > 0:
288
+ params["max_inp_length"] = 4352 # 4096+256
289
+ params["use_image_id"] = False
290
+ params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2
291
+
292
+ code, _answer, _, sts = chat("", _context, None, params)
293
+
294
+ images_cnt += files_cnts[0]
295
+ videos_cnt += files_cnts[1]
296
+ _context.append({"role": "assistant", "content": [make_text(_answer)]})
297
+ _chat_bot.append((_question, _answer))
298
+ if code == 0:
299
+ _app_cfg['ctx']=_context
300
+ _app_cfg['sts']=sts
301
+ _app_cfg['images_cnt'] = images_cnt
302
+ _app_cfg['videos_cnt'] = videos_cnt
303
+
304
+ upload_image_disabled = videos_cnt > 0
305
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
306
+ return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg
307
+
308
+
309
+ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
310
+ ctx = _app_cfg["ctx"]
311
+ message_item = []
312
+ if _image is not None:
313
+ image = Image.open(_image).convert("RGB")
314
+ ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
315
+ message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
316
+ else:
317
+ if _user_message:
318
+ ctx.append({"role": "user", "content": [make_text(_user_message)]})
319
+ message_item.append({"text": _user_message, "files": []})
320
+ else:
321
+ message_item.append(None)
322
+ if _assistant_message:
323
+ ctx.append({"role": "assistant", "content": [make_text(_assistant_message)]})
324
+ message_item.append({"text": _assistant_message, "files": []})
325
+ else:
326
+ message_item.append(None)
327
+
328
+ _chat_bot.append(message_item)
329
+ return None, "", "", _chat_bot, _app_cfg
330
+
331
+
332
+ def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form):
333
+ user_message_contents = []
334
+ _context = _app_cfg["ctx"].copy()
335
+ if _image:
336
+ image = Image.open(_image).convert("RGB")
337
+ user_message_contents += [encode_image(image)]
338
+ if _user_message:
339
+ user_message_contents += [make_text(_user_message)]
340
+ if user_message_contents:
341
+ _context.append({"role": "user", "content": user_message_contents})
342
+
343
+ if params_form == 'Beam Search':
344
+ params = {
345
+ 'sampling': False,
346
+ 'num_beams': 3,
347
+ 'repetition_penalty': 1.2,
348
+ "max_new_tokens": 2048
349
+ }
350
+ else:
351
+ params = {
352
+ 'sampling': True,
353
+ 'top_p': 0.8,
354
+ 'top_k': 100,
355
+ 'temperature': 0.7,
356
+ 'repetition_penalty': 1.05,
357
+ "max_new_tokens": 2048
358
+ }
359
+
360
+ code, _answer, _, sts = chat("", _context, None, params)
361
+
362
+ _context.append({"role": "assistant", "content": [make_text(_answer)]})
363
+
364
+ if _image:
365
+ _chat_bot.append([
366
+ {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
367
+ {"text": _answer, "files": []}
368
+ ])
369
+ else:
370
+ _chat_bot.append([
371
+ {"text": _user_message, "files": [_image]},
372
+ {"text": _answer, "files": []}
373
+ ])
374
+ if code == 0:
375
+ _app_cfg['ctx']=_context
376
+ _app_cfg['sts']=sts
377
+ return None, '', '', _chat_bot, _app_cfg
378
+
379
+
380
+ def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form):
381
+ if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
382
+ gr.Warning('No question for regeneration.')
383
+ return '', _image, _user_message, _assistant_message, _chat_bot, _app_cfg
384
+ if _app_cfg["chat_type"] == "Chat":
385
+ images_cnt = _app_cfg['images_cnt']
386
+ videos_cnt = _app_cfg['videos_cnt']
387
+ _question = _chat_bot[-1][0]
388
+ _chat_bot = _chat_bot[:-1]
389
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
390
+ files_cnts = check_has_videos(_question)
391
+ images_cnt -= files_cnts[0]
392
+ videos_cnt -= files_cnts[1]
393
+ _app_cfg['images_cnt'] = images_cnt
394
+ _app_cfg['videos_cnt'] = videos_cnt
395
+ upload_image_disabled = videos_cnt > 0
396
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
397
+ _question, _chat_bot, _app_cfg = respond(_question, _chat_bot, _app_cfg, params_form)
398
+ return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
399
+ else:
400
+ last_message = _chat_bot[-1][0]
401
+ last_image = None
402
+ last_user_message = ''
403
+ if last_message.text:
404
+ last_user_message = last_message.text
405
+ if last_message.files:
406
+ last_image = last_message.files[0].file.path
407
+ _chat_bot = _chat_bot[:-1]
408
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
409
+ _image, _user_message, _assistant_message, _chat_bot, _app_cfg = fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form)
410
+ return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
411
+
412
+
413
+ def flushed():
414
+ return gr.update(interactive=True)
415
+
416
+
417
+ def clear(txt_message, chat_bot, app_session):
418
+ txt_message.files.clear()
419
+ txt_message.text = ''
420
+ chat_bot = copy.deepcopy(init_conversation)
421
+ app_session['sts'] = None
422
+ app_session['ctx'] = []
423
+ app_session['images_cnt'] = 0
424
+ app_session['videos_cnt'] = 0
425
+ return create_multimodal_input(), chat_bot, app_session, None, '', ''
426
+
427
+
428
+ def select_chat_type(_tab, _app_cfg):
429
+ _app_cfg["chat_type"] = _tab
430
+ return _app_cfg
431
+
432
+
433
+ init_conversation = [
434
+ [
435
+ None,
436
+ {
437
+ # The bot's first message disables the typewriter (flushing) effect.
438
+ "text": "You can talk to me now",
439
+ "flushing": False
440
+ }
441
+ ],
442
+ ]
443
+
444
+
445
+ css = """
446
+ video { height: auto !important; }
447
+ .example label { font-size: 16px;}
448
+ """
449
+
450
+ introduction = """
451
+
452
+ ## Features:
453
+ 1. Chat with single image
454
+ 2. Chat with multiple images
455
+ 3. Chat with video
456
+ 4. In-context few-shot learning
457
+
458
+ Click `How to use` tab to see examples.
459
+ """
460
+
461
+
462
+ with gr.Blocks(css=css) as demo:
463
+ with gr.Tab(model_name):
464
+ with gr.Row():
465
+ with gr.Column(scale=1, min_width=300):
466
+ gr.Markdown(value=introduction)
467
+ params_form = create_component(form_radio, comp='Radio')
468
+ regenerate = create_component({'value': 'Regenerate'}, comp='Button')
469
+ clear_button = create_component({'value': 'Clear History'}, comp='Button')
470
+
471
+ with gr.Column(scale=3, min_width=500):
472
+ app_session = gr.State({'sts':None,'ctx':[], 'images_cnt': 0, 'videos_cnt': 0, 'chat_type': 'Chat'})
473
+ chat_bot = mgr.Chatbot(label=f"Chat with {model_name}", value=copy.deepcopy(init_conversation), height=600, flushing=False, bubble_full_width=False)
474
+
475
+ with gr.Tab("Chat") as chat_tab:
476
+ txt_message = create_multimodal_input()
477
+ chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
478
+
479
+ txt_message.submit(
480
+ respond,
481
+ [txt_message, chat_bot, app_session, params_form],
482
+ [txt_message, chat_bot, app_session]
483
+ )
484
+
485
+ with gr.Tab("Few Shot") as fewshot_tab:
486
+ fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
487
+ with gr.Row():
488
+ with gr.Column(scale=1):
489
+ image_input = gr.Image(type="filepath", sources=["upload"])
490
+ with gr.Column(scale=3):
491
+ user_message = gr.Textbox(label="User")
492
+ assistant_message = gr.Textbox(label="Assistant")
493
+ with gr.Row():
494
+ add_demonstration_button = gr.Button("Add Example")
495
+ generate_button = gr.Button(value="Generate", variant="primary")
496
+ add_demonstration_button.click(
497
+ fewshot_add_demonstration,
498
+ [image_input, user_message, assistant_message, chat_bot, app_session],
499
+ [image_input, user_message, assistant_message, chat_bot, app_session]
500
+ )
501
+ generate_button.click(
502
+ fewshot_respond,
503
+ [image_input, user_message, chat_bot, app_session, params_form],
504
+ [image_input, user_message, assistant_message, chat_bot, app_session]
505
+ )
506
+
507
+ chat_tab.select(
508
+ select_chat_type,
509
+ [chat_tab_label, app_session],
510
+ [app_session]
511
+ )
512
+ chat_tab.select( # do clear
513
+ clear,
514
+ [txt_message, chat_bot, app_session],
515
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
516
+ )
517
+ fewshot_tab.select(
518
+ select_chat_type,
519
+ [fewshot_tab_label, app_session],
520
+ [app_session]
521
+ )
522
+ fewshot_tab.select( # do clear
523
+ clear,
524
+ [txt_message, chat_bot, app_session],
525
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
526
+ )
527
+ chat_bot.flushed(
528
+ flushed,
529
+ outputs=[txt_message]
530
+ )
531
+ regenerate.click(
532
+ regenerate_button_clicked,
533
+ [txt_message, image_input, user_message, assistant_message, chat_bot, app_session, params_form],
534
+ [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
535
+ )
536
+ clear_button.click(
537
+ clear,
538
+ [txt_message, chat_bot, app_session],
539
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
540
+ )
541
+
542
+ with gr.Tab("How to use"):
543
+ with gr.Column():
544
+ with gr.Row():
545
+ image_example = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/m_bear2.gif", label='1. Chat with single or multiple images', interactive=False, width=400, elem_classes="example")
546
+ example2 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/video2.gif", label='2. Chat with video', interactive=False, width=400, elem_classes="example")
547
+ example3 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/fshot.gif", label='3. Few shot', interactive=False, width=400, elem_classes="example")
548
+
549
+
550
+ # launch
551
+ demo.launch(share=False, debug=True, show_api=False, server_port=8000, server_name="0.0.0.0")
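+ # Once launched, the demo listens on all interfaces at port 8000 (e.g. http://localhost:8000).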
552
+
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/model_server.py ADDED
@@ -0,0 +1,936 @@
1
+ import base64
2
+ import json
3
+ import asyncio
4
+ import numpy as np
5
+ import os, sys, io
6
+ import threading
7
+ import time
8
+ import aiofiles
9
+ import librosa
10
+ import soundfile
11
+ import wave
12
+ from typing import Dict, List, Any, Optional
13
+ import argparse
14
+ import logging
15
+ import torch
16
+ from PIL import Image
17
+ from transformers import AutoModel, AutoTokenizer, AutoProcessor
18
+ import uvicorn
19
+ from fastapi import FastAPI, Header, Query, Request, HTTPException, WebSocket, WebSocketDisconnect
20
+ from fastapi.responses import JSONResponse, StreamingResponse
21
+
22
+ cur_path = os.path.split(os.path.realpath(__file__))[0]
23
+ sys.path.append(os.path.abspath(cur_path))
24
+ import vad_utils
25
+
26
+ def setup_logger():
27
+ logger = logging.getLogger("api_logger")
28
+ logger.setLevel(logging.DEBUG)
29
+
30
+ # Create formatter
31
+ formatter = logging.Formatter(
32
+ '%(asctime)s.%(msecs)03d-%(levelname)s-[%(filename)s:%(lineno)d] - %(message)s',
33
+ datefmt='%Y-%m-%d %H:%M:%S'
34
+ )
35
+
36
+ # Create handlers for stdout and stderr
37
+ stdout_handler = logging.StreamHandler(sys.stdout)
38
+ stdout_handler.setLevel(logging.INFO) # INFO and DEBUG go to stdout
39
+ stdout_handler.setFormatter(formatter)
40
+ stdout_handler.addFilter(lambda record: record.levelno <= logging.INFO)
41
+
42
+ stderr_handler = logging.StreamHandler(sys.stderr)
43
+ stderr_handler.setLevel(logging.WARNING) # WARNING, ERROR, CRITICAL go to stderr
44
+ stderr_handler.setFormatter(formatter)
45
+
46
+ # Add handlers to logger
47
+ logger.addHandler(stdout_handler)
48
+ logger.addHandler(stderr_handler)
49
+
50
+ return logger
51
+
52
+
53
+ app = FastAPI()
54
+ logger = setup_logger()
55
+
56
+ ap = argparse.ArgumentParser()
57
+ ap.add_argument('--port', type=int , default=32550)
58
+ ap.add_argument('--model', type=str , default="openbmb/MiniCPM-o-2_6", help="huggingface model name or local path")
59
+ args = ap.parse_args()
60
+
61
+
62
+ class StreamManager:
63
+ def __init__(self):
64
+ self.uid = None
65
+
66
+ self.is_streaming_complete = threading.Event()
67
+ self.conversation_started = threading.Event()
68
+ self.last_request_time = None
69
+ self.last_stream_time = None
70
+ self.timeout = 900 # seconds timeout
71
+ self.stream_timeout = 3 # seconds no stream
72
+ self.num_stream = 0
73
+ self.stream_started = False
74
+ self.stop_response = False
75
+
76
+ # VAD settings
77
+ self.vad_options = vad_utils.VadOptions()
78
+ self.vad_sequence_length = 5
79
+ self.vad_sequence = []
80
+ self.audio_prefill = []
81
+ self.audio_input = []
82
+ self.image_prefill = None
83
+ self.audio_chunk = 200
84
+
85
+ # customized options
86
+ self.customized_audio = None
87
+ self.customized_options = None
88
+
89
+ # Omni model
90
+ self.target_dtype = torch.bfloat16
91
+ self.device='cuda:0'
92
+
93
+ self.minicpmo_model_path = args.model #"openbmb/MiniCPM-o-2_6"
94
+ self.model_version = "2.6"
95
+ with torch.no_grad():
96
+ self.minicpmo_model = AutoModel.from_pretrained(self.minicpmo_model_path, trust_remote_code=True, torch_dtype=self.target_dtype, attn_implementation='sdpa')
97
+ self.minicpmo_tokenizer = AutoTokenizer.from_pretrained(self.minicpmo_model_path, trust_remote_code=True)
98
+ self.minicpmo_model.init_tts()
99
+ # self.minicpmo_model.tts.float()
100
+ self.minicpmo_model.to(self.device).eval()
101
+
102
+ self.ref_path_video_default = "assets/ref_audios/video_default.wav"
103
+ self.ref_path_default = "assets/ref_audios/default.wav"
104
+ self.ref_path_female = "assets/ref_audios/female_example.wav"
105
+ self.ref_path_male = "assets/ref_audios/male_example.wav"
106
+
107
+ self.input_audio_id = 0
108
+ self.input_audio_vad_id = 0
109
+ self.input_image_id = 0
110
+ self.output_audio_id = 0
111
+ self.flag_decode = False
112
+ self.cnts = None
113
+
114
+ self.all_start_time = time.time()
115
+ self.session_id = 233
116
+ self.sys_prompt_flag = False
117
+ self.vad_time = 0
118
+ self.ls_time = 0
119
+ self.msg_type = 1
120
+
121
+ self.speaking_time_stamp = 0
122
+ self.cycle_wait_time = 12800/24000 + 0.15
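+ # 12800 samples at 24 kHz ≈ 0.53 s of audio per generated chunk, plus a 0.15 s scheduling
+ # margin (the chunk size is presumably the TTS streaming output length).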
123
+ self.extra_wait_time = 2.5
124
+ self.server_wait = True
125
+
126
+ self.past_session_id = 0
127
+ self.sys_prompt_init(0)
128
+ self.session_id += 1
129
+
130
+
131
+ def start_conversation(self):
132
+ logger.info(f"uid {self.uid}: new conversation started.")
133
+ self.conversation_started.set()
134
+ self.stop_response = False
135
+
136
+ def update_last_request_time(self):
137
+ self.last_request_time = time.time()
138
+ #logger.info(f"update last_request_time {self.last_request_time}")
139
+
140
+ def update_last_stream_time(self):
141
+ self.last_stream_time = time.time()
142
+ #logger.info(f"update last_stream_time {self.last_stream_time}")
143
+
144
+ def move_to_device(self, obj, device):
145
+ if isinstance(obj, torch.Tensor):
146
+ obj_ = obj.to(device)
147
+ if (obj_.dtype == torch.float) or (obj_.dtype == torch.half):
148
+ # cast to `torch.bfloat16`
149
+ obj_ = obj_.to(self.target_dtype)
150
+ return obj_
151
+ elif isinstance(obj, dict):
152
+ return {key: self.move_to_device(value, device) for key, value in obj.items()}
153
+ elif isinstance(obj, list):
154
+ return [self.move_to_device(item, device) for item in obj]
155
+ elif isinstance(obj, tuple):
156
+ return tuple(self.move_to_device(item, device) for item in obj)
157
+ elif isinstance(obj, set):
158
+ return {self.move_to_device(item, device) for item in obj}
159
+ else:
160
+ return obj
161
+
162
+ def reset(self):
163
+ logger.info("reset")
164
+ self.is_streaming_complete.clear()
165
+ self.conversation_started.clear()
166
+ self.last_request_time = None
167
+ self.last_stream_time = None
168
+ self.audio_buffer_raw = bytearray()
169
+ self.num_stream = 0
170
+ self.stream_started = False
171
+ self.stop_response = False
172
+ # self.customized_audio = None
173
+ # self.customized_options = None
174
+ # clear model
175
+ self.clear()
176
+
177
+ def merge_wav_files(self, input_bytes_list, output_file):
178
+ with wave.open(io.BytesIO(input_bytes_list[0]), 'rb') as wav:
179
+ params = wav.getparams()
180
+ n_channels, sampwidth, framerate, n_frames, comptype, compname = params
181
+
182
+ with wave.open(output_file, 'wb') as output_wav:
183
+ output_wav.setnchannels(n_channels)
184
+ output_wav.setsampwidth(sampwidth)
185
+ output_wav.setframerate(framerate)
186
+ output_wav.setcomptype(comptype, compname)
187
+
188
+ for wav_bytes in input_bytes_list:
189
+ with wave.open(io.BytesIO(wav_bytes), 'rb') as wav:
190
+ output_wav.writeframes(wav.readframes(wav.getnframes()))
191
+
192
+
193
+ def is_timed_out(self):
194
+ if self.last_request_time is not None:
195
+ return time.time() - self.last_request_time > self.timeout
196
+ return False
197
+
198
+ def no_active_stream(self):
199
+ if self.last_stream_time is not None and self.stream_started:
200
+ no_stream_duration = time.time() - self.last_stream_time
201
+ if no_stream_duration > self.stream_timeout:
202
+ #logger.info(f"no active stream for {no_stream_duration} secs.")
203
+ return True
204
+ return False
205
+
206
+ def sys_prompt_init(self, msg_type):
207
+ if self.past_session_id == self.session_id:
208
+ return
209
+ logger.info("### sys_prompt_init ###")
210
+
211
+ logger.info(f'msg_type is {msg_type}')
212
+ if msg_type <= 1: #audio
213
+ audio_voice_clone_prompt = "Use the voice in the audio prompt to synthesize new content."
214
+ audio_assistant_prompt = "You are a helpful assistant with the above voice style."
215
+ ref_path = self.ref_path_default
216
+
217
+
218
+ if self.customized_options is not None:
219
+ audio_voice_clone_prompt = self.customized_options['voice_clone_prompt']
220
+ audio_assistant_prompt = self.customized_options['assistant_prompt']
221
+ if self.customized_options['use_audio_prompt'] == 1:
222
+ ref_path = self.ref_path_default
223
+ elif self.customized_options['use_audio_prompt'] == 2:
224
+ ref_path = self.ref_path_female
225
+ elif self.customized_options['use_audio_prompt'] == 3:
226
+ ref_path = self.ref_path_male
227
+
228
+ audio_prompt, sr = librosa.load(ref_path, sr=16000, mono=True)
229
+ sys_msg = {'role': 'user', 'content': [audio_voice_clone_prompt + "\n", audio_prompt, "\n" + audio_assistant_prompt]}
230
+ elif msg_type == 2: #video
231
+ voice_clone_prompt="你是一个AI助手。你能接受视频,音频和文本输入并输出语音和文本。模仿输入音频中的声音特征。"
232
+ assistant_prompt="作为助手,你将使用这种声音风格说话。"
233
+ ref_path = self.ref_path_video_default
234
+
235
+ if self.customized_options is not None:
236
+ voice_clone_prompt = self.customized_options['voice_clone_prompt']
237
+ assistant_prompt = self.customized_options['assistant_prompt']
238
+ if self.customized_options['use_audio_prompt'] == 1:
239
+ ref_path = self.ref_path_default
240
+ elif self.customized_options['use_audio_prompt'] == 2:
241
+ ref_path = self.ref_path_female
242
+ elif self.customized_options['use_audio_prompt'] == 3:
243
+ ref_path = self.ref_path_male
244
+
245
+ audio_prompt, sr = librosa.load(ref_path, sr=16000, mono=True)
246
+ sys_msg = {'role': 'user', 'content': [voice_clone_prompt, audio_prompt, assistant_prompt]}
247
+ # elif msg_type == 3: #user start
248
+ # assistant_prompt="作为助手,你将使用这种声音风格说话。"
249
+ # if self.customized_options is not None:
250
+ # assistant_prompt = self.customized_options['assistant_prompt']
251
+
252
+ # sys_msg = {'role': 'user', 'content': [assistant_prompt]}
253
+
254
+ self.msg_type = msg_type
255
+ msgs = [sys_msg]
256
+ if self.customized_options is not None:
257
+ if self.customized_options['use_audio_prompt'] > 0:
258
+ self.minicpmo_model.streaming_prefill(
259
+ session_id=str(self.session_id),
260
+ msgs=msgs,
261
+ tokenizer=self.minicpmo_tokenizer,
262
+ )
263
+ if msg_type == 0:
264
+ self.minicpmo_model.streaming_prefill(
265
+ session_id=str(self.session_id),
266
+ msgs=msgs,
267
+ tokenizer=self.minicpmo_tokenizer,
268
+ )
269
+
270
+ self.savedir = os.path.join(f"./log_data/{args.port}/", str(time.time()))
271
+ if not os.path.exists(self.savedir):
272
+ os.makedirs(self.savedir)
273
+ if not os.path.exists(self.savedir + "/input_audio_log"):
274
+ os.makedirs(self.savedir + "/input_audio_log")
275
+ if not os.path.exists(self.savedir + "/input_audio_vad_log"):
276
+ os.makedirs(self.savedir + "/input_audio_vad_log")
277
+ if not os.path.exists(self.savedir + "/input_image_log"):
278
+ os.makedirs(self.savedir + "/input_image_log")
279
+ if not os.path.exists(self.savedir + "/output_audio_log"):
280
+ os.makedirs(self.savedir + "/output_audio_log")
281
+ if not os.path.exists(self.savedir + "/feedback_log"):
282
+ os.makedirs(self.savedir + "/feedback_log")
283
+ if not os.path.exists(self.savedir + "/input_audio"):
284
+ os.makedirs(self.savedir + "/input_audio")
285
+
286
+ self.past_session_id = self.session_id
287
+ self.audio_prefill = []
288
+ self.audio_input = []
289
+
290
+ def clear(self):
291
+ try:
292
+ self.flag_decode = False
293
+ self.stream_started = False
294
+ self.cnts = None
295
+ self.vad_sequence = []
296
+ self.audio_prefill = []
297
+ self.audio_input = []
298
+ self.image_prefill = None
299
+
300
+ if self.minicpmo_model.llm_past_key_values[0][0].shape[2]>8192:
301
+ self.session_id += 1 # to clear all kv cache
302
+ self.sys_prompt_flag = False
303
+
304
+ self.vad_time = 0
305
+ self.ls_time = 0
306
+ self.msg_type = 1
307
+
308
+ except Exception as e:
309
+ raise ValueError(f"Clear error: {str(e)}")
310
+
311
+
312
+ def process_message(self, message: Dict[str, Any]):
313
+ try:
314
+ # Process content items
315
+ audio_data = None
316
+ image_data = None
317
+ for content_item in message["content"]:
318
+ if content_item["type"] == "stop_response":
319
+ logger.info("process_message: received request to stop_response")
320
+ self.stop_response = True
321
+ return "stop"
322
+ elif content_item["type"] == "input_audio":
323
+ audio_data = content_item["input_audio"]["data"]
324
+ audio_timestamp = content_item["input_audio"].get("timestamp", "")
325
+ elif content_item["type"] == "image_data":
326
+ image_data = content_item["image_data"]["data"]
327
+ if audio_data is None:
328
+ return "empty audio"
329
+
330
+ if self.conversation_started.is_set() and self.is_streaming_complete.is_set():
331
+ logger.info("conversation not started or still in generation, skip stream message.")
332
+ return "skip"
333
+
334
+ if self.flag_decode:
335
+ return "skip"
336
+
337
+ try:
338
+ audio_bytes = base64.b64decode(audio_data)
339
+
340
+ image = None
341
+ if image_data is not None:
342
+ if len(image_data) > 0:
343
+ image_bytes = base64.b64decode(image_data)
344
+ image_buffer = io.BytesIO(image_bytes)
345
+ image_buffer.seek(0)
346
+ image = Image.open(image_buffer)
347
+ # logger.info("read image")
348
+
349
+ if self.sys_prompt_flag is False:
350
+ self.all_start_time = time.time()
351
+ self.sys_prompt_flag = True
352
+ if image_data is not None:
353
+ self.sys_prompt_init(2)
354
+ else:
355
+ self.sys_prompt_init(1)
356
+
357
+ self.prefill(audio_bytes, image, False)
358
+
359
+ self.vad_sequence.append(audio_bytes)
360
+ if len(self.vad_sequence) < self.vad_sequence_length:
361
+ # logger.info('length of vad_sequence is {}, insufficient'.format(self.vad_sequence_length))
362
+ return "done"
363
+ elif len(self.vad_sequence) > self.vad_sequence_length:
364
+ # logger.info('length of vad_sequence exceeds {}'.format(self.vad_sequence_length))
365
+ self.vad_sequence.pop(0)
366
+ self.vad_check_audio_bytes(audio_bytes, image, 16000)
367
+
368
+ return "done"
369
+
370
+ except Exception as e:
371
+ raise ValueError(f"Audio processing error: {str(e)}")
372
+
373
+ except Exception as e:
374
+ raise ValueError(f"Message processing error: {str(e)}")
375
+
376
+ def resample_audio(self, input_path, src_sr, tar_sr, output_path):
377
+ audio_data, _ = librosa.load(input_path, sr=src_sr)
378
+ audio_new = librosa.resample(audio_data, orig_sr=src_sr, target_sr=tar_sr)
379
+ soundfile.write(output_path, audio_new, tar_sr)
380
+
381
+ def calculate_rms(self, input_path, sr):
382
+ audio_data, _ = librosa.load(input_path, sr=sr)
383
+ return (np.sqrt(np.mean(audio_data**2)) > 0.002)
384
+
385
+ def vad_check_audio_bytes(self, audio, image, sr):
386
+ try:
387
+ input_audio_vad_path = self.savedir + f"/input_audio_vad_log/vad_{self.input_audio_vad_id}.wav"
388
+ self.input_audio_vad_id += 1
389
+ self.merge_wav_files(self.vad_sequence, input_audio_vad_path)
390
+
391
+ with open(input_audio_vad_path,"rb") as f:
392
+ temp_audio = f.read()
393
+ dur_vad, vad_audio_bytes, time_vad = vad_utils.run_vad(temp_audio, sr, self.vad_options)
394
+ if self.customized_options is not None:
395
+ vad_threshold = 1 - self.customized_options['vad_threshold']
396
+ else:
397
+ vad_threshold = 0.2
398
+
399
+ if self.calculate_rms(input_audio_vad_path, sr) and dur_vad > 0.4:
400
+ if self.stream_started == False:
401
+ self.vad_time = time.time()
402
+ self.stream_started = True
403
+ elif dur_vad < vad_threshold:
404
+ if self.stream_started:
405
+ self.stream_started = False
406
+ if (time.time() - self.vad_time >= 0.6):
407
+ self.prefill(audio, image, True)
408
+ self.is_streaming_complete.set()
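+ # Turn-taking heuristic as implemented above: a window with enough energy and more than
+ # 0.4 s of detected speech opens a user turn; once detected speech drops below
+ # vad_threshold and at least 0.6 s has passed since the turn opened, the turn is closed,
+ # a final prefill is sent and generation is triggered.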
409
+ # self.ls_time = time.time()
410
+
411
+ except Exception as e:
412
+ logger.error(f"VAD error: {e}")
413
+ raise
414
+ return
415
+
416
+ def prefill(self, audio, image, is_end):
417
+ if self.server_wait:
418
+ now = time.time()
419
+ await_time = self.speaking_time_stamp - now + self.extra_wait_time
420
+ if await_time > 0:
421
+ return False
422
+
423
+ if self.flag_decode:
424
+ return False
425
+
426
+ if image is not None:
427
+ self.image_prefill = image
428
+ try:
429
+ if is_end == False:
430
+ self.audio_prefill.append(audio)
431
+ self.audio_input.append(audio)
432
+ slice_nums = 1
433
+ if is_end and self.customized_options is not None:
434
+ if self.customized_options['hd_video']:
435
+ slice_nums = 6
436
+ else:
437
+ return True
438
+ if (len(self.audio_prefill) == (1000/self.audio_chunk)) or (is_end and len(self.audio_prefill)>0):
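+ # Prefill roughly once per second of audio: with audio_chunk = 200 ms,
+ # 1000 / 200 = 5 buffered chunks trigger a streaming_prefill call (or earlier at end of turn).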
439
+ time_prefill = time.time()
440
+ input_audio_path = self.savedir + f"/input_audio_log/input_audio_{self.input_audio_id}.wav"
441
+ self.merge_wav_files(self.audio_prefill, input_audio_path)
442
+ with open(input_audio_path,"rb") as wav_io:
443
+ signal, sr = soundfile.read(wav_io, dtype='float32')
444
+ soundfile.write(input_audio_path, signal, 16000)
445
+ audio_np, sr = librosa.load(input_audio_path, sr=16000, mono=True)
446
+ self.audio_prefill = []
447
+
448
+ if len(audio_np) > 16000:
449
+ audio_np = audio_np[:16000]
450
+
451
+ with torch.no_grad():
452
+ if self.image_prefill is not None:
453
+ input_image_path = self.savedir + f'/input_image_log/input_image_{self.input_audio_id}.png'
454
+ self.image_prefill.save(input_image_path, 'PNG')
455
+ self.image_prefill = self.image_prefill.convert("RGB")
456
+
457
+ cnts = None
458
+ if self.image_prefill is not None:
459
+ cnts = ["<unit>", self.image_prefill, audio_np]
460
+ else:
461
+ cnts = [audio_np]
462
+
463
+ if cnts is not None:
464
+ msg = {"role":"user", "content": cnts}
465
+ msgs = [msg]
466
+ res = self.minicpmo_model.streaming_prefill(
467
+ session_id=str(self.session_id),
468
+ msgs=msgs,
469
+ tokenizer=self.minicpmo_tokenizer,
470
+ max_slice_nums=slice_nums,
471
+ )
472
+
473
+ self.input_audio_id += 1
474
+ return True
475
+
476
+ except Exception as e:
477
+ logger.error(f"prefill error: {e}")
478
+ import traceback
479
+ traceback.print_exc()
480
+ raise
481
+
482
+ def generate_end(self):
483
+ self.input_audio_id += 10
484
+ self.output_audio_id += 10
485
+ self.flag_decode = False
486
+ self.reset()
487
+ return
488
+
489
+ async def generate(self):
490
+ """ return audio bytes and response text (optional) """
491
+ if self.stop_response:
492
+ self.generate_end()
493
+ return
494
+
495
+ self.flag_decode = True
496
+ try:
497
+ with torch.no_grad():
498
+ logger.info("=== model gen start ===")
499
+ time_gen = time.time()
500
+ input_audio_path = self.savedir + f"/input_audio/all_input_audio_{self.input_audio_id}.wav"
501
+ self.merge_wav_files(self.audio_input, input_audio_path)
502
+ audio_stream = None
503
+ try:
504
+ with open(input_audio_path, 'rb') as wav_file:
505
+ audio_stream = wav_file.read()
506
+ except FileNotFoundError:
507
+ print(f"File {input_audio_path} not found.")
508
+ yield base64.b64encode(audio_stream).decode('utf-8'), "assistant:\n"
509
+
510
+ print('=== gen start: ', time.time() - time_gen)
511
+ first_time = True
512
+ temp_time = time.time()
513
+ temp_time1 = time.time()
514
+ with torch.inference_mode():
515
+ if self.stop_response:
516
+ self.generate_end()
517
+ return
518
+ self.minicpmo_model.config.stream_input=True
519
+ msg = {"role":"user", "content": self.cnts}
520
+ msgs = [msg]
521
+ text = ''
522
+ self.speaking_time_stamp = time.time()
523
+ try:
524
+ for r in self.minicpmo_model.streaming_generate(
525
+ session_id=str(self.session_id),
526
+ tokenizer=self.minicpmo_tokenizer,
527
+ generate_audio=True,
528
+ # enable_regenerate=True,
529
+ ):
530
+ if self.stop_response:
531
+ self.generate_end()
532
+ return
533
+ audio_np, sr, text = r["audio_wav"], r["sampling_rate"], r["text"]
534
+
535
+ output_audio_path = self.savedir + f'/output_audio_log/output_audio_{self.output_audio_id}.wav'
536
+ self.output_audio_id += 1
537
+ soundfile.write(output_audio_path, audio_np, samplerate=sr)
538
+ audio_stream = None
539
+ try:
540
+ with open(output_audio_path, 'rb') as wav_file:
541
+ audio_stream = wav_file.read()
542
+ except FileNotFoundError:
543
+ print(f"File {output_audio_path} not found.")
544
+ temp_time1 = time.time()
545
+ print('text: ', text)
546
+ yield base64.b64encode(audio_stream).decode('utf-8'), text
547
+ self.speaking_time_stamp += self.cycle_wait_time
548
+ except Exception as e:
549
+ logger.error(f"Error happened during generation: {str(e)}")
550
+ yield None, '\n<end>'
551
+
552
+ except Exception as e:
553
+ logger.error(f"发生异常:{e}")
554
+ import traceback
555
+ traceback.print_exc()
556
+ raise
557
+
558
+ finally:
559
+ logger.info(f"uid {self.uid}: generation finished!")
560
+ self.generate_end()
561
+
562
+ async def check_activity(self):
563
+ while True:
564
+ # Check for overall inactivity (30 minutes)
565
+ if self.is_timed_out():
566
+ self.reset()
567
+ if self.no_active_stream() and not self.is_streaming_complete.is_set():
568
+ self.is_streaming_complete.set()
569
+
570
+ await asyncio.sleep(1) # Check every second
571
+
572
+ def upload_customized_audio(self, audio_data, audio_fmt):
573
+ self.customized_audio = None
574
+ try:
575
+ if audio_data is not None and len(audio_data) > 0:
576
+ # if audio_fmt == "mp3" or audio_fmt == "wav":
577
+ audio_bytes = base64.b64decode(audio_data)
578
+ fio = io.BytesIO(audio_bytes)
579
+ fio.seek(0)
580
+ audio_np, sr = librosa.load(fio, sr=16000, mono=True)
581
+ if audio_np is not None and len(audio_np) > 1000:
582
+ output_audio_path = self.savedir + f'/customized_audio.wav'
583
+ soundfile.write(output_audio_path, audio_np, sr)
584
+ self.customized_audio = output_audio_path
585
+ logger.info(f"processed customized {audio_fmt} audio")
586
+ print(audio_np.shape, type(audio_np), sr)
587
+ else:
588
+ logger.info(f"empty customized audio, use default value instead.")
589
+ self.customized_audio = None
590
+ except Exception as e:
591
+ raise ValueError(f"Process customized audio error: {str(e)}")
592
+
593
+ def update_customized_options(self, uid, options):
594
+ self.customized_options = None
595
+ if options is None:
596
+ raise ValueError("Invalid None type for options, expected dict type")
597
+ self.customized_options = options
598
+ logger.info(f"uid: {uid} set customized_options to {options}")
599
+
600
+
601
+ stream_manager = StreamManager()
602
+
603
+
604
+ @app.on_event("startup")
605
+ async def startup_event():
606
+ logger.info("Starting application and activity checker")
607
+ asyncio.create_task(stream_manager.check_activity())
608
+
609
+ @app.on_event("shutdown")
610
+ async def shutdown_event():
611
+ logger.info("Shutting down application")
612
+
613
+ @app.post("/stream")
614
+ @app.post("/api/v1/stream")
615
+ async def stream(request: Request, uid: Optional[str] = Header(None)):
616
+ global stream_manager
617
+
618
+ stream_manager.update_last_request_time()
619
+ stream_manager.update_last_stream_time()
620
+
621
+ if not uid:
622
+ raise HTTPException(status_code=400, detail="Missing uid in headers")
623
+ if stream_manager.uid is not None and stream_manager.uid != uid:
624
+ logger.error(f"uid changed during steram: previous uid {stream_manager.uid}, new uid {uid}")
625
+ raise HTTPException(status_code=400, detail="uid changed in stream")
626
+
627
+ try:
628
+ # Parse JSON request
629
+ data = await request.json()
630
+
631
+ # Validate basic structure
632
+ if not isinstance(data, dict) or "messages" not in data:
633
+ raise HTTPException(status_code=400, detail="Invalid request format")
634
+
635
+ # Process messages
636
+ reason = ""
637
+ for message in data["messages"]:
638
+ if not isinstance(message, dict) or "role" not in message or "content" not in message:
639
+ raise HTTPException(status_code=400, detail="Invalid message format")
640
+ reason = stream_manager.process_message(message)
641
+
642
+ # Return response using uid from header
643
+ response = {
644
+ "id": uid,
645
+ "choices": {
646
+ "role": "assistant",
647
+ "content": "success",
648
+ "finish_reason": reason
649
+ }
650
+ }
651
+ return JSONResponse(content=response, status_code=200)
652
+
653
+ except json.JSONDecodeError:
654
+ raise HTTPException(status_code=400, detail="Invalid JSON")
655
+ except Exception as e:
656
+ raise HTTPException(status_code=500, detail=str(e))
657
+
658
+ @app.websocket("/ws/stream")
659
+ @app.websocket("/ws/api/v1/stream")
660
+ async def websocket_stream(websocket: WebSocket,
661
+ uid: Optional[str] = Query(None)):
662
+ global stream_manager
663
+
664
+ if not uid:
665
+ await websocket.close(code=400, reason="Missing uid in request")
666
+ return
667
+
668
+ # Accept the WebSocket connection
669
+ await websocket.accept()
670
+
671
+ #if stream_manager.uid is not None and stream_manager.uid != uid:
672
+ # logger.error(f"uid changed during steram: previous uid {stream_manager.uid}, new uid {uid}")
673
+ # await websocket.close(code=400, reason="Uid changed in stream.")
674
+ # return
675
+
676
+ try:
677
+ while True:
678
+ # Continuously listen for incoming messages from the client
679
+ data = await websocket.receive_text()
680
+
681
+ # Parse JSON request
682
+ try:
683
+ request_data = json.loads(data)
684
+ except json.JSONDecodeError:
685
+ await websocket.send_json({"error": "Invalid JSON"})
686
+ continue
687
+
688
+ stream_manager.update_last_request_time()
689
+ stream_manager.update_last_stream_time()
690
+
691
+ if stream_manager.uid is not None and stream_manager.uid != uid:
692
+ logger.error(f"uid changed during stream: previous uid {stream_manager.uid}, new uid {uid}")
693
+ await websocket.send_json({"error": "UID changed in stream"})
694
+ continue
695
+
696
+ # Validate basic structure
697
+ if not isinstance(request_data, dict) or "messages" not in request_data:
698
+ await websocket.send_json({"error": "Invalid request format"})
699
+ continue
700
+
701
+ # Process messages
702
+ try:
703
+ reason = ""
704
+ for message in request_data["messages"]:
705
+ if not isinstance(message, dict) or "role" not in message or "content" not in message:
706
+ await websocket.send_json({"error": "Invalid message format"})
707
+ continue
708
+ reason = stream_manager.process_message(message)
709
+
710
+ # Respond with success message
711
+ response = {
712
+ "id": uid,
713
+ "choices": {
714
+ "role": "assistant",
715
+ "content": "success",
716
+ "finish_reason": reason,
717
+ },
718
+ }
719
+ await websocket.send_json(response)
720
+ except WebSocketDisconnect:
721
+ # Handle WebSocket disconnection
722
+ break
723
+ except Exception as e:
724
+ logger.error(f"process message error: {str(e)}")
725
+ await websocket.close(code=1011, reason =f"Internal server error: {str(e)}")
726
+
727
+ except WebSocketDisconnect:
728
+ # Handle WebSocket disconnection
729
+ return
730
+ except Exception as e:
731
+ logger.error(f"ws_stream error: {str(e)}")
732
+ await websocket.close(code=1011, reason =f"Unexpected error: {str(e)}")
733
+
734
+
735
+ async def generate_sse_response(request: Request, uid: Optional[str] = Header(None)):
736
+ global stream_manager
737
+ print(f"uid: {uid}")
738
+ try:
739
+ # Wait for streaming to complete or timeout
740
+ while not stream_manager.is_streaming_complete.is_set():
741
+ # if stream_manager.is_timed_out():
742
+ # yield f"data: {json.dumps({'error': 'Stream timeout'})}\n\n"
743
+ # return
744
+ # print(f"{uid} whille not stream_manager.is_streaming_complete.is_set(), asyncio.sleep(0.1)")
745
+ await asyncio.sleep(0.1)
746
+
747
+ logger.info("streaming complete\n")
748
+ # Generate response
749
+ try:
750
+ yield f"event: message\n"
751
+ async for audio, text in stream_manager.generate():
752
+ if text == "stop":
753
+ break
754
+ res = {
755
+ "id": stream_manager.uid,
756
+ "response_id": stream_manager.output_audio_id,
757
+ "choices": [
758
+ {
759
+ "role": "assistant",
760
+ "audio": audio,
761
+ "text": text,
762
+ "finish_reason": "processing"
763
+ }
764
+ ]
765
+ }
766
+ # logger.info("generate_sse_response yield response")
767
+ yield f"data: {json.dumps(res)}\n\n"
768
+ await asyncio.sleep(0)
769
+
770
+ except Exception as e:
771
+ logger.error(f"Error while generation: {str(e)}")
772
+ yield f'data:{{"error": "{str(exc)}"}}\n\n'
773
+ except Exception as e:
774
+ yield f'data:{{"error": "{str(e)}"}}\n\n'
775
+
776
+ @app.post("/completions")
777
+ @app.post("/api/v1/completions")
778
+ async def completions(request: Request, uid: Optional[str] = Header(None)):
779
+ global stream_manager
780
+
781
+ if not uid:
782
+ raise HTTPException(status_code=400, detail="Missing uid in headers")
783
+
784
+ try:
785
+ # if stream_manager.uid is not None and stream_manager.uid != uid:
786
+ if stream_manager.uid != uid:
787
+ # stream_manager.stop_response = True
788
+ # logger.info(f"uid changed, reset model: previous uid {stream_manager.uid}, new uid {uid}")
789
+ stream_manager.session_id += 1
790
+ stream_manager.sys_prompt_flag = False
791
+ stream_manager.reset()
792
+
793
+ # raise HTTPException(
794
+ # status_code=409,
795
+ # detail="User id changed, reset context."
796
+ # )
797
+ stream_manager.speaking_time_stamp = 0
798
+ stream_manager.update_last_request_time()
799
+ stream_manager.uid = uid
800
+ stream_manager.start_conversation()
801
+
802
+ data = await request.json()
803
+
804
+ return StreamingResponse(
805
+ generate_sse_response(request, uid),
806
+ media_type="text/event-stream",
807
+ headers={
808
+ "Cache-Control": "no-cache",
809
+ "Connection": "keep-alive",
810
+ "Transfer-Encoding": "chunked"
811
+ }
812
+ )
813
+ except asyncio.TimeoutError:
814
+ raise HTTPException(
815
+ status_code=503,
816
+ detail="Server busy, please try again later"
817
+ )
818
+ except Exception as e:
819
+ logger.error(f"Error processing request for user {uid}: {str(e)}")
820
+ raise HTTPException(status_code=500, detail=str(e))
821
+
822
+
823
+ @app.post("/stop")
824
+ @app.post("/api/v1/stop")
825
+ async def stop_response(request: Request, uid: Optional[str] = Header(None)):
826
+ if not uid:
827
+ raise HTTPException(status_code=400, detail="Missing uid in headers")
828
+
829
+ global stream_manager
830
+ # stream_manager.session_id += 1
831
+ logger.info(f"uid {uid}: received stop_response")
832
+ stream_manager.stop_response = True
833
+ response = {
834
+ "id": uid,
835
+ "choices": {
836
+ "role": "assistant",
837
+ "content": "success",
838
+ "finish_reason": "stop"
839
+ }
840
+ }
841
+ return JSONResponse(content=response, status_code=200)
842
+
843
+ @app.post("/feedback")
844
+ @app.post("/api/v1/feedback")
845
+ async def feedback(request: Request, uid: Optional[str] = Header(None)):
846
+ global stream_manager
847
+
848
+ # Validate the 'uid' header
849
+ if not uid:
850
+ raise HTTPException(status_code=400, detail="Missing 'uid' header")
851
+
852
+ try:
853
+ data = await request.json()
854
+ if "response_id" not in data or "rating" not in data:
855
+ raise HTTPException(status_code=400, detail="Invalid request: must have response_id and rating")
856
+ response_id = data.get("response_id", "")
857
+ rating = data.get("rating", "")
858
+ comment = data.get("comment", "")
859
+ # Validate the rating
860
+ if rating not in ["like", "dislike"]:
861
+ raise HTTPException(status_code=400, detail=f"Invalid rating value: {rating}")
862
+
863
+ # Define the log file path
864
+ log_file_path = f"{stream_manager.savedir}/feedback_log/{response_id}.{rating}"
865
+ # Write the feedback to the file asynchronously
866
+ async with aiofiles.open(log_file_path, mode="a") as file:
867
+ await file.write(f"model: {stream_manager.minicpmo_model_path}\nuid {uid}: {comment}\n")
868
+ response = {
869
+ "id": uid,
870
+ "choices": {
871
+ "role": "assistant",
872
+ "content": "success",
873
+ "finish_reason": "done"
874
+ }
875
+ }
876
+ return JSONResponse(content=response, status_code=200)
877
+ except Exception as e:
878
+ logger.error(f"Error processing feedback for user {uid}: {str(e)}")
879
+ raise HTTPException(status_code=500, detail=str(e))
880
+
881
+
882
+ @app.post("/init_options")
883
+ @app.post("/api/v1/init_options")
884
+ async def init_options(request: Request, uid: Optional[str] = Header(None)):
885
+ global stream_manager
886
+
887
+ stream_manager.update_last_request_time()
888
+
889
+ if not uid:
890
+ raise HTTPException(status_code=400, detail="Missing uid in headers")
891
+ try:
892
+ # Parse JSON request
893
+ data = await request.json()
894
+
895
+ # Validate basic structure
896
+ if not isinstance(data, dict) or "messages" not in data:
897
+ raise HTTPException(status_code=400, detail="Invalid request format")
898
+
899
+ messages = data.get("messages", [])
900
+ for message in messages:
901
+ if not isinstance(message, dict) or "role" not in message or "content" not in message:
902
+ raise HTTPException(status_code=400, detail="Invalid message format")
903
+
904
+ for content in message.get("content", []):
905
+ if content["type"] == "input_audio":
906
+ audio_data = content["input_audio"].get("data", "")
907
+ audio_fmt = content["input_audio"].get("format", "")
908
+ stream_manager.upload_customized_audio(audio_data, audio_fmt)
909
+ elif content["type"] == "options":
910
+ stream_manager.update_customized_options(uid, content["options"])
911
+ else:
912
+ ctype = content["type"]
913
+ raise HTTPException(status_code=400, detail=f"Invalid content type: {ctype}")
914
+ version = stream_manager.model_version
915
+ print(version)
916
+ response = {
917
+ "id": uid,
918
+ "choices": {
919
+ "role": "assistant",
920
+ "content": version,
921
+ "finish_reason": "done"
922
+ }
923
+ }
924
+ return JSONResponse(content=response, status_code=200)
925
+ except Exception as e:
926
+ raise HTTPException(status_code=400, detail=f"init options error: {str(e)}")
927
+
928
+
929
+ @app.get('/health')
930
+ @app.get('/api/v1/health')
931
+ async def health_check():
932
+ return {"status": "OK"}
933
+
934
+
935
+ if __name__ == "__main__":
936
+ uvicorn.run(app, host="0.0.0.0", port=args.port)
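+
+ # A minimal client sketch (assumed usage, not part of the original file):
+ # curl http://localhost:32550/api/v1/health
+ # curl -N -X POST http://localhost:32550/api/v1/completions -H "uid: demo-user"
+ # Audio (and optional image) frames are pushed separately to /api/v1/stream or the
+ # /ws/api/v1/stream websocket as JSON messages whose content items carry
+ # base64-encoded "input_audio" / "image_data" payloads.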
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/vad_utils.py ADDED
@@ -0,0 +1,301 @@
1
+ import functools
2
+ import numpy as np
3
+ import librosa
4
+ import os
5
+ import time
6
+ import traceback
7
+
8
+ from typing import List, NamedTuple, Optional
9
+
10
+ class VadOptions(NamedTuple):
11
+ """VAD options.
12
+
13
+ Attributes:
14
+ threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
15
+ probabilities ABOVE this value are considered as SPEECH. It is better to tune this
16
+ parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
17
+ min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
18
+ max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
19
+ than max_speech_duration_s will be split at the timestamp of the last silence that
20
+ lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
21
+ split aggressively just before max_speech_duration_s.
22
+ min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
23
+ before separating it
24
+ window_size_samples: Audio chunks of window_size_samples size are fed to the silero VAD model.
25
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
26
+ Values other than these may affect model performance!!
27
+ speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side
28
+ """
29
+
30
+ # threshold: float = 0.3 # rep 0.5
31
+ # min_speech_duration_ms: int = 250
32
+ # max_speech_duration_s: float = float("inf")
33
+ # min_silence_duration_ms: int = 2000
34
+ # window_size_samples: int = 1024
35
+ # speech_pad_ms: int = 600 # rep 400
36
+
37
+ threshold: float = 0.7 # gw: 0.3 # rep 0.5
38
+ min_speech_duration_ms: int = 128 # original & gw: 250
39
+ max_speech_duration_s: float = float("inf")
40
+ min_silence_duration_ms: int = 500 # original & gw: 2000
41
+ window_size_samples: int = 1024
42
+ speech_pad_ms: int = 30 # gw: 600 # rep 400
43
+
44
+ class SileroVADModel:
45
+ def __init__(self, path):
46
+ try:
47
+ import onnxruntime
48
+ except ImportError as e:
49
+ raise RuntimeError(
50
+ "Applying the VAD filter requires the onnxruntime package"
51
+ ) from e
52
+
53
+ opts = onnxruntime.SessionOptions()
54
+ opts.inter_op_num_threads = 1
55
+ opts.intra_op_num_threads = 1
56
+ opts.log_severity_level = 4
57
+
58
+ self.session = onnxruntime.InferenceSession(
59
+ path,
60
+ providers=["CPUExecutionProvider"],
61
+ sess_options=opts,
62
+ )
63
+
64
+ def get_initial_state(self, batch_size: int):
65
+ h = np.zeros((2, batch_size, 64), dtype=np.float32)
66
+ c = np.zeros((2, batch_size, 64), dtype=np.float32)
67
+ return h, c
68
+
69
+ def __call__(self, x, state, sr: int):
70
+ if len(x.shape) == 1:
71
+ x = np.expand_dims(x, 0)
72
+ if len(x.shape) > 2:
73
+ raise ValueError(
74
+ f"Too many dimensions for input audio chunk {len(x.shape)}"
75
+ )
76
+ if sr / x.shape[1] > 31.25:
77
+ raise ValueError("Input audio chunk is too short")
78
+
79
+ h, c = state
80
+
81
+ ort_inputs = {
82
+ "input": x,
83
+ #"state": np.concatenate((h, c), axis=0),
84
+ "h": h,
85
+ "c": c,
86
+ "sr": np.array(sr, dtype="int64"),
87
+ }
88
+
89
+ out, h, c = self.session.run(None, ort_inputs)
90
+ #out = self.session.run(None, ort_inputs)
91
+ state = (h, c)
92
+ return out, state
93
+
94
+
95
+ @functools.lru_cache
96
+ def get_vad_model():
97
+ """Returns the VAD model instance."""
98
+ path = os.path.join(os.path.dirname(__file__), "silero_vad.onnx")
99
+ return SileroVADModel(path)
100
+
101
+
102
+ def get_speech_timestamps(
103
+ audio: np.ndarray,
104
+ vad_options: Optional[VadOptions] = None,
105
+ **kwargs,
106
+ ) -> List[dict]:
107
+ """This method is used for splitting long audios into speech chunks using silero VAD.
108
+
109
+ Args:
110
+ audio: One dimensional float array.
111
+ vad_options: Options for VAD processing.
112
+ kwargs: VAD options passed as keyword arguments for backward compatibility.
113
+
114
+ Returns:
115
+ List of dicts containing begin and end samples of each speech chunk.
116
+ """
117
+ if vad_options is None:
118
+ vad_options = VadOptions(**kwargs)
119
+
120
+ threshold = vad_options.threshold
121
+ min_speech_duration_ms = vad_options.min_speech_duration_ms
122
+ max_speech_duration_s = vad_options.max_speech_duration_s
123
+ min_silence_duration_ms = vad_options.min_silence_duration_ms
124
+ window_size_samples = vad_options.window_size_samples
125
+ speech_pad_ms = vad_options.speech_pad_ms
126
+
127
+ if window_size_samples not in [512, 1024, 1536]:
128
+ warnings.warn(
129
+ "Unusual window_size_samples! Supported window_size_samples:\n"
130
+ " - [512, 1024, 1536] for 16000 sampling_rate"
131
+ )
132
+
133
+ sampling_rate = 16000
134
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000  # segments shorter than this are not added
135
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
136
+ max_speech_samples = (
137
+ sampling_rate * max_speech_duration_s
138
+ - window_size_samples
139
+ - 2 * speech_pad_samples
140
+ )
141
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000  # a speech chunk only ends after min_silence_duration_ms of silence
142
+ min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 # 0.098s # need to adjust?
143
+
144
+ audio_length_samples = len(audio)
145
+
146
+ # import pdb
147
+ # pdb.set_trace()
148
+
149
+ model = get_vad_model()
150
+ state = model.get_initial_state(batch_size=1)
151
+
152
+ speech_probs = []
153
+ #print("audio_length_samples ", audio_length_samples, ", window_size_samples ", window_size_samples)
154
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
155
+ chunk = audio[current_start_sample : current_start_sample + window_size_samples]
156
+ if len(chunk) < window_size_samples:
157
+ chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
158
+ speech_prob, state = model(chunk, state, sampling_rate)
159
+ speech_probs.append(speech_prob)
160
+
161
+ triggered = False
162
+ speeches = []
163
+ current_speech = {}
164
+ neg_threshold = threshold - 0.15
165
+
166
+ # to save potential segment end (and tolerate some silence)
167
+ temp_end = 0
168
+ # to save potential segment limits in case of maximum segment size reached
169
+ prev_end = next_start = 0
170
+
171
+ # Roughly: scan the audio for contiguous speech regions. When silence is hit, temp_end is recorded first; if speech resumes before the minimum silence length is reached, temp_end is reset. Segment start/end points are tracked, and a segment is split once it exceeds the maximum length (not fully verified, but with max_speech_duration_s = inf that branch is never reached anyway).
172
+
173
+ for i, speech_prob in enumerate(speech_probs):
174
+ if (speech_prob >= threshold) and temp_end:
175
+ temp_end = 0
176
+ if next_start < prev_end:
177
+ next_start = window_size_samples * i
178
+
179
+ if (speech_prob >= threshold) and not triggered:
180
+ triggered = True
181
+ current_speech["start"] = window_size_samples * i
182
+ continue
183
+
184
+ if (
185
+ triggered
186
+ and (window_size_samples * i) - current_speech["start"] > max_speech_samples
187
+ ):
188
+ if prev_end:
189
+ current_speech["end"] = prev_end
190
+ speeches.append(current_speech)
191
+ current_speech = {}
192
+ # previously reached silence (< neg_thres) and is still not speech (< thres)
193
+ if next_start < prev_end:
194
+ triggered = False
195
+ else:
196
+ current_speech["start"] = next_start
197
+ prev_end = next_start = temp_end = 0
198
+ else:
199
+ current_speech["end"] = window_size_samples * i
200
+ speeches.append(current_speech)
201
+ current_speech = {}
202
+ prev_end = next_start = temp_end = 0
203
+ triggered = False
204
+ continue
205
+
206
+ if (speech_prob < neg_threshold) and triggered:
207
+ if not temp_end:
208
+ temp_end = window_size_samples * i
209
+ # condition to avoid cutting in very short silence
210
+ if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
211
+ prev_end = temp_end
212
+ if (window_size_samples * i) - temp_end < min_silence_samples:
213
+ continue
214
+ else:
215
+ current_speech["end"] = temp_end
216
+ if (
217
+ current_speech["end"] - current_speech["start"]
218
+ ) > min_speech_samples:
219
+ speeches.append(current_speech)
220
+ current_speech = {}
221
+ prev_end = next_start = temp_end = 0
222
+ triggered = False
223
+ continue
224
+
225
+
226
+ if (
227
+ current_speech
228
+ and (audio_length_samples - current_speech["start"]) > min_speech_samples
229
+ ):
230
+ current_speech["end"] = audio_length_samples
231
+ speeches.append(current_speech)
232
+
233
+ # pad each chunk by speech_pad_ms; when the silence between two chunks is too short, it is split evenly between them
234
+ for i, speech in enumerate(speeches):
235
+ if i == 0:
236
+ speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
237
+ if i != len(speeches) - 1:
238
+ silence_duration = speeches[i + 1]["start"] - speech["end"]
239
+ if silence_duration < 2 * speech_pad_samples:
240
+ speech["end"] += int(silence_duration // 2)
241
+ speeches[i + 1]["start"] = int(
242
+ max(0, speeches[i + 1]["start"] - silence_duration // 2)
243
+ )
244
+ else:
245
+ speech["end"] = int(
246
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
247
+ )
248
+ speeches[i + 1]["start"] = int(
249
+ max(0, speeches[i + 1]["start"] - speech_pad_samples)
250
+ )
251
+ else:
252
+ speech["end"] = int(
253
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
254
+ )
255
+ return speeches
256
+
257
+ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
258
+ """Collects and concatenates audio chunks."""
259
+ if not chunks:
260
+ return np.array([], dtype=np.float32)
261
+
262
+ return np.concatenate([audio[chunk["start"] : chunk["end"]] for chunk in chunks])
263
+
264
+
265
+ def run_vad(ori_audio, sr, vad_options=None):
266
+ _st = time.time()
267
+ try:
268
+ audio = np.frombuffer(ori_audio, dtype=np.int16)
269
+ audio = audio.astype(np.float32) / 32768.0
270
+ sampling_rate = 16000
271
+ if sr != sampling_rate:
272
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
273
+ # print('audio.encode.shape: {}'.format(audio.shape))
274
+ if vad_options is None:
275
+ vad_options = VadOptions()
276
+
277
+ # make sure a VadOptions instance is passed to get_speech_timestamps
278
+ speech_chunks = get_speech_timestamps(audio, vad_options=vad_options)
279
+ # print(speech_chunks)
280
+ audio = collect_chunks(audio, speech_chunks)
281
+ # print(audio.shape)
282
+ duration_after_vad = audio.shape[0] / sampling_rate
283
+
284
+ # print('audio.decode.shape: {}'.format(audio.shape))
285
+ if sr != sampling_rate:
286
+ # resample to original sampling rate
287
+ vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
288
+ else:
289
+ vad_audio = audio
290
+ vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
291
+
292
+ # rounding back to int16 introduces a small quantization error
293
+
294
+ vad_audio_bytes = vad_audio.tobytes()
295
+
296
+ return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
297
+ except Exception as e:
298
+ msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
299
+ print(msg)
300
+ return -1, ori_audio, round(time.time() - _st, 4)
301
+
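A short usage sketch for the VAD helpers above, assuming a 16 kHz mono WAV file and that this script sits next to vad_utils.py; the file name is a placeholder. run_vad expects raw int16 PCM bytes plus the sample rate and returns the voiced duration, the filtered PCM bytes, and the elapsed time.

```python
# Hypothetical driver for vad_utils; "example.wav" is a placeholder path.
import librosa
import numpy as np

from vad_utils import VadOptions, run_vad

audio, sr = librosa.load("example.wav", sr=16000, mono=True)   # float32 in [-1, 1]
pcm_bytes = (audio * 32768.0).clip(-32768, 32767).astype(np.int16).tobytes()

duration, voiced_bytes, elapsed = run_vad(pcm_bytes, sr, vad_options=VadOptions(threshold=0.5))
print(f"speech after VAD: {duration:.2f}s (processed in {elapsed}s)")
```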
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.env.development ADDED
File without changes
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.env.production ADDED
File without changes
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.eslintrc-auto-import.json ADDED
@@ -0,0 +1,359 @@
1
+ {
2
+ "globals": {
3
+ "Component": true,
4
+ "ComponentPublicInstance": true,
5
+ "ComputedRef": true,
6
+ "EffectScope": true,
7
+ "ExtractDefaultPropTypes": true,
8
+ "ExtractPropTypes": true,
9
+ "ExtractPublicPropTypes": true,
10
+ "InjectionKey": true,
11
+ "LegalTypeEnum": true,
12
+ "LoginTypeEnum": true,
13
+ "PropType": true,
14
+ "Ref": true,
15
+ "VNode": true,
16
+ "WritableComputedRef": true,
17
+ "acceptHMRUpdate": true,
18
+ "ajaxHeader": true,
19
+ "asyncComputed": true,
20
+ "authLogin": true,
21
+ "autoResetRef": true,
22
+ "computed": true,
23
+ "computedAsync": true,
24
+ "computedEager": true,
25
+ "computedInject": true,
26
+ "computedWithControl": true,
27
+ "controlledComputed": true,
28
+ "controlledRef": true,
29
+ "createApp": true,
30
+ "createEventHook": true,
31
+ "createGlobalState": true,
32
+ "createInjectionState": true,
33
+ "createPinia": true,
34
+ "createReactiveFn": true,
35
+ "createReusableTemplate": true,
36
+ "createSharedComposable": true,
37
+ "createTemplatePromise": true,
38
+ "createUnrefFn": true,
39
+ "customRef": true,
40
+ "debouncedRef": true,
41
+ "debouncedWatch": true,
42
+ "defineAsyncComponent": true,
43
+ "defineComponent": true,
44
+ "defineStore": true,
45
+ "eagerComputed": true,
46
+ "effectScope": true,
47
+ "extendRef": true,
48
+ "fetchSmsVerifyCode": true,
49
+ "getActivePinia": true,
50
+ "getCurrentInstance": true,
51
+ "getCurrentScope": true,
52
+ "getHomeInfo": true,
53
+ "h": true,
54
+ "ignorableWatch": true,
55
+ "inject": true,
56
+ "injectLocal": true,
57
+ "isDefined": true,
58
+ "isProxy": true,
59
+ "isReactive": true,
60
+ "isReadonly": true,
61
+ "isRef": true,
62
+ "loginSuccess": true,
63
+ "makeDestructurable": true,
64
+ "mapActions": true,
65
+ "mapGetters": true,
66
+ "mapState": true,
67
+ "mapStores": true,
68
+ "mapWritableState": true,
69
+ "markRaw": true,
70
+ "nextTick": true,
71
+ "onActivated": true,
72
+ "onBeforeMount": true,
73
+ "onBeforeRouteLeave": true,
74
+ "onBeforeRouteUpdate": true,
75
+ "onBeforeUnmount": true,
76
+ "onBeforeUpdate": true,
77
+ "onClickOutside": true,
78
+ "onDeactivated": true,
79
+ "onErrorCaptured": true,
80
+ "onKeyStroke": true,
81
+ "onLongPress": true,
82
+ "onMounted": true,
83
+ "onRenderTracked": true,
84
+ "onRenderTriggered": true,
85
+ "onScopeDispose": true,
86
+ "onServerPrefetch": true,
87
+ "onStartTyping": true,
88
+ "onUnmounted": true,
89
+ "onUpdated": true,
90
+ "pausableWatch": true,
91
+ "provide": true,
92
+ "provideLocal": true,
93
+ "reactify": true,
94
+ "reactifyObject": true,
95
+ "reactive": true,
96
+ "reactiveComputed": true,
97
+ "reactiveOmit": true,
98
+ "reactivePick": true,
99
+ "readonly": true,
100
+ "ref": true,
101
+ "refAutoReset": true,
102
+ "refDebounced": true,
103
+ "refDefault": true,
104
+ "refThrottled": true,
105
+ "refWithControl": true,
106
+ "resolveComponent": true,
107
+ "resolveRef": true,
108
+ "resolveUnref": true,
109
+ "setActivePinia": true,
110
+ "setMapStoreSuffix": true,
111
+ "setupStore": true,
112
+ "shallowReactive": true,
113
+ "shallowReadonly": true,
114
+ "shallowRef": true,
115
+ "store": true,
116
+ "storeToRefs": true,
117
+ "submitFeedback": true,
118
+ "syncRef": true,
119
+ "syncRefs": true,
120
+ "templateRef": true,
121
+ "throttledRef": true,
122
+ "throttledWatch": true,
123
+ "toRaw": true,
124
+ "toReactive": true,
125
+ "toRef": true,
126
+ "toRefs": true,
127
+ "toValue": true,
128
+ "triggerRef": true,
129
+ "tryOnBeforeMount": true,
130
+ "tryOnBeforeUnmount": true,
131
+ "tryOnMounted": true,
132
+ "tryOnScopeDispose": true,
133
+ "tryOnUnmounted": true,
134
+ "unref": true,
135
+ "unrefElement": true,
136
+ "until": true,
137
+ "useActiveElement": true,
138
+ "useAnimate": true,
139
+ "useArrayDifference": true,
140
+ "useArrayEvery": true,
141
+ "useArrayFilter": true,
142
+ "useArrayFind": true,
143
+ "useArrayFindIndex": true,
144
+ "useArrayFindLast": true,
145
+ "useArrayIncludes": true,
146
+ "useArrayJoin": true,
147
+ "useArrayMap": true,
148
+ "useArrayReduce": true,
149
+ "useArraySome": true,
150
+ "useArrayUnique": true,
151
+ "useAsyncQueue": true,
152
+ "useAsyncState": true,
153
+ "useAttrs": true,
154
+ "useBase64": true,
155
+ "useBattery": true,
156
+ "useBluetooth": true,
157
+ "useBreakpoints": true,
158
+ "useBroadcastChannel": true,
159
+ "useBrowserLocation": true,
160
+ "useCached": true,
161
+ "useClearLocalCache": true,
162
+ "useClipboard": true,
163
+ "useClipboardItems": true,
164
+ "useCloned": true,
165
+ "useColorMode": true,
166
+ "useConfirmDialog": true,
167
+ "useCounter": true,
168
+ "useCssModule": true,
169
+ "useCssVar": true,
170
+ "useCssVars": true,
171
+ "useCurrentElement": true,
172
+ "useCycleList": true,
173
+ "useDark": true,
174
+ "useDateFormat": true,
175
+ "useDebounce": true,
176
+ "useDebounceFn": true,
177
+ "useDebouncedRefHistory": true,
178
+ "useDeviceMotion": true,
179
+ "useDeviceOrientation": true,
180
+ "useDevicePixelRatio": true,
181
+ "useDevicesList": true,
182
+ "useDisplayMedia": true,
183
+ "useDocumentVisibility": true,
184
+ "useDraggable": true,
185
+ "useDropZone": true,
186
+ "useElementBounding": true,
187
+ "useElementByPoint": true,
188
+ "useElementHover": true,
189
+ "useElementSize": true,
190
+ "useElementVisibility": true,
191
+ "useEventBus": true,
192
+ "useEventListener": true,
193
+ "useEventSource": true,
194
+ "useEyeDropper": true,
195
+ "useFavicon": true,
196
+ "useFetch": true,
197
+ "useFetchLogin": true,
198
+ "useFileDialog": true,
199
+ "useFileSystemAccess": true,
200
+ "useFocus": true,
201
+ "useFocusWithin": true,
202
+ "useFps": true,
203
+ "useFullscreen": true,
204
+ "useGamepad": true,
205
+ "useGeolocation": true,
206
+ "useGetLocalCache": true,
207
+ "useHttp": true,
208
+ "useIdle": true,
209
+ "useImage": true,
210
+ "useInfiniteScroll": true,
211
+ "useIntersectionObserver": true,
212
+ "useInterval": true,
213
+ "useIntervalFn": true,
214
+ "useKeyModifier": true,
215
+ "useLastChanged": true,
216
+ "useLegal": true,
217
+ "useLink": true,
218
+ "useLocalStorage": true,
219
+ "useLogin": true,
220
+ "useMagicKeys": true,
221
+ "useManualRefHistory": true,
222
+ "useMediaControls": true,
223
+ "useMediaQuery": true,
224
+ "useMemoize": true,
225
+ "useMemory": true,
226
+ "useMounted": true,
227
+ "useMouse": true,
228
+ "useMouseInElement": true,
229
+ "useMousePressed": true,
230
+ "useMutationObserver": true,
231
+ "useNavigatorLanguage": true,
232
+ "useNetwork": true,
233
+ "useNow": true,
234
+ "useObjectUrl": true,
235
+ "useOffsetPagination": true,
236
+ "useOnline": true,
237
+ "usePageLeave": true,
238
+ "useParallax": true,
239
+ "useParentElement": true,
240
+ "usePerformanceObserver": true,
241
+ "usePermission": true,
242
+ "usePointer": true,
243
+ "usePointerLock": true,
244
+ "usePointerSwipe": true,
245
+ "usePreferredColorScheme": true,
246
+ "usePreferredContrast": true,
247
+ "usePreferredDark": true,
248
+ "usePreferredLanguages": true,
249
+ "usePreferredReducedMotion": true,
250
+ "usePrevious": true,
251
+ "useRafFn": true,
252
+ "useRefHistory": true,
253
+ "useResizeObserver": true,
254
+ "useRoute": true,
255
+ "useRouter": true,
256
+ "useScreenOrientation": true,
257
+ "useScreenSafeArea": true,
258
+ "useScriptTag": true,
259
+ "useScroll": true,
260
+ "useScrollLock": true,
261
+ "useSessionStorage": true,
262
+ "useSetLocalCache": true,
263
+ "useShare": true,
264
+ "useSlots": true,
265
+ "useSorted": true,
266
+ "useSpeechRecognition": true,
267
+ "useSpeechSynthesis": true,
268
+ "useStepper": true,
269
+ "useStorage": true,
270
+ "useStorageAsync": true,
271
+ "useStyleTag": true,
272
+ "useSupported": true,
273
+ "useSwipe": true,
274
+ "useTemplateRefsList": true,
275
+ "useTextDirection": true,
276
+ "useTextSelection": true,
277
+ "useTextareaAutosize": true,
278
+ "useThrottle": true,
279
+ "useThrottleFn": true,
280
+ "useThrottledRefHistory": true,
281
+ "useTimeAgo": true,
282
+ "useTimeout": true,
283
+ "useTimeoutFn": true,
284
+ "useTimeoutPoll": true,
285
+ "useTimestamp": true,
286
+ "useTitle": true,
287
+ "useToNumber": true,
288
+ "useToString": true,
289
+ "useToggle": true,
290
+ "useTransition": true,
291
+ "useUrlSearchParams": true,
292
+ "useUserMedia": true,
293
+ "useUserStore": true,
294
+ "useUserStoreWithOut": true,
295
+ "useVModel": true,
296
+ "useVModels": true,
297
+ "useVibrate": true,
298
+ "useVirtualList": true,
299
+ "useWakeLock": true,
300
+ "useWebNotification": true,
301
+ "useWebSocket": true,
302
+ "useWebWorker": true,
303
+ "useWebWorkerFn": true,
304
+ "useWindowFocus": true,
305
+ "useWindowScroll": true,
306
+ "useWindowSize": true,
307
+ "watch": true,
308
+ "watchArray": true,
309
+ "watchAtMost": true,
310
+ "watchDebounced": true,
311
+ "watchDeep": true,
312
+ "watchEffect": true,
313
+ "watchIgnorable": true,
314
+ "watchImmediate": true,
315
+ "watchOnce": true,
316
+ "watchPausable": true,
317
+ "watchPostEffect": true,
318
+ "watchSyncEffect": true,
319
+ "watchThrottled": true,
320
+ "watchTriggerable": true,
321
+ "watchWithFilter": true,
322
+ "whenever": true,
323
+ "ElMessage": true,
324
+ "ElLoading": true,
325
+ "deleteHistoryBatch": true,
326
+ "deleteHistoryItem": true,
327
+ "getHistory": true,
328
+ "createConv": true,
329
+ "fetchHistoryList": true,
330
+ "stopChat": true,
331
+ "useChatStore": true,
332
+ "useChatStoreWithOut": true,
333
+ "useChatExchangeStore": true,
334
+ "useChatExchangeStoreWithOut": true,
335
+ "useExchangeStore": true,
336
+ "useExchangeStoreWithOut": true,
337
+ "delMessage": true,
338
+ "sendRating": true,
339
+ "getInitialActions": true,
340
+ "sendFeedback": true,
341
+ "md": true,
342
+ "useMarkdown": true,
343
+ "connectService": true,
344
+ "sendMessage": true,
345
+ "Audio": true,
346
+ "SoundRecording": true,
347
+ "getVolume": true,
348
+ "ElMessageBox": true,
349
+ "encodeWav": true,
350
+ "encodeWAV": true,
351
+ "stopMessage": true,
352
+ "TaskQueue": true,
353
+ "getNewUserId": true,
354
+ "setNewUserId": true,
355
+ "uploadFile": true,
356
+ "feedback": true,
357
+ "uploadConfig": true
358
+ }
359
+ }
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/minicpm-o_2.6/web_server/.eslintrc.cjs ADDED
@@ -0,0 +1,26 @@
1
+ /* eslint-env node */
2
+ require('@rushstack/eslint-patch/modern-module-resolution');
3
+
4
+ module.exports = {
5
+ root: true,
6
+ extends: [
7
+ 'plugin:vue/vue3-essential',
8
+ 'eslint:recommended',
9
+ '@vue/eslint-config-prettier/skip-formatting',
10
+ './.eslintrc-auto-import.json',
11
+ ],
12
+ parserOptions: {
13
+ ecmaVersion: 'latest',
14
+ },
15
+ rules: {
16
+ 'no-console': process.env.NODE_ENV === 'production' ? 'off' : 'warn',
17
+ 'no-debugger': process.env.NODE_ENV === 'production' ? 'error' : 'warn',
18
+ 'no-var': process.env.NODE_ENV === 'production' ? 'off' : 'warn',
19
+ 'no-undef': process.env.NODE_ENV === 'production' ? 'error' : 'warn',
20
+ 'vue/multi-word-component-names': 'off', // do not enforce multi-word component names
21
+ 'no-empty': 0, // allow empty code blocks
22
+ 'vue/no-unused-components': 'warn',
23
+ 'no-unused-vars': 'warn',
24
+ 'prettier/prettier': 'off', // do not let eslint report errors for code that violates prettier formatting
25
+ },
26
+ };
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo.py ADDED
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ import gradio as gr
4
+ from PIL import Image
5
+ import traceback
6
+ import re
7
+ import torch
8
+ import argparse
9
+ from transformers import AutoModel, AutoTokenizer
10
+
11
+ # README, How to run demo on different devices
12
+ # For Nvidia GPUs that support BF16 (like A100, H100, RTX3090)
13
+ # python web_demo.py --device cuda --dtype bf16
14
+
15
+ # For Nvidia GPUs that do NOT support BF16 (like V100, T4, RTX2080)
16
+ # python web_demo.py --device cuda --dtype fp16
17
+
18
+ # For Mac with MPS (Apple silicon or AMD GPUs).
19
+ # PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo.py --device mps --dtype fp16
20
+
21
+ # Argparser
22
+ parser = argparse.ArgumentParser(description='demo')
23
+ parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
24
+ parser.add_argument('--dtype', type=str, default='bf16', help='bf16 or fp16')
25
+ args = parser.parse_args()
26
+ device = args.device
27
+ assert device in ['cuda', 'mps']
28
+ if args.dtype == 'bf16':
29
+ if device == 'mps':
30
+ print('Warning: MPS does not support bf16, will use fp16 instead')
31
+ dtype = torch.float16
32
+ else:
33
+ dtype = torch.bfloat16
34
+ else:
35
+ dtype = torch.float16
36
+
37
+ # Load model
38
+ model_path = 'openbmb/MiniCPM-V-2'
39
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.bfloat16)
40
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
41
+
42
+ model = model.to(device=device, dtype=dtype)
43
+ model.eval()
44
+
45
+
46
+
47
+ ERROR_MSG = "Error, please retry"
48
+ model_name = 'MiniCPM-V 2.0'
49
+
50
+ form_radio = {
51
+ 'choices': ['Beam Search', 'Sampling'],
52
+ #'value': 'Beam Search',
53
+ 'value': 'Sampling',
54
+ 'interactive': True,
55
+ 'label': 'Decode Type'
56
+ }
57
+ # Beam Form
58
+ num_beams_slider = {
59
+ 'minimum': 0,
60
+ 'maximum': 5,
61
+ 'value': 3,
62
+ 'step': 1,
63
+ 'interactive': True,
64
+ 'label': 'Num Beams'
65
+ }
66
+ repetition_penalty_slider = {
67
+ 'minimum': 0,
68
+ 'maximum': 3,
69
+ 'value': 1.2,
70
+ 'step': 0.01,
71
+ 'interactive': True,
72
+ 'label': 'Repetition Penalty'
73
+ }
74
+ repetition_penalty_slider2 = {
75
+ 'minimum': 0,
76
+ 'maximum': 3,
77
+ 'value': 1.05,
78
+ 'step': 0.01,
79
+ 'interactive': True,
80
+ 'label': 'Repetition Penalty'
81
+ }
82
+ max_new_tokens_slider = {
83
+ 'minimum': 1,
84
+ 'maximum': 4096,
85
+ 'value': 1024,
86
+ 'step': 1,
87
+ 'interactive': True,
88
+ 'label': 'Max New Tokens'
89
+ }
90
+
91
+ top_p_slider = {
92
+ 'minimum': 0,
93
+ 'maximum': 1,
94
+ 'value': 0.8,
95
+ 'step': 0.05,
96
+ 'interactive': True,
97
+ 'label': 'Top P'
98
+ }
99
+ top_k_slider = {
100
+ 'minimum': 0,
101
+ 'maximum': 200,
102
+ 'value': 100,
103
+ 'step': 1,
104
+ 'interactive': True,
105
+ 'label': 'Top K'
106
+ }
107
+ temperature_slider = {
108
+ 'minimum': 0,
109
+ 'maximum': 2,
110
+ 'value': 0.7,
111
+ 'step': 0.05,
112
+ 'interactive': True,
113
+ 'label': 'Temperature'
114
+ }
115
+
116
+
117
+ def create_component(params, comp='Slider'):
118
+ if comp == 'Slider':
119
+ return gr.Slider(
120
+ minimum=params['minimum'],
121
+ maximum=params['maximum'],
122
+ value=params['value'],
123
+ step=params['step'],
124
+ interactive=params['interactive'],
125
+ label=params['label']
126
+ )
127
+ elif comp == 'Radio':
128
+ return gr.Radio(
129
+ choices=params['choices'],
130
+ value=params['value'],
131
+ interactive=params['interactive'],
132
+ label=params['label']
133
+ )
134
+ elif comp == 'Button':
135
+ return gr.Button(
136
+ value=params['value'],
137
+ interactive=True
138
+ )
139
+
140
+
141
+ def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
142
+ default_params = {"num_beams":3, "repetition_penalty": 1.2, "max_new_tokens": 1024}
143
+ if params is None:
144
+ params = default_params
145
+ if img is None:
146
+ return -1, "Error, invalid image, please upload a new image", None, None
147
+ try:
148
+ image = img.convert('RGB')
149
+ answer, context, _ = model.chat(
150
+ image=image,
151
+ msgs=msgs,
152
+ context=None,
153
+ tokenizer=tokenizer,
154
+ **params
155
+ )
156
+ res = re.sub(r'(<box>.*</box>)', '', answer)
157
+ res = res.replace('<ref>', '')
158
+ res = res.replace('</ref>', '')
159
+ res = res.replace('<box>', '')
160
+ answer = res.replace('</box>', '')
161
+ return 0, answer, None, None
162
+ except Exception as err:
163
+ print(err)
164
+ traceback.print_exc()
165
+ return -1, ERROR_MSG, None, None
166
+
167
+
168
+ def upload_img(image, _chatbot, _app_session):
169
+ image = Image.fromarray(image)
170
+
171
+ _app_session['sts']=None
172
+ _app_session['ctx']=[]
173
+ _app_session['img']=image
174
+ _chatbot.append(('', 'Image uploaded successfully, you can talk to me now'))
175
+ return _chatbot, _app_session
176
+
177
+
178
+ def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
179
+ if _app_cfg.get('ctx', None) is None:
180
+ _chat_bot.append((_question, 'Please upload an image to start'))
181
+ return '', _chat_bot, _app_cfg
182
+
183
+ _context = _app_cfg['ctx'].copy()
184
+ if _context:
185
+ _context.append({"role": "user", "content": _question})
186
+ else:
187
+ _context = [{"role": "user", "content": _question}]
188
+ print('<User>:', _question)
189
+
190
+ if params_form == 'Beam Search':
191
+ params = {
192
+ 'sampling': False,
193
+ 'num_beams': num_beams,
194
+ 'repetition_penalty': repetition_penalty,
195
+ "max_new_tokens": 896
196
+ }
197
+ else:
198
+ params = {
199
+ 'sampling': True,
200
+ 'top_p': top_p,
201
+ 'top_k': top_k,
202
+ 'temperature': temperature,
203
+ 'repetition_penalty': repetition_penalty_2,
204
+ "max_new_tokens": 896
205
+ }
206
+ code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params)
207
+ print('<Assistant>:', _answer)
208
+
209
+ _context.append({"role": "assistant", "content": _answer})
210
+ _chat_bot.append((_question, _answer))
211
+ if code == 0:
212
+ _app_cfg['ctx']=_context
213
+ _app_cfg['sts']=sts
214
+ return '', _chat_bot, _app_cfg
215
+
216
+
217
+ def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
218
+ if len(_chat_bot) <= 1:
219
+ _chat_bot.append(('Regenerate', 'No question for regeneration.'))
220
+ return '', _chat_bot, _app_cfg
221
+ elif _chat_bot[-1][0] == 'Regenerate':
222
+ return '', _chat_bot, _app_cfg
223
+ else:
224
+ _question = _chat_bot[-1][0]
225
+ _chat_bot = _chat_bot[:-1]
226
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
227
+ return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature)
228
+
229
+
230
+
231
+ with gr.Blocks() as demo:
232
+ with gr.Row():
233
+ with gr.Column(scale=1, min_width=300):
234
+ params_form = create_component(form_radio, comp='Radio')
235
+ with gr.Accordion("Beam Search") as beams_according:
236
+ num_beams = create_component(num_beams_slider)
237
+ repetition_penalty = create_component(repetition_penalty_slider)
238
+ with gr.Accordion("Sampling") as sampling_according:
239
+ top_p = create_component(top_p_slider)
240
+ top_k = create_component(top_k_slider)
241
+ temperature = create_component(temperature_slider)
242
+ repetition_penalty_2 = create_component(repetition_penalty_slider2)
243
+ regenerate = create_component({'value': 'Regenerate'}, comp='Button')
244
+ with gr.Column(scale=3, min_width=500):
245
+ app_session = gr.State({'sts':None,'ctx':None,'img':None})
246
+ bt_pic = gr.Image(label="Upload an image to start")
247
+ chat_bot = gr.Chatbot(label=f"Chat with {model_name}")
248
+ txt_message = gr.Textbox(label="Input text")
249
+
250
+ regenerate.click(
251
+ regenerate_button_clicked,
252
+ [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature],
253
+ [txt_message, chat_bot, app_session]
254
+ )
255
+ txt_message.submit(
256
+ respond,
257
+ [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature],
258
+ [txt_message, chat_bot, app_session]
259
+ )
260
+ bt_pic.upload(lambda: None, None, chat_bot, queue=False).then(upload_img, inputs=[bt_pic,chat_bot,app_session], outputs=[chat_bot,app_session])
261
+
262
+ # launch
263
+ demo.launch(share=False, debug=True, show_api=False, server_port=8080, server_name="0.0.0.0")
264
+
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_2.5.py ADDED
@@ -0,0 +1,256 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ import gradio as gr
4
+ from PIL import Image
5
+ import traceback
6
+ import re
7
+ import torch
8
+ import argparse
9
+ from transformers import AutoModel, AutoTokenizer
10
+
11
+ # README, How to run demo on different devices
12
+
13
+ # For Nvidia GPUs.
14
+ # python web_demo_2.5.py --device cuda
15
+
16
+ # For Mac with MPS (Apple silicon or AMD GPUs).
17
+ # PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.5.py --device mps
18
+
19
+ # Argparser
20
+ parser = argparse.ArgumentParser(description='demo')
21
+ parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
22
+ args = parser.parse_args()
23
+ device = args.device
24
+ assert device in ['cuda', 'mps']
25
+
26
+ # Load model
27
+ model_path = 'openbmb/MiniCPM-Llama3-V-2_5'
28
+ if 'int4' in model_path:
29
+ if device == 'mps':
30
+ print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
31
+ exit()
32
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
33
+ else:
34
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16, device_map=device)
35
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
36
+ model.eval()
37
+
38
+
39
+
40
+ ERROR_MSG = "Error, please retry"
41
+ model_name = 'MiniCPM-V 2.5'
42
+
43
+ form_radio = {
44
+ 'choices': ['Beam Search', 'Sampling'],
45
+ #'value': 'Beam Search',
46
+ 'value': 'Sampling',
47
+ 'interactive': True,
48
+ 'label': 'Decode Type'
49
+ }
50
+ # Beam Form
51
+ num_beams_slider = {
52
+ 'minimum': 0,
53
+ 'maximum': 5,
54
+ 'value': 3,
55
+ 'step': 1,
56
+ 'interactive': True,
57
+ 'label': 'Num Beams'
58
+ }
59
+ repetition_penalty_slider = {
60
+ 'minimum': 0,
61
+ 'maximum': 3,
62
+ 'value': 1.2,
63
+ 'step': 0.01,
64
+ 'interactive': True,
65
+ 'label': 'Repetition Penalty'
66
+ }
67
+ repetition_penalty_slider2 = {
68
+ 'minimum': 0,
69
+ 'maximum': 3,
70
+ 'value': 1.05,
71
+ 'step': 0.01,
72
+ 'interactive': True,
73
+ 'label': 'Repetition Penalty'
74
+ }
75
+ max_new_tokens_slider = {
76
+ 'minimum': 1,
77
+ 'maximum': 4096,
78
+ 'value': 1024,
79
+ 'step': 1,
80
+ 'interactive': True,
81
+ 'label': 'Max New Tokens'
82
+ }
83
+
84
+ top_p_slider = {
85
+ 'minimum': 0,
86
+ 'maximum': 1,
87
+ 'value': 0.8,
88
+ 'step': 0.05,
89
+ 'interactive': True,
90
+ 'label': 'Top P'
91
+ }
92
+ top_k_slider = {
93
+ 'minimum': 0,
94
+ 'maximum': 200,
95
+ 'value': 100,
96
+ 'step': 1,
97
+ 'interactive': True,
98
+ 'label': 'Top K'
99
+ }
100
+ temperature_slider = {
101
+ 'minimum': 0,
102
+ 'maximum': 2,
103
+ 'value': 0.7,
104
+ 'step': 0.05,
105
+ 'interactive': True,
106
+ 'label': 'Temperature'
107
+ }
108
+
109
+
110
+ def create_component(params, comp='Slider'):
111
+ if comp == 'Slider':
112
+ return gr.Slider(
113
+ minimum=params['minimum'],
114
+ maximum=params['maximum'],
115
+ value=params['value'],
116
+ step=params['step'],
117
+ interactive=params['interactive'],
118
+ label=params['label']
119
+ )
120
+ elif comp == 'Radio':
121
+ return gr.Radio(
122
+ choices=params['choices'],
123
+ value=params['value'],
124
+ interactive=params['interactive'],
125
+ label=params['label']
126
+ )
127
+ elif comp == 'Button':
128
+ return gr.Button(
129
+ value=params['value'],
130
+ interactive=True
131
+ )
132
+
133
+
134
+ def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
135
+ default_params = {"num_beams":3, "repetition_penalty": 1.2, "max_new_tokens": 1024}
136
+ if params is None:
137
+ params = default_params
138
+ if img is None:
139
+ return -1, "Error, invalid image, please upload a new image", None, None
140
+ try:
141
+ image = img.convert('RGB')
142
+ answer = model.chat(
143
+ image=image,
144
+ msgs=msgs,
145
+ tokenizer=tokenizer,
146
+ **params
147
+ )
148
+ res = re.sub(r'(<box>.*</box>)', '', answer)
149
+ res = res.replace('<ref>', '')
150
+ res = res.replace('</ref>', '')
151
+ res = res.replace('<box>', '')
152
+ answer = res.replace('</box>', '')
153
+ return 0, answer, None, None
154
+ except Exception as err:
155
+ print(err)
156
+ traceback.print_exc()
157
+ return -1, ERROR_MSG, None, None
158
+
159
+
160
+ def upload_img(image, _chatbot, _app_session):
161
+ image = Image.fromarray(image)
162
+
163
+ _app_session['sts']=None
164
+ _app_session['ctx']=[]
165
+ _app_session['img']=image
166
+ _chatbot.append(('', 'Image uploaded successfully, you can talk to me now'))
167
+ return _chatbot, _app_session
168
+
169
+
170
+ def respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
171
+ if _app_cfg.get('ctx', None) is None:
172
+ _chat_bot.append((_question, 'Please upload an image to start'))
173
+ return '', _chat_bot, _app_cfg
174
+
175
+ _context = _app_cfg['ctx'].copy()
176
+ if _context:
177
+ _context.append({"role": "user", "content": _question})
178
+ else:
179
+ _context = [{"role": "user", "content": _question}]
180
+ print('<User>:', _question)
181
+
182
+ if params_form == 'Beam Search':
183
+ params = {
184
+ 'sampling': False,
185
+ 'num_beams': num_beams,
186
+ 'repetition_penalty': repetition_penalty,
187
+ "max_new_tokens": 896
188
+ }
189
+ else:
190
+ params = {
191
+ 'sampling': True,
192
+ 'top_p': top_p,
193
+ 'top_k': top_k,
194
+ 'temperature': temperature,
195
+ 'repetition_penalty': repetition_penalty_2,
196
+ "max_new_tokens": 896
197
+ }
198
+ code, _answer, _, sts = chat(_app_cfg['img'], _context, None, params)
199
+ print('<Assistant>:', _answer)
200
+
201
+ _context.append({"role": "assistant", "content": _answer})
202
+ _chat_bot.append((_question, _answer))
203
+ if code == 0:
204
+ _app_cfg['ctx']=_context
205
+ _app_cfg['sts']=sts
206
+ return '', _chat_bot, _app_cfg
207
+
208
+
209
+ def regenerate_button_clicked(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature):
210
+ if len(_chat_bot) <= 1:
211
+ _chat_bot.append(('Regenerate', 'No question for regeneration.'))
212
+ return '', _chat_bot, _app_cfg
213
+ elif _chat_bot[-1][0] == 'Regenerate':
214
+ return '', _chat_bot, _app_cfg
215
+ else:
216
+ _question = _chat_bot[-1][0]
217
+ _chat_bot = _chat_bot[:-1]
218
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
219
+ return respond(_question, _chat_bot, _app_cfg, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature)
220
+
221
+
222
+
223
+ with gr.Blocks() as demo:
224
+ with gr.Row():
225
+ with gr.Column(scale=1, min_width=300):
226
+ params_form = create_component(form_radio, comp='Radio')
227
+ with gr.Accordion("Beam Search") as beams_according:
228
+ num_beams = create_component(num_beams_slider)
229
+ repetition_penalty = create_component(repetition_penalty_slider)
230
+ with gr.Accordion("Sampling") as sampling_according:
231
+ top_p = create_component(top_p_slider)
232
+ top_k = create_component(top_k_slider)
233
+ temperature = create_component(temperature_slider)
234
+ repetition_penalty_2 = create_component(repetition_penalty_slider2)
235
+ regenerate = create_component({'value': 'Regenerate'}, comp='Button')
236
+ with gr.Column(scale=3, min_width=500):
237
+ app_session = gr.State({'sts':None,'ctx':None,'img':None})
238
+ bt_pic = gr.Image(label="Upload an image to start")
239
+ chat_bot = gr.Chatbot(label=f"Chat with {model_name}")
240
+ txt_message = gr.Textbox(label="Input text")
241
+
242
+ regenerate.click(
243
+ regenerate_button_clicked,
244
+ [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature],
245
+ [txt_message, chat_bot, app_session]
246
+ )
247
+ txt_message.submit(
248
+ respond,
249
+ [txt_message, chat_bot, app_session, params_form, num_beams, repetition_penalty, repetition_penalty_2, top_p, top_k, temperature],
250
+ [txt_message, chat_bot, app_session]
251
+ )
252
+ bt_pic.upload(lambda: None, None, chat_bot, queue=False).then(upload_img, inputs=[bt_pic,chat_bot,app_session], outputs=[chat_bot,app_session])
253
+
254
+ # launch
255
+ demo.launch(share=False, debug=True, show_api=False, server_port=8080, server_name="0.0.0.0")
256
+
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_2.6.py ADDED
@@ -0,0 +1,557 @@
1
+ #!/usr/bin/env python
2
+ # encoding: utf-8
3
+ import torch
4
+ import argparse
5
+ from transformers import AutoModel, AutoTokenizer
6
+ import gradio as gr
7
+ from PIL import Image
8
+ from decord import VideoReader, cpu
9
+ import io
10
+ import os
11
+ import copy
12
+ import requests
13
+ import base64
14
+ import json
15
+ import traceback
16
+ import re
17
+ import modelscope_studio as mgr
18
+
19
+
20
+ # README, How to run demo on different devices
21
+
22
+ # For Nvidia GPUs.
23
+ # python web_demo_2.6.py --device cuda
24
+
25
+ # For Mac with MPS (Apple silicon or AMD GPUs).
26
+ # PYTORCH_ENABLE_MPS_FALLBACK=1 python web_demo_2.6.py --device mps
27
+
28
+ # Argparser
29
+ parser = argparse.ArgumentParser(description='demo')
30
+ parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
31
+ parser.add_argument('--multi-gpus', action='store_true', default=False, help='use multi-gpus')
32
+ args = parser.parse_args()
33
+ device = args.device
34
+ assert device in ['cuda', 'mps']
35
+
36
+ # Load model
37
+ model_path = 'openbmb/MiniCPM-V-2_6'
38
+ if 'int4' in model_path:
39
+ if device == 'mps':
40
+ print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
41
+ exit()
42
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
43
+ else:
44
+ if args.multi_gpus:
45
+ from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
46
+ with init_empty_weights():
47
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
48
+ device_map = infer_auto_device_map(model, max_memory={0: "10GB", 1: "10GB"},
49
+ no_split_module_classes=['SiglipVisionTransformer', 'Qwen2DecoderLayer'])
50
+ device_id = device_map["llm.model.embed_tokens"]
51
+ device_map["llm.lm_head"] = device_id # firtt and last layer should be in same device
52
+ device_map["vpm"] = device_id
53
+ device_map["resampler"] = device_id
54
+ device_id2 = device_map["llm.model.layers.26"]
55
+ device_map["llm.model.layers.8"] = device_id2
56
+ device_map["llm.model.layers.9"] = device_id2
57
+ device_map["llm.model.layers.10"] = device_id2
58
+ device_map["llm.model.layers.11"] = device_id2
59
+ device_map["llm.model.layers.12"] = device_id2
60
+ device_map["llm.model.layers.13"] = device_id2
61
+ device_map["llm.model.layers.14"] = device_id2
62
+ device_map["llm.model.layers.15"] = device_id2
63
+ device_map["llm.model.layers.16"] = device_id2
64
+ #print(device_map)
65
+
66
+ model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
67
+ else:
68
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
69
+ model = model.to(device=device)
70
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
71
+ model.eval()
72
+
73
+
74
+
75
+
76
+ ERROR_MSG = "Error, please retry"
77
+ model_name = 'MiniCPM-V 2.6'
78
+ MAX_NUM_FRAMES = 64
79
+ IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
80
+ VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
81
+
82
+ def get_file_extension(filename):
83
+ return os.path.splitext(filename)[1].lower()
84
+
85
+ def is_image(filename):
86
+ return get_file_extension(filename) in IMAGE_EXTENSIONS
87
+
88
+ def is_video(filename):
89
+ return get_file_extension(filename) in VIDEO_EXTENSIONS
90
+
91
+
92
+ form_radio = {
93
+ 'choices': ['Beam Search', 'Sampling'],
94
+ #'value': 'Beam Search',
95
+ 'value': 'Sampling',
96
+ 'interactive': True,
97
+ 'label': 'Decode Type'
98
+ }
99
+
100
+
101
+ def create_component(params, comp='Slider'):
102
+ if comp == 'Slider':
103
+ return gr.Slider(
104
+ minimum=params['minimum'],
105
+ maximum=params['maximum'],
106
+ value=params['value'],
107
+ step=params['step'],
108
+ interactive=params['interactive'],
109
+ label=params['label']
110
+ )
111
+ elif comp == 'Radio':
112
+ return gr.Radio(
113
+ choices=params['choices'],
114
+ value=params['value'],
115
+ interactive=params['interactive'],
116
+ label=params['label']
117
+ )
118
+ elif comp == 'Button':
119
+ return gr.Button(
120
+ value=params['value'],
121
+ interactive=True
122
+ )
123
+
124
+
125
+ def create_multimodal_input(upload_image_disabled=False, upload_video_disabled=False):
126
+ return mgr.MultimodalInput(upload_image_button_props={'label': 'Upload Image', 'disabled': upload_image_disabled, 'file_count': 'multiple'},
127
+ upload_video_button_props={'label': 'Upload Video', 'disabled': upload_video_disabled, 'file_count': 'single'},
128
+ submit_button_props={'label': 'Submit'})
129
+
130
+
131
+ def chat(img, msgs, ctx, params=None, vision_hidden_states=None):
132
+ try:
133
+ print('msgs:', msgs)
134
+ answer = model.chat(
135
+ image=None,
136
+ msgs=msgs,
137
+ tokenizer=tokenizer,
138
+ **params
139
+ )
140
+ res = re.sub(r'(<box>.*</box>)', '', answer)
141
+ res = res.replace('<ref>', '')
142
+ res = res.replace('</ref>', '')
143
+ res = res.replace('<box>', '')
144
+ answer = res.replace('</box>', '')
145
+ print('answer:', answer)
146
+ return 0, answer, None, None
147
+ except Exception as e:
148
+ print(e)
149
+ traceback.print_exc()
150
+ return -1, ERROR_MSG, None, None
151
+
152
+
153
+ def encode_image(image):
154
+ if not isinstance(image, Image.Image):
155
+ if hasattr(image, 'path'):
156
+ image = Image.open(image.path).convert("RGB")
157
+ else:
158
+ image = Image.open(image.file.path).convert("RGB")
159
+ # resize to max_size
160
+ max_size = 448*16
161
+ if max(image.size) > max_size:
162
+ w,h = image.size
163
+ if w > h:
164
+ new_w = max_size
165
+ new_h = int(h * max_size / w)
166
+ else:
167
+ new_h = max_size
168
+ new_w = int(w * max_size / h)
169
+ image = image.resize((new_w, new_h), resample=Image.BICUBIC)
170
+ return image
171
+ ## save by BytesIO and convert to base64
172
+ #buffered = io.BytesIO()
173
+ #image.save(buffered, format="png")
174
+ #im_b64 = base64.b64encode(buffered.getvalue()).decode()
175
+ #return {"type": "image", "pairs": im_b64}
176
+
177
+
178
+ def encode_video(video):
179
+ def uniform_sample(l, n):
180
+ gap = len(l) / n
181
+ idxs = [int(i * gap + gap / 2) for i in range(n)]
182
+ return [l[i] for i in idxs]
183
+
184
+ if hasattr(video, 'path'):
185
+ vr = VideoReader(video.path, ctx=cpu(0))
186
+ else:
187
+ vr = VideoReader(video.file.path, ctx=cpu(0))
188
+ sample_fps = round(vr.get_avg_fps() / 1) # frame stride for roughly 1 fps sampling
189
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
190
+ if len(frame_idx)>MAX_NUM_FRAMES:
191
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
192
+ video = vr.get_batch(frame_idx).asnumpy()
193
+ video = [Image.fromarray(v.astype('uint8')) for v in video]
194
+ video = [encode_image(v) for v in video]
195
+ print('video frames:', len(video))
196
+ return video
197
+
198
+
199
+ def check_mm_type(mm_file):
200
+ if hasattr(mm_file, 'path'):
201
+ path = mm_file.path
202
+ else:
203
+ path = mm_file.file.path
204
+ if is_image(path):
205
+ return "image"
206
+ if is_video(path):
207
+ return "video"
208
+ return None
209
+
210
+
211
+ def encode_mm_file(mm_file):
212
+ if check_mm_type(mm_file) == 'image':
213
+ return [encode_image(mm_file)]
214
+ if check_mm_type(mm_file) == 'video':
215
+ return encode_video(mm_file)
216
+ return None
217
+
218
+ def make_text(text):
219
+ #return {"type": "text", "pairs": text} # # For remote call
220
+ return text
221
+
222
+ def encode_message(_question):
223
+ files = _question.files
224
+ question = _question.text
225
+ pattern = r"\[mm_media\]\d+\[/mm_media\]"
226
+ matches = re.split(pattern, question)
227
+ message = []
228
+ if len(matches) != len(files) + 1:
229
+ gr.Warning("Number of Images not match the placeholder in text, please refresh the page to restart!")
230
+ assert len(matches) == len(files) + 1
231
+
232
+ text = matches[0].strip()
233
+ if text:
234
+ message.append(make_text(text))
235
+ for i in range(len(files)):
236
+ message += encode_mm_file(files[i])
237
+ text = matches[i + 1].strip()
238
+ if text:
239
+ message.append(make_text(text))
240
+ return message
241
+
242
+
243
+ def check_has_videos(_question):
244
+ images_cnt = 0
245
+ videos_cnt = 0
246
+ for file in _question.files:
247
+ if check_mm_type(file) == "image":
248
+ images_cnt += 1
249
+ else:
250
+ videos_cnt += 1
251
+ return images_cnt, videos_cnt
252
+
253
+
254
+ def count_video_frames(_context):
255
+ num_frames = 0
256
+ for message in _context:
257
+ for item in message["content"]:
258
+ #if item["type"] == "image": # For remote call
259
+ if isinstance(item, Image.Image):
260
+ num_frames += 1
261
+ return num_frames
262
+
263
+
264
+ def respond(_question, _chat_bot, _app_cfg, params_form):
265
+ _context = _app_cfg['ctx'].copy()
266
+ _context.append({'role': 'user', 'content': encode_message(_question)})
267
+
268
+ images_cnt = _app_cfg['images_cnt']
269
+ videos_cnt = _app_cfg['videos_cnt']
270
+ files_cnts = check_has_videos(_question)
271
+ if files_cnts[1] + videos_cnt > 1 or (files_cnts[1] + videos_cnt == 1 and files_cnts[0] + images_cnt > 0):
272
+ gr.Warning("Only supports single video file input right now!")
273
+ return _question, _chat_bot, _app_cfg
274
+
275
+ if params_form == 'Beam Search':
276
+ params = {
277
+ 'sampling': False,
278
+ 'num_beams': 3,
279
+ 'repetition_penalty': 1.2,
280
+ "max_new_tokens": 2048
281
+ }
282
+ else:
283
+ params = {
284
+ 'sampling': True,
285
+ 'top_p': 0.8,
286
+ 'top_k': 100,
287
+ 'temperature': 0.7,
288
+ 'repetition_penalty': 1.05,
289
+ "max_new_tokens": 2048
290
+ }
291
+
292
+ if files_cnts[1] + videos_cnt > 0:
293
+ params["max_inp_length"] = 4352 # 4096+256
294
+ params["use_image_id"] = False
295
+ params["max_slice_nums"] = 1 if count_video_frames(_context) > 16 else 2
296
+
297
+ code, _answer, _, sts = chat("", _context, None, params)
298
+
299
+ images_cnt += files_cnts[0]
300
+ videos_cnt += files_cnts[1]
301
+ _context.append({"role": "assistant", "content": [make_text(_answer)]})
302
+ _chat_bot.append((_question, _answer))
303
+ if code == 0:
304
+ _app_cfg['ctx']=_context
305
+ _app_cfg['sts']=sts
306
+ _app_cfg['images_cnt'] = images_cnt
307
+ _app_cfg['videos_cnt'] = videos_cnt
308
+
309
+ upload_image_disabled = videos_cnt > 0
310
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
311
+ return create_multimodal_input(upload_image_disabled, upload_video_disabled), _chat_bot, _app_cfg
312
+
313
+
314
+ def fewshot_add_demonstration(_image, _user_message, _assistant_message, _chat_bot, _app_cfg):
315
+ ctx = _app_cfg["ctx"]
316
+ message_item = []
317
+ if _image is not None:
318
+ image = Image.open(_image).convert("RGB")
319
+ ctx.append({"role": "user", "content": [encode_image(image), make_text(_user_message)]})
320
+ message_item.append({"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]})
321
+ else:
322
+ if _user_message:
323
+ ctx.append({"role": "user", "content": [make_text(_user_message)]})
324
+ message_item.append({"text": _user_message, "files": []})
325
+ else:
326
+ message_item.append(None)
327
+ if _assistant_message:
328
+ ctx.append({"role": "assistant", "content": [make_text(_assistant_message)]})
329
+ message_item.append({"text": _assistant_message, "files": []})
330
+ else:
331
+ message_item.append(None)
332
+
333
+ _chat_bot.append(message_item)
334
+ return None, "", "", _chat_bot, _app_cfg
335
+
336
+
337
+ def fewshot_respond(_image, _user_message, _chat_bot, _app_cfg, params_form):
338
+ user_message_contents = []
339
+ _context = _app_cfg["ctx"].copy()
340
+ if _image:
341
+ image = Image.open(_image).convert("RGB")
342
+ user_message_contents += [encode_image(image)]
343
+ if _user_message:
344
+ user_message_contents += [make_text(_user_message)]
345
+ if user_message_contents:
346
+ _context.append({"role": "user", "content": user_message_contents})
347
+
348
+ if params_form == 'Beam Search':
349
+ params = {
350
+ 'sampling': False,
351
+ 'num_beams': 3,
352
+ 'repetition_penalty': 1.2,
353
+ "max_new_tokens": 2048
354
+ }
355
+ else:
356
+ params = {
357
+ 'sampling': True,
358
+ 'top_p': 0.8,
359
+ 'top_k': 100,
360
+ 'temperature': 0.7,
361
+ 'repetition_penalty': 1.05,
362
+ "max_new_tokens": 2048
363
+ }
364
+
365
+ code, _answer, _, sts = chat("", _context, None, params)
366
+
367
+ _context.append({"role": "assistant", "content": [make_text(_answer)]})
368
+
369
+ if _image:
370
+ _chat_bot.append([
371
+ {"text": "[mm_media]1[/mm_media]" + _user_message, "files": [_image]},
372
+ {"text": _answer, "files": []}
373
+ ])
374
+ else:
375
+ _chat_bot.append([
376
+ {"text": _user_message, "files": [_image]},
377
+ {"text": _answer, "files": []}
378
+ ])
379
+ if code == 0:
380
+ _app_cfg['ctx']=_context
381
+ _app_cfg['sts']=sts
382
+ return None, '', '', _chat_bot, _app_cfg
383
+
384
+
385
+ def regenerate_button_clicked(_question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg, params_form):
386
+ if len(_chat_bot) <= 1 or not _chat_bot[-1][1]:
387
+ gr.Warning('No question for regeneration.')
388
+ return '', _image, _user_message, _assistant_message, _chat_bot, _app_cfg
389
+ if _app_cfg["chat_type"] == "Chat":
390
+ images_cnt = _app_cfg['images_cnt']
391
+ videos_cnt = _app_cfg['videos_cnt']
392
+ _question = _chat_bot[-1][0]
393
+ _chat_bot = _chat_bot[:-1]
394
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
395
+ files_cnts = check_has_videos(_question)
396
+ images_cnt -= files_cnts[0]
397
+ videos_cnt -= files_cnts[1]
398
+ _app_cfg['images_cnt'] = images_cnt
399
+ _app_cfg['videos_cnt'] = videos_cnt
400
+ upload_image_disabled = videos_cnt > 0
401
+ upload_video_disabled = videos_cnt > 0 or images_cnt > 0
402
+ _question, _chat_bot, _app_cfg = respond(_question, _chat_bot, _app_cfg, params_form)
403
+ return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
404
+ else:
405
+ last_message = _chat_bot[-1][0]
406
+ last_image = None
407
+ last_user_message = ''
408
+ if last_message.text:
409
+ last_user_message = last_message.text
410
+ if last_message.files:
411
+ last_image = last_message.files[0].file.path
412
+ _chat_bot = _chat_bot[:-1]
413
+ _app_cfg['ctx'] = _app_cfg['ctx'][:-2]
414
+ _image, _user_message, _assistant_message, _chat_bot, _app_cfg = fewshot_respond(last_image, last_user_message, _chat_bot, _app_cfg, params_form)
415
+ return _question, _image, _user_message, _assistant_message, _chat_bot, _app_cfg
416
+
417
+
418
+ def flushed():
419
+ return gr.update(interactive=True)
420
+
421
+
422
+ def clear(txt_message, chat_bot, app_session):
423
+ txt_message.files.clear()
424
+ txt_message.text = ''
425
+ chat_bot = copy.deepcopy(init_conversation)
426
+ app_session['sts'] = None
427
+ app_session['ctx'] = []
428
+ app_session['images_cnt'] = 0
429
+ app_session['videos_cnt'] = 0
430
+ return create_multimodal_input(), chat_bot, app_session, None, '', ''
431
+
432
+
433
+ def select_chat_type(_tab, _app_cfg):
434
+ _app_cfg["chat_type"] = _tab
435
+ return _app_cfg
436
+
437
+
438
+ init_conversation = [
439
+ [
440
+ None,
441
+ {
442
+ # The first message of bot closes the typewriter.
443
+ "text": "You can talk to me now",
444
+ "flushing": False
445
+ }
446
+ ],
447
+ ]
448
+
449
+
450
+ css = """
451
+ video { height: auto !important; }
452
+ .example label { font-size: 16px;}
453
+ """
454
+
455
+ introduction = """
456
+
457
+ ## Features:
458
+ 1. Chat with a single image
459
+ 2. Chat with multiple images
460
+ 3. Chat with a video
461
+ 4. In-context few-shot learning
462
+
463
+ Click the `How to use` tab to see examples.
464
+ """
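For reference, the in-context few-shot flow above builds the conversation as a plain list of role/content turns, where each content list mixes make_text(...) and encode_image(...) items before the whole context is passed to chat(). A minimal sketch of one demonstration pair plus a follow-up query, assuming the encode_image, make_text, and chat helpers defined earlier in this demo (the image path is hypothetical):

    # Sketch only: relies on encode_image, make_text, and chat from this demo file.
    from PIL import Image

    context = []
    # One demonstration: a user turn (image + text) and the expected assistant turn.
    demo_image = Image.open("example.jpg").convert("RGB")  # hypothetical path
    context.append({"role": "user",
                    "content": [encode_image(demo_image), make_text("What is in this image?")]})
    context.append({"role": "assistant", "content": [make_text("A brown bear wading in a river.")]})
    # The actual query turn, answered with the same Beam Search parameters used above.
    context.append({"role": "user", "content": [make_text("Describe the next image in the same style.")]})
    params = {'sampling': False, 'num_beams': 3, 'repetition_penalty': 1.2, "max_new_tokens": 2048}
    code, answer, _, sts = chat("", context, None, params)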
465
+
466
+
467
+ with gr.Blocks(css=css) as demo:
468
+ with gr.Tab(model_name):
469
+ with gr.Row():
470
+ with gr.Column(scale=1, min_width=300):
471
+ gr.Markdown(value=introduction)
472
+ params_form = create_component(form_radio, comp='Radio')
473
+ regenerate = create_component({'value': 'Regenerate'}, comp='Button')
474
+ clear_button = create_component({'value': 'Clear History'}, comp='Button')
475
+
476
+ with gr.Column(scale=3, min_width=500):
477
+ app_session = gr.State({'sts': None, 'ctx': [], 'images_cnt': 0, 'videos_cnt': 0, 'chat_type': 'Chat'})
478
+ chat_bot = mgr.Chatbot(label=f"Chat with {model_name}", value=copy.deepcopy(init_conversation), height=600, flushing=False, bubble_full_width=False)
479
+
480
+ with gr.Tab("Chat") as chat_tab:
481
+ txt_message = create_multimodal_input()
482
+ chat_tab_label = gr.Textbox(value="Chat", interactive=False, visible=False)
483
+
484
+ txt_message.submit(
485
+ respond,
486
+ [txt_message, chat_bot, app_session, params_form],
487
+ [txt_message, chat_bot, app_session]
488
+ )
489
+
490
+ with gr.Tab("Few Shot") as fewshot_tab:
491
+ fewshot_tab_label = gr.Textbox(value="Few Shot", interactive=False, visible=False)
492
+ with gr.Row():
493
+ with gr.Column(scale=1):
494
+ image_input = gr.Image(type="filepath", sources=["upload"])
495
+ with gr.Column(scale=3):
496
+ user_message = gr.Textbox(label="User")
497
+ assistant_message = gr.Textbox(label="Assistant")
498
+ with gr.Row():
499
+ add_demonstration_button = gr.Button("Add Example")
500
+ generate_button = gr.Button(value="Generate", variant="primary")
501
+ add_demonstration_button.click(
502
+ fewshot_add_demonstration,
503
+ [image_input, user_message, assistant_message, chat_bot, app_session],
504
+ [image_input, user_message, assistant_message, chat_bot, app_session]
505
+ )
506
+ generate_button.click(
507
+ fewshot_respond,
508
+ [image_input, user_message, chat_bot, app_session, params_form],
509
+ [image_input, user_message, assistant_message, chat_bot, app_session]
510
+ )
511
+
512
+ chat_tab.select(
513
+ select_chat_type,
514
+ [chat_tab_label, app_session],
515
+ [app_session]
516
+ )
517
+ chat_tab.select( # do clear
518
+ clear,
519
+ [txt_message, chat_bot, app_session],
520
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
521
+ )
522
+ fewshot_tab.select(
523
+ select_chat_type,
524
+ [fewshot_tab_label, app_session],
525
+ [app_session]
526
+ )
527
+ fewshot_tab.select( # do clear
528
+ clear,
529
+ [txt_message, chat_bot, app_session],
530
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
531
+ )
532
+ chat_bot.flushed(
533
+ flushed,
534
+ outputs=[txt_message]
535
+ )
536
+ regenerate.click(
537
+ regenerate_button_clicked,
538
+ [txt_message, image_input, user_message, assistant_message, chat_bot, app_session, params_form],
539
+ [txt_message, image_input, user_message, assistant_message, chat_bot, app_session]
540
+ )
541
+ clear_button.click(
542
+ clear,
543
+ [txt_message, chat_bot, app_session],
544
+ [txt_message, chat_bot, app_session, image_input, user_message, assistant_message]
545
+ )
546
+
547
+ with gr.Tab("How to use"):
548
+ with gr.Column():
549
+ with gr.Row():
550
+ image_example = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/m_bear2.gif", label='1. Chat with single or multiple images', interactive=False, width=400, elem_classes="example")
551
+ example2 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/video2.gif", label='2. Chat with video', interactive=False, width=400, elem_classes="example")
552
+ example3 = gr.Image(value="http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/fshot.gif", label='3. Few shot', interactive=False, width=400, elem_classes="example")
553
+
554
+
555
+ # launch
556
+ demo.launch(share=False, debug=True, show_api=False, server_port=8885, server_name="0.0.0.0")
557
+
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit-2_5.py ADDED
@@ -0,0 +1,109 @@
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import torch
4
+ from transformers import AutoModel, AutoTokenizer
5
+
6
+ # Model path
7
+ model_path = "openbmb/MiniCPM-Llama3-V-2_5"
8
+
9
+ # User and assistant names
10
+ U_NAME = "User"
11
+ A_NAME = "Assistant"
12
+
13
+ # Set page configuration
14
+ st.set_page_config(
15
+ page_title="MiniCPM-Llama3-V-2_5 Streamlit",
16
+ page_icon=":robot:",
17
+ layout="wide"
18
+ )
19
+
20
+
21
+ # Load model and tokenizer
22
+ @st.cache_resource
23
+ def load_model_and_tokenizer():
24
+ print(f"load_model_and_tokenizer from {model_path}")
25
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16).to(device="cuda")
26
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
27
+ return model, tokenizer
28
+
29
+
30
+ # Initialize session state
31
+ if 'model' not in st.session_state:
32
+ st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
33
+ st.session_state.model.eval()
34
+ print("Model and tokenizer loaded successfully!")
35
+
36
+ # Initialize session state
37
+ if 'chat_history' not in st.session_state:
38
+ st.session_state.chat_history = []
39
+
40
+ # Sidebar settings
41
+ sidebar_name = st.sidebar.title("MiniCPM-Llama3-V-2_5 Streamlit")
42
+ max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
43
+ repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01)
44
+ top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
45
+ top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1)
46
+ temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
47
+
48
+ # Clear chat history button
49
+ buttonClean = st.sidebar.button("Clear chat history", key="clean")
50
+ if buttonClean:
51
+ st.session_state.chat_history = []
52
+ st.session_state.response = ""
53
+ if torch.cuda.is_available():
54
+ torch.cuda.empty_cache()
55
+ st.rerun()
56
+
57
+ # Display chat history
58
+ for i, message in enumerate(st.session_state.chat_history):
59
+ if message["role"] == "user":
60
+ with st.chat_message(name="user", avatar="user"):
61
+ if message["image"] is not None:
62
+ st.image(message["image"], caption='User uploaded image', width=448, use_column_width=False)
63
+ continue
64
+ elif message["content"] is not None:
65
+ st.markdown(message["content"])
66
+ else:
67
+ with st.chat_message(name="model", avatar="assistant"):
68
+ st.markdown(message["content"])
69
+
70
+ # Select mode
71
+ selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
72
+ if selected_mode == "Image":
73
+ # Image mode
74
+ uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"],
75
+ accept_multiple_files=False)
76
+ if uploaded_image is not None:
77
+ st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
78
+ # Add uploaded image to chat history
79
+ st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})
80
+
81
+ # User input box
82
+ user_text = st.chat_input("Enter your question")
83
+ if user_text:
84
+ with st.chat_message(U_NAME, avatar="user"):
85
+ st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
86
+ st.markdown(f"{U_NAME}: {user_text}")
87
+
88
+ # Generate reply using the model
89
+ model = st.session_state.model
90
+ tokenizer = st.session_state.tokenizer
91
+ imagefile = None
92
+
93
+ with st.chat_message(A_NAME, avatar="assistant"):
94
+ # If the previous message contains an image, pass the image to the model
95
+ if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
96
+ uploaded_image = st.session_state.chat_history[-2]["image"]
97
+ imagefile = Image.open(uploaded_image).convert('RGB')
98
+
99
+ msgs = [{"role": "user", "content": user_text}]
100
+ res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
101
+ sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty,
102
+ temperature=temperature, stream=True)
103
+
104
+ # Collect the generated_text str
105
+ generated_text = st.write_stream(res)
106
+
107
+ st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None})
108
+
109
+ st.divider()
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit-minicpmv2_6.py ADDED
@@ -0,0 +1,271 @@
1
+ import os.path
2
+
3
+ import streamlit as st
4
+ import torch
5
+ from PIL import Image
6
+ from decord import VideoReader, cpu
7
+ import numpy as np
8
+ from transformers import AutoModel, AutoTokenizer
9
+
10
+ # Model path
11
+ model_path = "openbmb/MiniCPM-V-2_6"
12
+ upload_path = "./uploads"
+ os.makedirs(upload_path, exist_ok=True)  # make sure the upload directory exists before saving videos
13
+
14
+ # User and assistant names
15
+ U_NAME = "User"
16
+ A_NAME = "Assistant"
17
+
18
+ # Set page configuration
19
+ st.set_page_config(
20
+ page_title="MiniCPM-V-2_6 Streamlit",
21
+ page_icon=":robot:",
22
+ layout="wide"
23
+ )
24
+
25
+
26
+ # Load model and tokenizer
27
+ @st.cache_resource
28
+ def load_model_and_tokenizer():
29
+ print(f"load_model_and_tokenizer from {model_path}")
30
+ model = (AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa').
31
+ to(dtype=torch.bfloat16))
32
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
33
+ return model, tokenizer
34
+
35
+
36
+ # Initialize session state
37
+ if 'model' not in st.session_state:
38
+ st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
39
+ st.session_state.model.eval().cuda()
40
+ print("Model and tokenizer loaded successfully!")
41
+
42
+ # Initialize session state
43
+ if 'chat_history' not in st.session_state:
44
+ st.session_state.chat_history = []
45
+ st.session_state.uploaded_image_list = []
46
+ st.session_state.uploaded_image_num = 0
47
+ st.session_state.uploaded_video_list = []
48
+ st.session_state.uploaded_video_num = 0
49
+ st.session_state.response = ""
50
+
51
+ # Sidebar settings
52
+ sidebar_name = st.sidebar.title("MiniCPM-V-2_6 Streamlit")
53
+ max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
54
+ repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01)
55
+ top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1)
56
+ top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
57
+ temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
58
+
59
+ # Button to clear session history
60
+ buttonClean = st.sidebar.button("Clear session history", key="clean")
61
+ if buttonClean:
62
+ # Reset the session state history and uploaded file lists
63
+ st.session_state.chat_history = []
64
+ st.session_state.uploaded_image_list = []
65
+ st.session_state.uploaded_image_num = 0
66
+ st.session_state.uploaded_video_list = []
67
+ st.session_state.uploaded_video_num = 0
68
+ st.session_state.response = ""
69
+
70
+ # If using GPU, clear the CUDA cache to free up memory
71
+ if torch.cuda.is_available():
72
+ torch.cuda.empty_cache()
73
+
74
+ # Rerun to refresh the interface
75
+ st.rerun()
76
+
77
+ # Display chat history
78
+ for i, message in enumerate(st.session_state.chat_history):
79
+ if message["role"] == "user":
80
+ with st.chat_message(name="user", avatar="user"):
81
+ if message["image"] is not None:
82
+ st.image(message["image"], caption='User uploaded images', width=512, use_column_width=False)
83
+ continue
84
+ elif message["video"] is not None:
85
+ st.video(message["video"], format="video/mp4", loop=False, autoplay=False, muted=True)
86
+ continue
87
+ elif message["content"] is not None:
88
+ st.markdown(message["content"])
89
+ else:
90
+ with st.chat_message(name="model", avatar="assistant"):
91
+ st.markdown(message["content"])
92
+
93
+ # Select mode
94
+ selected_mode = st.sidebar.selectbox("Select Mode", ["Text", "Single Image", "Multiple Images", "Video"])
95
+
96
+ # Supported image file extensions
97
+ image_type = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
98
+
99
+ if selected_mode == "Single Image":
100
+ # Single Image Mode
101
+ uploaded_image = st.sidebar.file_uploader("Upload a Single Image", key=1, type=image_type,
102
+ accept_multiple_files=False)
103
+ if uploaded_image is not None:
104
+ st.image(uploaded_image, caption='User Uploaded Image', width=512, use_column_width=False)
105
+ # Add the uploaded image to the chat history
106
+ st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image, "video": None})
107
+ st.session_state.uploaded_image_list = [uploaded_image]
108
+ st.session_state.uploaded_image_num = 1
109
+
110
+ if selected_mode == "Multiple Images":
111
+ # Multiple Images Mode
112
+ uploaded_image_list = st.sidebar.file_uploader("Upload Multiple Images", key=2, type=image_type,
113
+ accept_multiple_files=True)
114
+ uploaded_image_num = len(uploaded_image_list)
115
+
116
+ if uploaded_image_list is not None and uploaded_image_num > 0:
117
+ for img in uploaded_image_list:
118
+ st.image(img, caption='User Uploaded Image', width=512, use_column_width=False)
119
+ # Add the uploaded images to the chat history
120
+ st.session_state.chat_history.append({"role": "user", "content": None, "image": img, "video": None})
121
+ # Update the uploaded image list and count in st.session_state
122
+ st.session_state.uploaded_image_list = uploaded_image_list
123
+ st.session_state.uploaded_image_num = uploaded_image_num
124
+
125
+ # Supported video format suffixes
126
+ video_type = ['.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v']
127
+
128
+ # Tip: You can use the command `streamlit run ./web_demo_streamlit-minicpmv2_6.py --server.maxUploadSize 1024`
129
+ # to raise the maximum upload size to 1024 MB when you need to upload larger video files.
130
+ # The default 200MB limit of Streamlit's file_uploader component might be insufficient for video-based interactions.
131
+ # Adjust the size based on your GPU memory usage.
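If you prefer not to pass the flag on each launch, the same limit can also be set persistently through Streamlit's configuration file: add maxUploadSize = 1024 under the [server] section of .streamlit/config.toml in the project directory, which should be equivalent to the CLI flag above.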
132
+
133
+ if selected_mode == "Video":
134
+ # Single-video mode
135
+ uploaded_video = st.sidebar.file_uploader("Upload a single video file", key=3, type=video_type,
136
+ accept_multiple_files=False)
137
+ if uploaded_video is not None:
138
+ st.video(uploaded_video, format="video/mp4", loop=False, autoplay=False, muted=True)
139
+ st.session_state.chat_history.append({"role": "user", "content": None, "image": None, "video": uploaded_video})
140
+
141
+ uploaded_video_path = os.path.join(upload_path, uploaded_video.name)
142
+ with open(uploaded_video_path, "wb") as vf:
143
+ vf.write(uploaded_video.getvalue())
144
+ st.session_state.uploaded_video_list = [uploaded_video_path]
145
+ st.session_state.uploaded_video_num = 1
146
+
147
+ MAX_NUM_FRAMES = 64 # if cuda OOM set a smaller number
148
+
149
+
150
+ # Encodes a video by sampling frames at a fixed rate and converting them to image arrays.
151
+ def encode_video(video_path):
152
+ def uniform_sample(frame_indices, num_samples):
153
+ # Calculate sampling interval and uniformly sample frame indices
154
+ gap = len(frame_indices) / num_samples
155
+ sampled_idxs = np.linspace(gap / 2, len(frame_indices) - gap / 2, num_samples, dtype=int)
156
+ return [frame_indices[i] for i in sampled_idxs]
157
+
158
+ # Read the video and set the decoder's context to CPU
159
+ vr = VideoReader(video_path, ctx=cpu(0))
160
+
161
+ # Calculate the sampling interval to sample video frames at 1 FPS
162
+ sample_fps = round(vr.get_avg_fps() / 1) # frame stride that gives roughly 1 FPS sampling
163
+ frame_idx = list(range(0, len(vr), sample_fps))
164
+
165
+ # If the number of sampled frames exceeds the maximum limit, uniformly sample them
166
+ if len(frame_idx) > MAX_NUM_FRAMES:
167
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
168
+
169
+ # Retrieve the sampled frames and convert them to image arrays
170
+ frames = vr.get_batch(frame_idx).asnumpy()
171
+ frames = [Image.fromarray(frame.astype('uint8')) for frame in frames]
172
+
173
+ print('Number of frames:', len(frames))
174
+ return frames
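As a usage sketch (the clip path below is hypothetical), the frames returned by encode_video are simply prepended to the question text so that the whole video becomes part of a single user turn, which is how the Video branch further down builds msgs:

    frames = encode_video("./uploads/clip.mp4")   # hypothetical uploaded file
    msgs = [{"role": "user", "content": frames + ["Describe what happens in this video."]}]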
175
+
176
+
177
+
178
+ # User input box
179
+ user_text = st.chat_input("Enter your question")
180
+ if user_text is not None:
181
+ if not user_text.strip():
182
+ st.warning('Input message could not be empty!', icon="⚠️")
183
+ else:
184
+ # Display user input and save it to session history
185
+ with st.chat_message(U_NAME, avatar="user"):
186
+ st.session_state.chat_history.append({
187
+ "role": "user",
188
+ "content": user_text,
189
+ "image": None,
190
+ "video": None
191
+ })
192
+ st.markdown(f"{U_NAME}: {user_text}")
193
+
194
+ # Generate responses using the model
195
+ model = st.session_state.model
196
+ tokenizer = st.session_state.tokenizer
197
+ content_list = [] # Store the content (text or image) that will be passed into the model
198
+ imageFile = None
199
+
200
+ with st.chat_message(A_NAME, avatar="assistant"):
201
+ # Handle different inputs depending on the mode selected by the user
202
+ if selected_mode == "Single Image":
203
+ # Single image mode: pass in the last uploaded image
204
+ print("Single Image mode in use")
205
+ if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
206
+ uploaded_image = st.session_state.uploaded_image_list[-1]
207
+ if uploaded_image:
208
+ imageFile = Image.open(uploaded_image).convert('RGB')
209
+ content_list.append(imageFile)
210
+ else:
211
+ print("Single Image mode: No image found")
212
+
213
+ elif selected_mode == "Multiple Images":
214
+ # Multi-image mode: pass in all the images uploaded last time
215
+ print("Multiple Images mode in use")
216
+ if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
217
+ for uploaded_image in st.session_state.uploaded_image_list:
218
+ imageFile = Image.open(uploaded_image).convert('RGB')
219
+ content_list.append(imageFile)
220
+ else:
221
+ print("Multiple Images mode: No image found")
222
+
223
+ elif selected_mode == "Video":
224
+ # Video mode: pass in slice frames of uploaded video
225
+ print("Video mode in use")
226
+ if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
227
+ uploaded_video_path = st.session_state.uploaded_video_list[-1]
228
+ if uploaded_video_path:
229
+ with st.spinner('Encoding your video, please wait...'):
230
+ frames = encode_video(uploaded_video_path)
231
+ else:
232
+ print("Video Mode: No video found")
233
+
234
+ # Defining model parameters
235
+ params = {
236
+ 'sampling': True,
237
+ 'top_p': top_p,
238
+ 'top_k': top_k,
239
+ 'temperature': temperature,
240
+ 'repetition_penalty': repetition_penalty,
241
+ "max_new_tokens": max_length,
242
+ "stream": True
243
+ }
244
+
245
+ # Set different input parameters depending on whether to upload a video
246
+ if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
247
+ msgs = [{"role": "user", "content": frames + [user_text]}]
248
+ # Set decode params for video
249
+ params["max_inp_length"] = 4352 # Set the maximum input length of the video mode
250
+ params["use_image_id"] = False # Do not use image_id
251
+ params["max_slice_nums"] = 1 # use 1 if CUDA OOM occurs and video resolution > 448*448
252
+ else:
253
+ content_list.append(user_text)
254
+ msgs = [{"role": "user", "content": content_list}]
255
+
256
+ print("content_list:", content_list) # debug
257
+ print("params:", params) # debug
258
+
259
+ # Generate and display the model's responses
260
+ with st.spinner('AI is thinking...'):
261
+ response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
262
+ st.session_state.response = st.write_stream(response)
263
+ st.session_state.chat_history.append({
264
+ "role": "model",
265
+ "content": st.session_state.response,
266
+ "image": None,
267
+ "video": None
268
+ })
269
+
270
+ st.divider() # Add separators to the interface
271
+
r1-a/response_generation/minicpm/MiniCPM-o/web_demos/web_demo_streamlit.py ADDED
@@ -0,0 +1,99 @@
1
+ import streamlit as st
2
+ from PIL import Image
3
+ import torch
4
+ from transformers import AutoModel, AutoTokenizer
5
+
6
+ # Model path
7
+ model_path = "openbmb/MiniCPM-V-2"
8
+
9
+ # User and assistant names
10
+ U_NAME = "User"
11
+ A_NAME = "Assistant"
12
+
13
+ # Set page configuration
14
+ st.set_page_config(
15
+ page_title="MiniCPM-V-2 Streamlit",
16
+ page_icon=":robot:",
17
+ layout="wide"
18
+ )
19
+
20
+ # Load model and tokenizer
21
+ @st.cache_resource
22
+ def load_model_and_tokenizer():
23
+ print(f"load_model_and_tokenizer from {model_path}")
24
+ model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(
25
+ device="cuda:0", dtype=torch.bfloat16)
26
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
27
+ return model, tokenizer
28
+
29
+ # Initialize session state
30
+ if 'model' not in st.session_state:
31
+ st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
32
+ print("Model and tokenizer loaded successfully!")
33
+
34
+ # Initialize session state
35
+ if 'chat_history' not in st.session_state:
36
+ st.session_state.chat_history = []
37
+
38
+ # Sidebar settings
39
+ sidebar_name = st.sidebar.title("MiniCPM-V-2 Streamlit")
40
+ max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
41
+ top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
42
+ temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
43
+
44
+ # Clear chat history button
45
+ buttonClean = st.sidebar.button("Clear chat history", key="clean")
46
+ if buttonClean:
47
+ st.session_state.chat_history = []
48
+ st.session_state.response = ""
49
+ if torch.cuda.is_available():
50
+ torch.cuda.empty_cache()
51
+ st.rerun()
52
+
53
+ # Display chat history
54
+ for i, message in enumerate(st.session_state.chat_history):
55
+ if message["role"] == "user":
56
+ with st.chat_message(name="user", avatar="user"):
57
+ if message["image"] is not None:
58
+ st.image(message["image"], caption='User uploaded image', width=468, use_column_width=False)
59
+ continue
60
+ elif message["content"] is not None:
61
+ st.markdown(message["content"])
62
+ else:
63
+ with st.chat_message(name="model", avatar="assistant"):
64
+ st.markdown(message["content"])
65
+
66
+ # Select mode
67
+ selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"])
68
+ if selected_mode == "Image":
69
+ # Image mode
70
+ uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], accept_multiple_files=False)
71
+ if uploaded_image is not None:
72
+ st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False)
73
+ # Add uploaded image to chat history
74
+ st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image})
75
+
76
+ # User input box
77
+ user_text = st.chat_input("Enter your question")
78
+ if user_text:
79
+ with st.chat_message(U_NAME, avatar="user"):
80
+ st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None})
81
+ st.markdown(f"{U_NAME}: {user_text}")
82
+
83
+ # Generate reply using the model
84
+ model = st.session_state.model
85
+ tokenizer = st.session_state.tokenizer
+ imagefile = None  # default to text-only chat when no image has been uploaded
86
+
87
+ with st.chat_message(A_NAME, avatar="assistant"):
88
+ # If the previous message contains an image, pass the image to the model
89
+ if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None:
90
+ uploaded_image = st.session_state.chat_history[-2]["image"]
91
+ imagefile = Image.open(uploaded_image).convert('RGB')
92
+
93
+ msgs = [{"role": "user", "content": user_text}]
94
+ res, context, _ = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer,
95
+ sampling=True, top_p=top_p, temperature=temperature)
96
+ st.markdown(f"{A_NAME}: {res}")
97
+ st.session_state.chat_history.append({"role": "model", "content": res, "image": None})
98
+
99
+ st.divider()