clean unnecessary files

Files changed (7) hide show

.gitattributes +1 -0
README.md +9 -48
assets/example.mp4 +0 -3
eval.py +0 -257
ref_results/output_w_sub.json +0 -0
ref_results/output_wo_sub.json +0 -0
videoccam.py +0 -312

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/example.mp4 filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -14,62 +14,23 @@ torch==2.1.0
 torchvision==0.16.0
 transformers==4.40.2
 peft==0.10.0
-pyarrow==13.0.0     # load parquet
-decord==0.6.0       # load video
-pysubs2==1.7.2      # load subtitle
 ```
-### Sample Inference Code
-```
-import torch
-from eval import load_video
-from videoccam import VideoCCAM
-video_path = 'assets/example.mp4'
-question = 'Can you please describe what happens in the video in detail?'
-sample_config = dict(
-    sample_type='uniform',
-    num_frames=32
-)
-mllm = VideoCCAM(
-    model_path='.',
-    chat_template='<|user|>\n{input}<|end|>\n<|assistant|>\n',
-    generation_args=dict(
-        stop_tokens=['<|end|>', '<|endoftext|>'],
-        max_new_tokens=512,
-        do_sample=False,
-        num_beams=5,
-    ),
-    llm_name_or_path='microsoft/Phi-3-mini-4k-instruct',    # you can replace this with local directory if the model has been downloaded before
-    visual_encoder_name_or_path='google/siglip-so400m-patch14-384',     # you can replace this with local directory if the model has been downloaded before
-    special_tokens=['<time>', '</time>'],
-    visual_select_layer=-2,
-    torch_dtype=torch.bfloat16,
-    device_map='cuda:0'
-)
-frames, = load_video(video_path, **sample_config)
-response = mllm.generate(texts=[question], videos=[frames])[0]
-print(response)
-```
-### Video-MME Evaluation
-You should be able to reproduce the results of 48.2 (without subtitles) and 51.7 (with subtitles) by running the following command. By default, the results are saved as `output_w_sub.json` and `output_wo_sub.json` in the current directory. We provide our results in the `ref_results` directory.
-```
-python eval.py
-```
 ## Acknowledgement
-* [xtuner](https://github.com/InternLM/xtuner): Video-CCAM-4B is trained using the xtuner framework. Thanks for their excellent work!
-* [Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct): Great small language models developed by Microsoft.
 * [SigLIP SO400M](https://huggingface.co/google/siglip-so400m-patch14-384): Outstanding vision encoder developed by Google.
 ## License

 torchvision==0.16.0
 transformers==4.40.2
 peft==0.10.0
 ```
+## Inference
+Please refer to [Video-CCAM](https://github.com/QQ-MM/Video-CCAM) for inference and evaluation.
+### Video-MME
+|#Frames.|32|96|
+|:-:|:-:|:-:|
+|w/o subs|48.2|49.6|
+|w subs|51.7|53.0|
 ## Acknowledgement
+* [xtuner](https://github.com/InternLM/xtuner): Video-CCAM-4B is trained using the xtuner framework. Thanks for their excellent work!
+* [Phi-3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct): Powerful language models developed by Microsoft.
 * [SigLIP SO400M](https://huggingface.co/google/siglip-so400m-patch14-384): Outstanding vision encoder developed by Google.
 ## License

assets/example.mp4 DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c9ce295c4c154bdbc266c2333b18710796ff1d151623447664730aae25a461c
-size 3283880

eval.py DELETED Viewed

@@ -1,257 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-================================================
-@author: Jaron
-@time: 2024/06/23 12:59:38
-@email: fjjth98@163.com
-@description: Evaluate MLLM on Video-MME Benchmark
-================================================
-"""
-import json
-import torch
-import pysubs2
-import os.path as osp
-from PIL import Image
-from tqdm import tqdm
-from typing import Any
-from copy import deepcopy
-from pandas import read_parquet
-from decord import VideoReader, cpu
-from torch.utils.data import Dataset, DataLoader, default_collate
-def video_collate_fn(batch: Any) -> Any:
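-    """Collate function for dict samples that contain a 'video' entry; it supports a different number of frames for each sample.
-    Args:
-        batch (Any): list of samples; when the first sample is a dict with a 'video' key, the videos are kept as a plain list and the remaining fields go through `default_collate`
-    Returns:
-        Any: the collated batch
-    """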
-    if isinstance(batch[0], dict) and 'video' in batch[0]:
-        video = [b.pop('video') for b in batch]
-        batch = default_collate(batch)
-        batch['video'] = video
-    else:
-        batch = default_collate(batch)
-    return batch
-def uniform_indices(num_frames: int, total_frames: int) -> list[int]:
-    """Get uniform indices
-    Args:
-        num_frames (int): number of frames
-        total_frames (int): total number of frames
-    Returns:
-        list[int]: Output frame indices
-    """
-    if num_frames < total_frames:
-        splits = torch.linspace(0, total_frames, num_frames+1, dtype=int)
-        indices = ((splits[:-1] + splits[1:]) // 2).tolist()
-    else:
-        indices = list(range(total_frames))
-    return indices
-def fps_indices(input_fps: float, total_frames: int, output_fps: float = None, max_num_frames: int = -1) -> list[int]:
-    """Get indices according to the output_fps
-    Args:
-        input_fps (float): input fps
-        total_frames (int): total number of frames
-        output_fps (float, optional): output fps. Defaults to None, means output_fps==input_fps.
-        max_num_frames (int, optional): max number of frames. Defaults to -1, means no limitation.
-    Returns:
-        list[int]: Output frame indices
-    """
-    delta = 1 if output_fps is None else input_fps / output_fps
-    indices = torch.arange(0, total_frames, delta).round().to(int)
-    indices = [e for e in indices if e < total_frames]
-    if 0 < max_num_frames < len(indices):
-        indices = indices[:max_num_frames]
-    return indices
-def load_video(src_path: str, sample_type: str, sub_path: str = None, **kwargs) -> list[Image.Image]:# | tuple[list[Image.Image], str]:
-    """Load video using decord, optionally load subtitles
-    Args:
-        src_path (str): video path
-        sample_type (str): 'uniform' or 'fps'
-        sub_path (str): subtitle path, .srt
-        kwargs: for 'uniform', require 'num_frames'; for 'fps', optionally require 'output_fps' and 'max_num_frames'
-    Returns:
-        list[Image.Image]: frame list
-    """
-    vr = VideoReader(src_path, ctx=cpu(0), num_threads=1)
-    total_frames = len(vr)
-    if sample_type == 'uniform':
-        num_frames = kwargs.pop('num_frames')
-        indices = uniform_indices(num_frames, total_frames)
-    elif sample_type == 'fps':
-        input_fps = float(vr.get_avg_fps())
-        output_fps = kwargs.pop('output_fps', None)
-        max_num_frames = kwargs.pop('max_num_frames', -1)
-        indices = fps_indices(input_fps, total_frames, output_fps, max_num_frames)
-    else:
-        raise ValueError(f'Do not support {sample_type} sample type')
-    frames = vr.get_batch(indices).asnumpy()        # (T, H, W, C), np.uint8
-    frames = [Image.fromarray(frame) for frame in frames]
-    if sub_path is None:
-        return frames
-    elif osp.exists(sub_path):
-        subs = pysubs2.load(sub_path, encoding='utf-8')
-        subtitles = []
-        for idx in indices:
-            sub_text = []
-            cur_time = pysubs2.make_time(fps=float(vr.get_avg_fps()), frames=idx)
-            for sub in subs:
-                if sub.end < cur_time:
-                    continue
-                elif sub.start < cur_time:
-                    sub_text.append(sub.text.replace('\\N', ' '))
-                    break   # in accordance to the official benchmark
-                else:
-                    break
-            sub_text = ' '.join(sub_text)
-            if sub_text.strip():
-                subtitles.append(sub_text)
-        subtitles = '\n'.join(subtitles)
-        return frames, subtitles
-    else:
-        return frames, ''
-class VideoMMEDataset(Dataset):
-    def __init__(self, dataset_path: str, sample_config: dict, use_subtitle: bool = False):
-        super().__init__()
-        self.dataset_path = dataset_path
-        self.sample_config = sample_config
-        self.use_subtitle = use_subtitle
-        data_dict = {}
-        index_keys = ['video_id', 'duration', 'domain', 'sub_category', 'videoID']
-        value_keys = ['question_id', 'task_type', 'question', 'options', 'answer']
-        df = read_parquet(osp.join(dataset_path, 'videomme', 'test-00000-of-00001.parquet'))
-        df['options'] = df['options'].apply(list)
-        for _, data in df.iterrows():
-            key = tuple(data[k] for k in index_keys)
-            value = data[value_keys].to_dict()
-            if key in data_dict:
-                data_dict[key].append(value)
-            else:
-                data_dict[key] = [value]
-        self.data_list = [dict(zip(index_keys + ['questions'], list(k) + [v])) for k, v in data_dict.items()]
-    def __len__(self):
-        return len(self.data_list)
-    def __getitem__(self, idx) -> dict:
-        if self.use_subtitle:
-            frames, subtitles = load_video(
-                src_path=osp.join(self.dataset_path, 'video', self.data_list[idx]['videoID'] + '.mp4'),
-                sub_path=osp.join(self.dataset_path, 'subtitle', self.data_list[idx]['videoID'] + '.srt'),
-                **self.sample_config
-            )
-            text = ['\n'.join([
-                "This video's subtitles are listed below:",
-                subtitles,
-                'Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.',
-                i['question']
-            ] + i['options']) for i in self.data_list[idx]['questions']]
-        else:
-            frames = load_video(
-                src_path=osp.join(self.dataset_path, 'video', self.data_list[idx]['videoID'] + '.mp4'),
-                **self.sample_config
-            )
-            text = ['\n'.join([
-                'Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.',
-                i['question']
-            ] + i['options']) for i in self.data_list[idx]['questions']]
-            subtitles = ''
-        return dict(
-            video=frames,
-            text=text
-        )
-if __name__ == '__main__':
-    from videoccam import VideoCCAM, DEFAULT_VIDEO_TOKEN
-    mllm = VideoCCAM(
-        model_path='.',
-        chat_template='<|user|>\n{input}<|end|>\n<|assistant|>\n',
-        generation_args=dict(
-            stop_tokens=['<|end|>', '<|endoftext|>'],
-            max_new_tokens=512,
-            do_sample=False
-        ),
-        llm_name_or_path='microsoft/Phi-3-mini-4k-instruct',
-        visual_encoder_name_or_path='google/siglip-so400m-patch14-384',
-        special_tokens=['<time>', '</time>'],
-        visual_select_layer=-2,
-        torch_dtype=torch.bfloat16,
-        device_map='cuda:0'
-    )
-    mllm.eval()
-    dataset = VideoMMEDataset(
-        dataset_path='',
-        sample_config=dict(
-            sample_type='uniform',
-            num_frames=32
-        )
-    )
-    with torch.inference_mode():
-        for use_subtitle in (True,):
-            dataset.use_subtitle = use_subtitle
-            dataloader = DataLoader(
-                dataset,
-                batch_size=4,
-                num_workers=8,
-                shuffle=False,
-                pin_memory=True,
-                collate_fn=video_collate_fn
-            )
-            results = []
-            for data in tqdm(dataloader):
-                response, pixel_values = mllm.generate(
-                    texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][0]],
-                    videos=data['video'],
-                    return_pixel_values=True
-                )
-                response = [response]
-                for i in range(1, len(data['text'])):
-                    response.append(mllm.generate(
-                        texts=['\n'.join([DEFAULT_VIDEO_TOKEN, t]) for t in data['text'][i]],
-                        pixel_values=pixel_values
-                    ))
-                response = [[response[i][j] for i in range(len(response))] for j in range(len(response[0]))]
-                results.extend(response)
-            outputs = []
-            for data, responses in zip(dataset.data_list, results):
-                data = deepcopy(data)
-                data.pop('videoID')
-                for question, response in zip(data['questions'], responses):
-                    question['response'] = response
-                outputs.append(data)
-            suffix = 'w_sub' if use_subtitle else 'wo_sub'
-            with open(f'output_{suffix}.json', 'w') as f:
-                json.dump(outputs, f, indent=4, ensure_ascii=False)

ref_results/output_w_sub.json DELETED Viewed

The diff for this file is too large to render. See raw diff

ref_results/output_wo_sub.json DELETED Viewed

The diff for this file is too large to render. See raw diff

videoccam.py DELETED Viewed

@@ -1,312 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-================================================
-@author: Jaron
-@time: 2024/06/23 09:52:24
-@email: fjjth98@163.com
-@description:
-================================================
-"""
-import torch
-import os.path as osp
-import torch.nn as nn
-from PIL import Image
-from peft import PeftModel
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, SiglipVisionModel, SiglipImageProcessor
-IGNORE_INDEX = -100
-IMAGE_TOKEN_INDEX = -200
-DEFAULT_IMAGE_TOKEN = '<image>'
-DEFAULT_VIDEO_TOKEN = '<video>'
-class VideoCCAM(nn.Module):
-    def __init__(
-        self,
-        model_path: str,
-        chat_template: str,
-        generation_args: dict,
-        llm_name_or_path: str = None,
-        visual_encoder_name_or_path: str = None,
-        special_tokens: list[str] = None,
-        visual_select_layer: int = -2,
-        torch_dtype: torch.dtype = torch.float16,
-        device_map: str = 'cuda:0'
-    ):
-        super().__init__()
-        self.chat_template = chat_template
-        self.generation_args = generation_args
-        self.visual_select_layer = visual_select_layer
-        self.torch_dtype = torch_dtype
-        self.device_map = device_map
-        if llm_name_or_path is None:
-            llm_name_or_path = model_path
-        if visual_encoder_name_or_path is None:
-            visual_encoder_name_or_path = osp.join(model_path, 'visual_encoder')
-            assert osp.exists(visual_encoder_name_or_path), f'{visual_encoder_name_or_path} does not exist, you have to specify `visual_encoder_name_or_path`'
-        projector_path = osp.join(model_path, 'projector')
-        assert osp.exists(projector_path), f'{projector_path} does not exist, you have to change `model_path`'
-        self.llm = AutoModelForCausalLM.from_pretrained(
-            llm_name_or_path,
-            trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            device_map=device_map
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            llm_name_or_path,
-            trust_remote_code=True
-        )
-        print(f'Load LLM from {llm_name_or_path}')
-        if special_tokens is not None:
-            self.llm.resize_token_embeddings(self.llm.get_input_embeddings().weight.size(0) + len(special_tokens))
-            self.llm.requires_grad_(False)
-            self.llm.get_input_embeddings().weight[-len(special_tokens):].zero_()
-            self.tokenizer.add_tokens(special_tokens, special_tokens=True)
-            print(f'Add special_tokens {special_tokens} to LLM and tokenizer')
-        if osp.exists(adapter_path := osp.join(model_path, 'llm_adapter')):
-            self.llm = PeftModel.from_pretrained(self.llm, adapter_path)
-            print(f'Load LLM adapter from {adapter_path}')
-        self.generation_args['eos_token_id'] = self.tokenizer.convert_tokens_to_ids(self.generation_args.pop('stop_tokens'))
-        self.visual_encoder = SiglipVisionModel.from_pretrained(
-            visual_encoder_name_or_path,
-            torch_dtype=torch_dtype,
-            device_map=device_map
-        )
-        self.image_processor = SiglipImageProcessor.from_pretrained(visual_encoder_name_or_path)
-        print(f'Load SigLIP visual encoder from {visual_encoder_name_or_path}')
-        if osp.exists(adapter_path := osp.join(model_path, 'visual_encoder_adapter')):
-            self.visual_encoder = PeftModel.from_pretrained(self.visual_encoder, adapter_path)
-            print(f'Load visual_encoder adapter from {adapter_path}')
-        self.projector = AutoModel.from_pretrained(
-            projector_path,
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            trust_remote_code=True
-        )
-        print(f'Load projector from {projector_path}')
-    # Modified from https://github.com/InternLM/xtuner/blob/main/xtuner/model/utils.py#L138
-    def prepare_inputs_labels_for_multimodal(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor = None,
-        attention_mask: torch.Tensor = None,
-        past_key_values: list[torch.FloatTensor] = None,
-        labels: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None
-    ):
-        if pixel_values is None:
-            return {
-                'input_ids': input_ids,
-                'position_ids': position_ids,
-                'attention_mask': attention_mask,
-                'past_key_values': past_key_values,
-                'inputs_embeds': None,
-                'labels': labels
-            }
-        _labels = labels
-        _position_ids = position_ids
-        _attention_mask = attention_mask
-        if attention_mask is None:
-            if isinstance(input_ids, torch.Tensor):
-                attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
-            elif isinstance(input_ids, list):
-                attention_mask = [torch.ones_like(i, dtype=torch.bool) for i in input_ids]
-                _attention_mask = attention_mask
-            else:
-                raise ValueError(f'Do not support {type(input_ids)} type as input_ids')
-        else:
-            attention_mask = attention_mask.bool()
-        if position_ids is None:
-            position_ids = torch.arange(
-                0, input_ids[0].shape[0], dtype=torch.long, device=input_ids[0].device)
-        if labels is None:
-            if isinstance(input_ids, torch.Tensor):
-                labels = torch.full_like(input_ids, IGNORE_INDEX)
-            elif isinstance(input_ids, list):
-                labels = [torch.full_like(i, IGNORE_INDEX) for i in input_ids]
-            else:
-                raise ValueError(f'Do not support {type(input_ids)} type as input_ids')
-        # remove the padding using attention_mask -- TODO: double check
-        input_ids = [
-            cur_input_ids[cur_attention_mask]
-            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
-        ]
-        labels = [
-            cur_labels[cur_attention_mask]
-            for cur_labels, cur_attention_mask in zip(labels, attention_mask)
-        ]
-        new_inputs_embeds = []
-        new_labels = []
-        cur_image_idx = 0
-        for batch_idx, cur_input_ids in enumerate(input_ids):
-            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
-            if num_images == 0:
-                cur_pixel_values = pixel_values[cur_image_idx]
-                cur_inputs_embeds_1 = self.llm.get_input_embeddings()(cur_input_ids)
-                cur_inputs_embeds = torch.cat(
-                    [cur_inputs_embeds_1, cur_pixel_values[0:0]], dim=0)
-                new_inputs_embeds.append(cur_inputs_embeds)
-                new_labels.append(labels[batch_idx])
-                cur_image_idx += 1
-                continue
-            image_token_indices = [-1] + torch.where(
-                cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [
-                    cur_input_ids.shape[0]
-                ]
-            cur_input_ids_noim = []
-            cur_labels = labels[batch_idx]
-            cur_labels_noim = []
-            for i in range(len(image_token_indices) - 1):
-                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] +
-                                                        1:image_token_indices[i +
-                                                                            1]])
-                cur_labels_noim.append(cur_labels[image_token_indices[i] +
-                                                1:image_token_indices[i + 1]])
-            split_sizes = [x.shape[0] for x in cur_labels_noim]
-            cur_inputs_embeds = self.llm.get_input_embeddings()(
-                torch.cat(cur_input_ids_noim))
-            cur_inputs_embeds_no_im = torch.split(
-                cur_inputs_embeds, split_sizes, dim=0)
-            cur_new_inputs_embeds = []
-            cur_new_labels = []
-            for i in range(num_images + 1):
-                cur_new_inputs_embeds.append(cur_inputs_embeds_no_im[i])
-                cur_new_labels.append(cur_labels_noim[i])
-                if i < num_images:
-                    cur_pixel_values = pixel_values[cur_image_idx]
-                    cur_image_idx += 1
-                    cur_new_inputs_embeds.append(cur_pixel_values)
-                    cur_new_labels.append(
-                        torch.full((cur_pixel_values.shape[0], ),
-                                IGNORE_INDEX,
-                                device=cur_labels.device,
-                                dtype=cur_labels.dtype))
-            cur_new_inputs_embeds = torch.cat(cur_new_inputs_embeds)
-            cur_new_labels = torch.cat(cur_new_labels)
-            new_inputs_embeds.append(cur_new_inputs_embeds)
-            new_labels.append(cur_new_labels)
-        # Combine them
-        max_len = max(x.shape[0] for x in new_inputs_embeds)
-        batch_size = len(new_inputs_embeds)
-        new_inputs_embeds_padded = []
-        new_labels_padded = torch.full((batch_size, max_len),
-                                    IGNORE_INDEX,
-                                    dtype=new_labels[0].dtype,
-                                    device=new_labels[0].device)
-        attention_mask = torch.zeros((batch_size, max_len),
-                                    dtype=attention_mask[0].dtype,
-                                    device=attention_mask[0].device)
-        position_ids = torch.zeros((batch_size, max_len),
-                                dtype=position_ids.dtype,
-                                device=position_ids.device)
-        for i, (cur_new_embed,
-                cur_new_labels) in enumerate(zip(new_inputs_embeds, new_labels)):
-            cur_len = cur_new_embed.shape[0]
-            new_inputs_embeds_padded.append(
-                torch.cat((cur_new_embed,
-                        torch.zeros((max_len - cur_len, cur_new_embed.shape[1]),
-                                    dtype=cur_new_embed.dtype,
-                                    device=cur_new_embed.device)),
-                        dim=0))
-            if cur_len > 0:
-                new_labels_padded[i, :cur_len] = cur_new_labels
-                attention_mask[i, :cur_len] = True
-                position_ids[i, :cur_len] = torch.arange(
-                    0,
-                    cur_len,
-                    dtype=position_ids.dtype,
-                    device=position_ids.device)
-        new_inputs_embeds = torch.stack(new_inputs_embeds_padded, dim=0)
-        if _labels is None:
-            new_labels = None
-        else:
-            new_labels = new_labels_padded
-        if _attention_mask is None:
-            attention_mask = None
-        elif isinstance(_attention_mask, list):
-            attention_mask = attention_mask.to(dtype=_attention_mask[0].dtype)
-        else:
-            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
-        if _position_ids is None:
-            position_ids = None
-        return {
-            'input_ids': None,
-            'position_ids': position_ids,
-            'attention_mask': attention_mask,
-            'past_key_values': past_key_values,
-            'inputs_embeds': new_inputs_embeds,
-            'labels': new_labels
-        }
-    def generate(
-        self,
-        texts: list[str],
-        videos: list[list[Image.Image]] = None,
-        pixel_values: torch.Tensor = None,
-        return_pixel_values: bool = False
-    ) -> list[str] | tuple[list[str], torch.Tensor]:
-        """Generate responses for video and text inputs.
-        Args:
-            texts (list[str]): list of text inputs
-            videos (list[list[Image.Image]], optional): list of frame lists. Defaults to None.
-            pixel_values (torch.Tensor, optional): precomputed pixel_values. Defaults to None.
-            return_pixel_values (bool, optional): whether to return pixel values or not. Defaults to False.
-        Returns:
-            list[str]: one decoded response per input text (returned together with the pixel values when `return_pixel_values` is True)
-        """
-        prediction = []
-        # Get visual embeddings
-        if pixel_values is None:
-            frames, split_sizes = [], []
-            for i in videos:
-                frames += i
-                split_sizes.append(len(i))
-            pixel_values = self.image_processor(frames, return_tensors='pt')['pixel_values'].to(self.torch_dtype).to(self.device_map)
-            pixel_values = self.visual_encoder(pixel_values, output_hidden_states=True).hidden_states[self.visual_select_layer]
-            pixel_values = self.projector(pixel_values, split_sizes)
-        for i, t in enumerate(texts):
-            et = self.chat_template.format(input=t).replace(DEFAULT_VIDEO_TOKEN, DEFAULT_IMAGE_TOKEN).split(DEFAULT_IMAGE_TOKEN)
-            assert len(et) == 2, f'Wrong input formats for {t}'
-            input_ids = [torch.tensor(self.tokenizer.encode(et[0]) + [IMAGE_TOKEN_INDEX] + self.tokenizer.encode(et[1], add_special_tokens=False), device=self.device_map)]
-            mm_inputs = self.prepare_inputs_labels_for_multimodal(
-                input_ids=input_ids,
-                pixel_values=pixel_values[i:i+1]
-            )
-            generate_output = self.llm.generate(
-                **mm_inputs,
-                **self.generation_args
-            )[0]
-            prediction.append(self.tokenizer.decode(generate_output, skip_special_tokens=True))
-        if return_pixel_values:
-            return prediction, pixel_values
-        else:
-            return prediction