yfan07 committed on Apr 19

Commit

f1106d1

verified ·

1 Parent(s): 51e0ad4

Add files using upload-large-folder tool

Browse files

Files changed (47) hide show

.gitattributes +0 -0
ChatUniVi/__init__.py +1 -0
ChatUniVi/config/__init__.py +17 -0
ChatUniVi/config/dataset_config.py +52 -0
ChatUniVi/config/model_config.py +24 -0
ChatUniVi/constants.py +19 -0
ChatUniVi/conversation.py +273 -0
ChatUniVi/demo.py +117 -0
ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness.py +199 -0
ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation.py +199 -0
ChatUniVi/eval/evaluate/evaluate_benchmark_3_context.py +199 -0
ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal.py +198 -0
ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency.py +206 -0
ChatUniVi/eval/evaluate/evaluate_gpt_review_visual.py +118 -0
ChatUniVi/eval/evaluate/evaluate_science_qa.py +142 -0
ChatUniVi/eval/evaluate/evaluate_video_qa.py +212 -0
ChatUniVi/eval/evaluate/summarize_gpt_review.py +67 -0
ChatUniVi/eval/model_coco_vqa.py +220 -0
ChatUniVi/eval/model_video_consistency.py +252 -0
ChatUniVi/eval/model_video_general.py +213 -0
ChatUniVi/eval/model_video_qa.py +232 -0
ChatUniVi/eval/model_vqa.py +125 -0
ChatUniVi/eval/model_vqa_scienceqa.py +164 -0
ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl +90 -0
ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_questions.jsonl +90 -0
ChatUniVi/eval/questions/coco_pope/coco_pope_adversarial.jsonl +0 -0
ChatUniVi/eval/questions/coco_pope/coco_pope_popular.jsonl +0 -0
ChatUniVi/eval/questions/coco_pope/coco_pope_random.jsonl +0 -0
ChatUniVi/eval/questions/scienceqa/pid_splits.json +0 -0
ChatUniVi/eval/questions/scienceqa/test_QCM-LEA.json +0 -0
ChatUniVi/eval/questions/video_qa/activitynet_a_list.json +2105 -0
ChatUniVi/eval/questions/video_qa/activitynet_qa.json +0 -0
ChatUniVi/eval/questions/video_qa/consistency_qa.json +0 -0
ChatUniVi/eval/questions/video_qa/generic_qa.json +0 -0
ChatUniVi/eval/questions/video_qa/msrvtt_a_list.json +4175 -0
ChatUniVi/eval/questions/video_qa/msrvtt_qa.json +0 -0
ChatUniVi/eval/questions/video_qa/msvd_a_list.json +1202 -0
ChatUniVi/mm_utils.py +73 -0
ChatUniVi/utils.py +126 -0
README.md +88 -0
SEG_LTPO_results.md +348 -0
load_model.py +877 -0
requirements.txt +76 -0
save_audio_feats.py +83 -0
save_sam_feats.py +96 -0
seg_ltpo.py +786 -0
train.py +534 -0

.gitattributes CHANGED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .model import ChatUniViLlamaForCausalLM

ChatUniVi/config/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from .dataset_config import *
+from .model_config import *
+# Registry from a stage name to the model hyperparameter dict defined in
+# model_config (brought in by the star import above).
+ModelConfig = {
+    "PRETUNE": model_config_pretune,
+    "FINETUNE": model_config_finetune,
+}
+# Registry from a dataset-mixture name to the list of dataset descriptor dicts
+# (defined in dataset_config) that make up that mixture.
+# NOTE(review): the "v1.5" entries presumably correspond to a v1.5 training recipe -- confirm.
+DataConfig = {
+    "Pretrain": [Pretrain, COCO_CAP, COCO_REG, COCO_REC],
+    "SQA": [SQA],
+    "FINETUNE": [VIT, MIMIC_imageonly, VIDEO],
+    "Pretrainv1.5": [Pretrain, Pretrain_valley_llava],
+    "FINETUNEv1.5": [VIT, VIDEO, LLaVA],
+}

ChatUniVi/config/dataset_config.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# Dataset descriptors. Every dict has a "chat_path" entry pointing at a JSON
+# annotation file; the remaining entries map a source name to a data directory.
+# "${PATH}" is a literal placeholder in these strings -- presumably to be replaced
+# with the local dataset root before training (TODO confirm how it is substituted).
+Pretrain = {
+    "chat_path": "${PATH}/CC3M-595K/chat.json",
+    "CC3M": "${PATH}/CC3M-595K",
+}
+VIT = {
+    "chat_path": "${PATH}/llava_instruct_150k.json",
+    "COCO2017": "${PATH}/COCO2017/train2017",
+}
+# NOTE(review): the key below is "CDG" while its directory is "CGD" -- one of the
+# two looks like a typo; verify against the code that reads these keys.
+MIMIC_imageonly = {
+    "chat_path": "${PATH}/MIMIC-IT-imageonly.json",
+    "CDG": "${PATH}/CGD/images",
+    "LA": "${PATH}/LA/images",
+    "SD": "${PATH}/SD/images",
+}
+# The three COCO descriptors share the same image directory and differ only in
+# their chat file.
+COCO_CAP = {
+    "chat_path": "${PATH}/COCO/coco_cap_chat.json",
+    "COCO2014": "${PATH}/COCO2014/train2014",
+}
+COCO_REG = {
+    "chat_path": "${PATH}/COCO/coco_reg_chat.json",
+    "COCO2014": "${PATH}/COCO2014/train2014",
+}
+COCO_REC = {
+    "chat_path": "${PATH}/COCO/coco_rec_chat.json",
+    "COCO2014": "${PATH}/COCO2014/train2014",
+}
+VIDEO = {
+    "chat_path": "${PATH}/video_chat.json",
+    "VIDEO": "${PATH}/Activity_Videos",
+}
+SQA = {
+    "chat_path": "${PATH}/llava_train_QCM-LEA.json",
+    "ScienceQA": "${PATH}/scienceqa/train",
+}
+# "valley" and "llava" point at the same directory here.
+Pretrain_valley_llava = {
+    "chat_path": "${PATH}/valley_llavaimage.json",
+    "valley": "${PATH}/Data",
+    "llava": "${PATH}/Data",  # from llava v1.5
+}
+LLaVA = {
+    "chat_path": "${PATH}/llavaimage_tune.json",
+    "llava": "${PATH}/Data",  # from llava v1.5
+}

ChatUniVi/config/model_config.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# Hyperparameters registered under "PRETUNE" in config/__init__.py.
+# NOTE(review): the trailing "# 0.25" / "# 0.5" comments on the spatial cluster
+# rates are not explained in this file; presumably they are ratios relative to
+# the previous stage -- confirm against the clustering code.
+model_config_pretune = {
+    "use_cluster": True,
+    "freeze": False,
+    "vision_tune": False,
+    "spatial_cluster_rate0": 64,  # 0.25
+    "spatial_cluster_rate1": 32,  # 0.5
+    "spatial_cluster_rate2": 16,  # 0.5
+    "temporal_cluster_rate": 1/16,
+}
+# Hyperparameters registered under "FINETUNE" in config/__init__.py. Identical to
+# the pretune values except for the extra "mm_tune" key, which the pretune dict
+# does not define.
+model_config_finetune = {
+    "use_cluster": True,
+    "freeze": False,
+    "mm_tune": True,
+    "vision_tune": False,
+    "spatial_cluster_rate0": 64,  # 0.25
+    "spatial_cluster_rate1": 32,  # 0.5
+    "spatial_cluster_rate2": 16,  # 0.5
+    "temporal_cluster_rate": 1/16,
+}

ChatUniVi/constants.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Serving-related values. NOTE(review): units are not stated in this file;
+# presumably seconds -- confirm against the controller/worker code.
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+LOGDIR = "."
+# Model Constants
+MAX_IMAGE_LENGTH = 64  # default max_frames for video frame sampling in demo.py
+IGNORE_INDEX = -100  # presumably the label value skipped by the loss -- TODO confirm
+IMAGE_TOKEN_INDEX = -200  # passed to tokenizer_image_token in demo.py as the image placeholder id
+# Placeholder / marker strings for images, videos and boxes.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+DEFAULT_VIDEO_PATCH_TOKEN = "<vid_patch>"
+DEFAULT_VID_START_TOKEN = "<vid_start>"
+DEFAULT_VID_END_TOKEN = "<vid_end>"
+DEFAULT_BOX_START_TOKEN = "<box_start>"
+DEFAULT_BOX_END_TOKEN = "<box_end>"

ChatUniVi/conversation.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    # Each member selects one rendering branch in Conversation.get_prompt.
+    SINGLE = auto()  # "role: message" turns, each followed by sep
+    TWO = auto()  # "role: message" turns, alternating sep and sep2
+    MPT = auto()  # role directly followed by the message, then sep
+    PLAIN = auto()  # message text only (no role names), alternating sep and sep2
+    LLAMA_2 = auto()  # "[INST] ... [/INST]" wrapping with a "<<SYS>>" block on the first turn
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+    skip_next: bool = False
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            if 'mmtag' in self.version:
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>\n" + init_msg)
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    if i == 0: message = wrap_sys(self.system) + message
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""
+            ret = ret.lstrip(self.sep)
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += message + seps[i % 2]
+                else:
+                    ret += ""
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+        return ret
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+    def get_images(self, return_pil=False):
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    from PIL import Image
+                    msg, image, image_process_mode = msg
+                    if image_process_mode == "Pad":
+                        def expand2square(pil_img, background_color=(122, 116, 104)):
+                            width, height = pil_img.size
+                            if width == height:
+                                return pil_img
+                            elif width > height:
+                                result = Image.new(pil_img.mode, (width, width), background_color)
+                                result.paste(pil_img, (0, (width - height) // 2))
+                                return result
+                            else:
+                                result = Image.new(pil_img.mode, (height, height), background_color)
+                                result.paste(pil_img, ((height - width) // 2, 0))
+                                return result
+                        image = expand2square(image)
+                    elif image_process_mode == "Crop":
+                        pass
+                    elif image_process_mode == "Resize":
+                        image = image.resize((336, 336))
+                    else:
+                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if H > W:
+                        H, W = longest_edge, shortest_edge
+                    else:
+                        H, W = shortest_edge, longest_edge
+                    image = image.resize((W, H))
+                    if return_pil:
+                        images.append(image)
+                    else:
+                        buffered = BytesIO()
+                        image.save(buffered, format="PNG")
+                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                        images.append(img_b64_str)
+        return images
+    def to_gradio_chatbot(self):
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    import base64
+                    from io import BytesIO
+                    msg, image, image_process_mode = msg
+                    max_hw, min_hw = max(image.size), min(image.size)
+                    aspect_ratio = max_hw / min_hw
+                    max_len, min_len = 800, 400
+                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                    longest_edge = int(shortest_edge * aspect_ratio)
+                    W, H = image.size
+                    if H > W:
+                        H, W = longest_edge, shortest_edge
+                    else:
+                        H, W = shortest_edge, longest_edge
+                    image = image.resize((W, H))
+                    buffered = BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
+                    ret.append([img_str, None])
+                    msg = msg.replace('<image>', '').strip()
+                    if len(msg) > 0:
+                        ret.append([msg, None])
+                else:
+                    ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+    def dict(self):
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# Built-in conversation templates. Their `roles` and `messages` are tuples, so a
+# template must be duplicated with .copy() (which rebuilds `messages` as a list)
+# before append_message is called on it.
+conv_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Template seeded with one example question/answer exchange; offset=2 makes
+# get_images / to_gradio_chatbot skip that seeded exchange.
+simple_qa = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions. "
+           " If you don't know the answer to a question, please don't share false information.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Is there a snowboard in the image?"),
+        ("Assistant", "Yes, there is a snowboard in the image, and the person is riding it down a snow-covered slope.")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Template seeded with a greeting exchange; offset=2 skips it in the same way.
+simple_conv = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Hi!"),
+        ("Assistant", "Hi there! How can I help you today?")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+default_conversation = simple_conv
+# Lookup table from template name to template; "default" and "simple" are the
+# same object.
+conv_templates = {
+    "default": simple_conv,
+    "simple": simple_conv,
+    "simpleqa": simple_qa,
+    "v1": conv_v1,
+}
+# Running the module directly prints the rendered default template.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())

ChatUniVi/demo.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import torch
+from .constants import *
+from .conversation import conv_templates, SeparatorStyle
+from .model.builder import load_pretrained_model
+from .utils import disable_torch_init
+from .mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
+from PIL import Image
+import os
+from decord import VideoReader, cpu
+import numpy as np
+class Chat:
+    def __init__(self, model_path, conv_mode="simple"):
+        disable_torch_init()
+        self.tokenizer, self.model, self.image_processor, context_len = load_pretrained_model(model_path, None, model_name="ChatUniVi")
+        mm_use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
+        mm_use_im_patch_token = getattr(self.model.config, "mm_use_im_patch_token", True)
+        if mm_use_im_patch_token:
+            self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        if mm_use_im_start_end:
+            self.tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+        self.model.resize_token_embeddings(len(self.tokenizer))
+        vision_tower = self.model.get_vision_tower()
+        if not vision_tower.is_loaded:
+            vision_tower.load_model()
+        self.image_processor = vision_tower.image_processor
+        self.conv_mode = conv_mode
+        print(self.model)
+    def get_prompt(self, qs, state):
+        state.append_message(state.roles[0], qs)
+        state.append_message(state.roles[1], None)
+        return state
+    def _get_rawvideo_dec(self, video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224,
+                          video_framerate=1, s=None, e=None):
+        if s is None:
+            start_time, end_time = None, None
+        else:
+            start_time = int(s)
+            end_time = int(e)
+            start_time = start_time if start_time >= 0. else 0.
+            end_time = end_time if end_time >= 0. else 0.
+            if start_time > end_time:
+                start_time, end_time = end_time, start_time
+            elif start_time == end_time:
+                end_time = start_time + 1
+        if os.path.exists(video_path):
+            vreader = VideoReader(video_path, ctx=cpu(0))
+        else:
+            print(video_path)
+            raise FileNotFoundError
+        fps = vreader.get_avg_fps()
+        f_start = 0 if start_time is None else int(start_time * fps)
+        f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
+        num_frames = f_end - f_start + 1
+        if num_frames > 0:
+            sample_fps = int(video_framerate)
+            t_stride = int(round(float(fps) / sample_fps))
+            all_pos = list(range(f_start, f_end + 1, t_stride))
+            if len(all_pos) > max_frames:
+                sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+            else:
+                sample_pos = all_pos
+            patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
+            return patch_images
+    @torch.inference_mode()
+    def generate(self, images_tensor: list, prompt: str, first_run: bool, state):
+        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor
+        state = self.get_prompt(prompt, state)
+        prompt = state.get_prompt()
+        print(prompt)
+        images_tensor = torch.stack(images_tensor, dim=0)
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        temperature = 0.2
+        max_new_tokens = 1024
+        stop_str = conv_templates[self.conv_mode].copy().sep if conv_templates[self.conv_mode].copy().sep_style != SeparatorStyle.TWO else \
+        conv_templates[self.conv_mode].copy().sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=images_tensor,
+                do_sample=True,
+                temperature=temperature,
+                num_beams=1,
+                max_new_tokens=max_new_tokens,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria])
+        input_token_len = input_ids.shape[1]
+        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+        if n_diff_input_output > 0:
+            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        outputs = outputs.strip()
+        print('response', outputs)
+        return outputs, state

ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
+def read_jsonl(file):
+    results = []
+    with open(file, encoding='utf-8') as f:
+        for item in jsonlines.Reader(f):
+            results.append(item)
+    return results
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
+def annotate(prediction_set, caption_files, output_dir):
+    """
+    Evaluates question and answer pairs using GPT-3
+    Returns a score for correctness.
+    """
+    for file in caption_files:
+        key = file[:-5] # Strip file extension
+        qa_set = prediction_set[key]
+        question = qa_set['q']
+        answer = qa_set['a']
+        pred = qa_set['pred']
+        try:
+            # Compute the correctness score
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "system",
+                        "content":
+                            "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
+                            "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
+                            "------"
+                            "##INSTRUCTIONS: "
+                            "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
+                            "- The predicted answer must be factually accurate and align with the video content.\n"
+                            "- Consider synonyms or paraphrases as valid matches.\n"
+                            "- Evaluate the factual accuracy of the prediction compared to the answer."
+                    },
+                    {
+                        "role": "user",
+                        "content":
+                            "Please evaluate the following video-based question-answer pair:\n\n"
+                            f"Question: {question}\n"
+                            f"Correct Answer: {answer}\n"
+                            f"Predicted Answer: {pred}\n\n"
+                            "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
+                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
+                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                            "For example, your response should look like this: {''score': 4.8}."
+                    }
+                ]
+            )
+            # Convert response to a Python dictionary.
+            response_message = completion["choices"][0]["message"]["content"]
+            response_dict = ast.literal_eval(response_message)
+            result_qa_pair = [response_dict, qa_set]
+            # Save the question-answer pairs to a json file.
+            with open(f"{output_dir}/{key}.json", "w") as f:
+                json.dump(result_qa_pair, f)
+        except Exception as e:
+            print(f"Error processing file '{key}': {e}")
+def main():
+    """
+    Main function to control the flow of the program.
+    """
+    # Parse arguments.
+    args = parse_args()
+    file = args.pred_path
+    try:
+        pred_contents = json.load(file)
+    except:
+        pred_contents = read_jsonl(file)
+    # Dictionary to store the count of occurrences for each video_id
+    video_id_counts = {}
+    new_pred_contents = []
+    # Iterate through each sample in pred_contents
+    for sample in pred_contents:
+        sample['video_name'] = 1
+        video_id = sample['video_name']
+        if video_id in video_id_counts:
+            video_id_counts[video_id] += 1
+        else:
+            video_id_counts[video_id] = 0
+        # Create a new sample with the modified key
+        new_sample = sample
+        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
+        new_pred_contents.append(new_sample)
+    # Generating list of id's and corresponding files
+    id_list = [x['video_name'] for x in new_pred_contents]
+    caption_files = [f"{id}.json" for id in id_list]
+    output_dir = args.output_dir
+    # Generate output directory if not exists.
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    # Preparing dictionary of question-answer sets
+    prediction_set = {}
+    for sample in new_pred_contents:
+        id = sample['video_name']
+        question = sample['prompt']
+        answer = sample['answer']
+        pred = sample['text']
+        qa_set = {"q": question, "a": answer, "pred": pred}
+        prediction_set[id] = qa_set
+    # Set the OpenAI API key.
+    openai.api_key = args.api_key
+    num_tasks = args.num_tasks
+    # While loop to ensure that all captions are processed.
+    while True:
+        try:
+            # Files that have not been processed yet.
+            completed_files = os.listdir(output_dir)
+            print(f"completed_files: {len(completed_files)}")
+            # Files that have not been processed yet.
+            incomplete_files = [f for f in caption_files if f not in completed_files]
+            print(f"incomplete_files: {len(incomplete_files)}")
+            # Break the loop when there are no incomplete files
+            if len(incomplete_files) == 0:
+                break
+            if len(incomplete_files) <= num_tasks:
+                num_tasks = 1
+            # Split tasks into parts.
+            part_len = len(incomplete_files) // num_tasks
+            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
+            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+            # Use a pool of workers to process the files in parallel.
+            with Pool() as pool:
+                pool.starmap(annotate, task_args)
+        except Exception as e:
+            print(f"Error: {e}")
+    # Combine all the processed files into one
+    combined_contents = {}
+    json_path = args.output_json
+    # Iterate through json files
+    for file_name in os.listdir(output_dir):
+        if file_name.endswith(".json"):
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, "r") as json_file:
+                content = json.load(json_file)
+                combined_contents[file_name[:-5]] = content
+    # Write combined content to a json file
+    with open(json_path, "w") as json_file:
+        json.dump(combined_contents, json_file)
+    print("All evaluation completed!")
+    # Calculate average score
+    score_sum = 0
+    count = 0
+    for key, result in combined_contents.items():
+        count += 1
+        score_match = result[0]['score']
+        score = int(score_match)
+        score_sum += score
+    average_score = score_sum / count
+    print("Average score for correctness:", average_score * 20)
+if __name__ == "__main__":
+    main()

ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
+def read_jsonl(file):
+    results = []
+    with open(file, encoding='utf-8') as f:
+        for item in jsonlines.Reader(f):
+            results.append(item)
+    return results
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
def annotate(prediction_set, caption_files, output_dir):
    """
    Score the detail orientation of predicted answers with gpt-3.5-turbo.

    For every entry of ``caption_files`` the matching question / answer /
    prediction is looked up in ``prediction_set``, sent to the OpenAI chat
    API, and the parsed reply is written to ``<output_dir>/<key>.json`` as
    the two-element list ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            'q', 'a' and 'pred'.
        caption_files: File names of the form ``<key>.json`` to process.
        output_dir: Directory that receives one JSON file per scored sample.

    Returns:
        None. A failure on one sample (API error, unparsable or malformed
        reply) is printed and no file is written for it, so the caller can
        detect the missing file and retry.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.
    """
    for file in caption_files:
        key = file[:-5] # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the detailed-orientation score
            # NOTE(review): the example at the end of the user prompt has
            # unbalanced quotes and a float although an integer is requested;
            # the prompt text is kept verbatim so scores stay comparable with
            # earlier runs.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                            "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            # Reject replies that parse but carry no score: saving them would
            # mark the sample as done and break the final averaging step.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected response format: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]
            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)
        except Exception as e:
            # Best effort: report and move on; the missing output file marks
            # this sample as still incomplete.
            print(f"Error processing file '{key}': {e}")
def main():
    """
    Score all predictions for detail orientation and print the average.

    Steps: load the prediction file (JSON or JSON Lines), give every sample
    a unique id, call ``annotate`` in a process pool until each sample has an
    output file in ``--output_dir``, merge those files into ``--output_json``
    and print the mean score scaled by 20.
    """
    # Parse arguments.
    args = parse_args()
    pred_path = args.pred_path
    # json.load() needs an open file object, not a path. Fall back to JSON
    # Lines when the file is not a single JSON document.
    try:
        with open(pred_path, encoding='utf-8') as pred_file:
            pred_contents = json.load(pred_file)
    except ValueError:
        pred_contents = read_jsonl(pred_path)
    if isinstance(pred_contents, dict):
        # A one-line JSON Lines file parses as a single JSON object.
        pred_contents = [pred_contents]

    # NOTE: any existing 'video_name' is overwritten; every sample shares the
    # constant video id 1 and is told apart by its running index, giving the
    # ids "1_0", "1_1", ...
    video_id = 1
    for index, sample in enumerate(pred_contents):
        sample['video_name'] = f"{video_id}_{index}"

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in pred_contents:
        sample_id = sample['video_name']
        prediction_set[sample_id] = {
            "q": sample['prompt'],
            "a": sample['answer'],
            "pred": sample['text'],
        }

    # Generating list of corresponding output files
    caption_files = [f"{sample_id}.json" for sample_id in prediction_set]

    # Generate output directory if not exists.
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A split count below 1 would make the chunking below fail on every pass.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)
        except Exception as e:
            # Keep retrying: the next pass only picks up what is still missing.
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was scored (e.g. empty prediction file); avoid dividing by zero.
        print("No evaluation results found; cannot compute an average score.")
        return
    average_score = sum(scores) / len(scores)
    # Scores are on a 0-5 scale; multiplying by 20 maps them to 0-100.
    print("Average score for detailed orientation:", average_score * 20)


if __name__ == "__main__":
    main()

ChatUniVi/eval/evaluate/evaluate_benchmark_3_context.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
def read_jsonl(file):
    """Return every record of the UTF-8 JSON Lines file at ``file`` as a list."""
    with open(file, encoding='utf-8') as handle:
        return [record for record in jsonlines.Reader(handle)]
def parse_args():
    """
    Parse the command-line options of the evaluation script.

    All five options are required; ``--num_tasks`` is converted to ``int``,
    the others stay strings.
    """
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # (flag, help text) for the plain string options, in declaration order.
    string_options = (
        ("--pred_path", "The path to file containing prediction."),
        ("--output_dir", "The path to save annotation json files."),
        ("--output_json", "The path to save annotation final combined json file."),
        ("--api_key", "OpenAI API key."),
    )
    for flag, help_text in string_options:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    return parser.parse_args()
def annotate(prediction_set, caption_files, output_dir):
    """
    Score the contextual understanding of predicted answers with gpt-3.5-turbo.

    For every entry of ``caption_files`` the matching question / answer /
    prediction is looked up in ``prediction_set``, sent to the OpenAI chat
    API, and the parsed reply is written to ``<output_dir>/<key>.json`` as
    the two-element list ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            'q', 'a' and 'pred'.
        caption_files: File names of the form ``<key>.json`` to process.
        output_dir: Directory that receives one JSON file per scored sample.

    Returns:
        None. A failure on one sample (API error, unparsable or malformed
        reply) is printed and no file is written for it, so the caller can
        detect the missing file and retry.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.
    """
    for file in caption_files:
        key = file[:-5] # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the contextual understanding score
            # NOTE(review): the example at the end of the user prompt has
            # unbalanced quotes and a float although an integer is requested;
            # the prompt text is kept verbatim so scores stay comparable with
            # earlier runs.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
                            "- The predicted answer must capture the main themes and sentiments of the video.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            # Reject replies that parse but carry no score: saving them would
            # mark the sample as done and break the final averaging step.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected response format: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]
            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)
        except Exception as e:
            # Best effort: report and move on; the missing output file marks
            # this sample as still incomplete.
            print(f"Error processing file '{key}': {e}")
def main():
    """
    Score all predictions for contextual understanding and print the average.

    Steps: load the prediction file (JSON or JSON Lines), give every sample
    a unique id, call ``annotate`` in a process pool until each sample has an
    output file in ``--output_dir``, merge those files into ``--output_json``
    and print the mean score scaled by 20.
    """
    # Parse arguments.
    args = parse_args()
    pred_path = args.pred_path
    # json.load() needs an open file object, not a path. Fall back to JSON
    # Lines when the file is not a single JSON document.
    try:
        with open(pred_path, encoding='utf-8') as pred_file:
            pred_contents = json.load(pred_file)
    except ValueError:
        pred_contents = read_jsonl(pred_path)
    if isinstance(pred_contents, dict):
        # A one-line JSON Lines file parses as a single JSON object.
        pred_contents = [pred_contents]

    # NOTE: any existing 'video_name' is overwritten; every sample shares the
    # constant video id 1 and is told apart by its running index, giving the
    # ids "1_0", "1_1", ...
    video_id = 1
    for index, sample in enumerate(pred_contents):
        sample['video_name'] = f"{video_id}_{index}"

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in pred_contents:
        sample_id = sample['video_name']
        prediction_set[sample_id] = {
            "q": sample['prompt'],
            "a": sample['answer'],
            "pred": sample['text'],
        }

    # Generating list of corresponding output files
    caption_files = [f"{sample_id}.json" for sample_id in prediction_set]

    # Generate output directory if not exists.
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A split count below 1 would make the chunking below fail on every pass.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)
        except Exception as e:
            # Keep retrying: the next pass only picks up what is still missing.
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was scored (e.g. empty prediction file); avoid dividing by zero.
        print("No evaluation results found; cannot compute an average score.")
        return
    average_score = sum(scores) / len(scores)
    # Scores are on a 0-5 scale; multiplying by 20 maps them to 0-100.
    print("Average score for contextual understanding:", average_score * 20)


if __name__ == "__main__":
    main()

ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
def read_jsonl(file):
    """Read the UTF-8 JSON Lines file at ``file`` and return its records as a list."""
    with open(file, encoding='utf-8') as stream:
        records = [entry for entry in jsonlines.Reader(stream)]
    return records
def parse_args():
    """
    Parse the command-line options of the evaluation script.

    All five options are required; ``--num_tasks`` is converted to ``int``,
    the others stay strings.
    """
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Each entry: (flag, extra keyword arguments); order matches the help output.
    option_specs = [
        ("--pred_path", {"help": "The path to file containing prediction."}),
        ("--output_dir", {"help": "The path to save annotation json files."}),
        ("--output_json", {"help": "The path to save annotation final combined json file."}),
        ("--api_key", {"help": "OpenAI API key."}),
        ("--num_tasks", {"type": int, "help": "Number of splits."}),
    ]
    for flag, extra in option_specs:
        parser.add_argument(flag, required=True, **extra)
    parsed = parser.parse_args()
    return parsed
def annotate(prediction_set, caption_files, output_dir):
    """
    Score the temporal understanding of predicted answers with gpt-3.5-turbo.

    For every entry of ``caption_files`` the matching question / answer /
    prediction is looked up in ``prediction_set``, sent to the OpenAI chat
    API, and the parsed reply is written to ``<output_dir>/<key>.json`` as
    the two-element list ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            'q', 'a' and 'pred'.
        caption_files: File names of the form ``<key>.json`` to process.
        output_dir: Directory that receives one JSON file per scored sample.

    Returns:
        None. A failure on one sample (API error, unparsable or malformed
        reply) is printed and no file is written for it, so the caller can
        detect the missing file and retry.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.
    """
    for file in caption_files:
        key = file[:-5] # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the temporal understanding score
            # NOTE(review): the example at the end of the user prompt has
            # unbalanced quotes and a float although an integer is requested;
            # the prompt text is kept verbatim so scores stay comparable with
            # earlier runs.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
                            "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
                            "- Evaluate the temporal accuracy of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            # Reject replies that parse but carry no score: saving them would
            # mark the sample as done and break the final averaging step.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected response format: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]
            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)
        except Exception as e:
            # Best effort: report and move on; the missing output file marks
            # this sample as still incomplete.
            print(f"Error processing file '{key}': {e}")
def main():
    """
    Score all predictions for temporal understanding and print the average.

    Steps: load the prediction file (JSON or JSON Lines), give every sample
    a unique id, call ``annotate`` in a process pool until each sample has an
    output file in ``--output_dir``, merge those files into ``--output_json``
    and print the mean score scaled by 20.
    """
    # Parse arguments.
    args = parse_args()
    pred_path = args.pred_path
    # json.load() needs an open file object, not a path. Fall back to JSON
    # Lines when the file is not a single JSON document.
    try:
        with open(pred_path, encoding='utf-8') as pred_file:
            pred_contents = json.load(pred_file)
    except ValueError:
        pred_contents = read_jsonl(pred_path)
    if isinstance(pred_contents, dict):
        # A one-line JSON Lines file parses as a single JSON object.
        pred_contents = [pred_contents]

    # NOTE: any existing 'video_name' is overwritten; every sample shares the
    # constant video id 1 and is told apart by its running index, giving the
    # ids "1_0", "1_1", ...
    video_id = 1
    for index, sample in enumerate(pred_contents):
        sample['video_name'] = f"{video_id}_{index}"

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in pred_contents:
        sample_id = sample['video_name']
        prediction_set[sample_id] = {
            "q": sample['prompt'],
            "a": sample['answer'],
            "pred": sample['text'],
        }

    # Generating list of corresponding output files
    caption_files = [f"{sample_id}.json" for sample_id in prediction_set]

    # Generate output directory if not exists.
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A split count below 1 would make the chunking below fail on every pass.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)
        except Exception as e:
            # Keep retrying: the next pass only picks up what is still missing.
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was scored (e.g. empty prediction file); avoid dividing by zero.
        print("No evaluation results found; cannot compute an average score.")
        return
    average_score = sum(scores) / len(scores)
    # Scores are on a 0-5 scale; multiplying by 20 maps them to 0-100.
    print("Average score temporal understanding:", average_score * 20)


if __name__ == "__main__":
    main()

ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
def read_jsonl(file):
    """Collect all records of the UTF-8 JSON Lines file at ``file`` into a list."""
    with open(file, encoding='utf-8') as source:
        reader = jsonlines.Reader(source)
        return [row for row in reader]
def parse_args():
    """
    Parse the command-line options of the evaluation script.

    All five options are required; ``--num_tasks`` is converted to ``int``,
    the others stay strings.
    """
    cli = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Help texts of the plain string options, keyed by flag in declaration order.
    path_and_key_help = {
        "--pred_path": "The path to file containing prediction.",
        "--output_dir": "The path to save annotation json files.",
        "--output_json": "The path to save annotation final combined json file.",
        "--api_key": "OpenAI API key.",
    }
    for flag, description in path_and_key_help.items():
        cli.add_argument(flag, required=True, help=description)
    cli.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    return cli.parse_args()
def annotate(prediction_set, caption_files, output_dir):
    """
    Score the consistency of paired predicted answers with gpt-3.5-turbo.

    For every entry of ``caption_files`` the matching pair of questions, the
    shared answer and the two predictions are looked up in
    ``prediction_set``, sent to the OpenAI chat API, and the parsed reply is
    written to ``<output_dir>/<key>.json`` as the two-element list
    ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            'q1', 'q2', 'a', 'pred1' and 'pred2'.
        caption_files: File names of the form ``<key>.json`` to process.
        output_dir: Directory that receives one JSON file per scored sample.

    Returns:
        None. A failure on one sample (API error, unparsable or malformed
        reply) is printed and no file is written for it, so the caller can
        detect the missing file and retry.

    Raises:
        KeyError: If a key derived from ``caption_files`` is missing from
            ``prediction_set``.
    """
    for file in caption_files:
        key = file[:-5] # Strip file extension
        qa_set = prediction_set[key]
        question1 = qa_set['q1']
        question2 = qa_set['q2']
        answer = qa_set['a']
        pred1 = qa_set['pred1']
        pred2 = qa_set['pred2']
        try:
            # Compute the consistency score
            # NOTE(review): the example at the end of the user prompt has
            # unbalanced quotes and a float although an integer is requested;
            # the prompt text is kept verbatim so scores stay comparable with
            # earlier runs.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
                            "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
                            "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
                            "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
                            "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
                            "- Evaluate the consistency of the two predicted answers compared to the correct answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question 1: {question1}\n"
                            f"Question 2: {question2}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer to Question 1: {pred1}\n"
                            f"Predicted Answer to Question 2: {pred2}\n\n"
                            "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ],
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            # Reject replies that parse but carry no score: saving them would
            # mark the sample as done and break the final averaging step.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"unexpected response format: {response_message!r}")
            result_qa_pair = [response_dict, qa_set]
            # Save the question-answer pairs to a json file.
            with open(os.path.join(output_dir, f"{key}.json"), "w") as f:
                json.dump(result_qa_pair, f)
        except Exception as e:
            # Best effort: report and move on; the missing output file marks
            # this sample as still incomplete.
            print(f"Error processing file '{key}': {e}")
def main():
    """
    Score all prediction pairs for consistency and print the average.

    Steps: load the prediction file (JSON or JSON Lines), give every sample
    a unique id, call ``annotate`` in a process pool until each sample has an
    output file in ``--output_dir``, merge those files into ``--output_json``
    and print the mean score scaled by 20.
    """
    # Parse arguments.
    args = parse_args()
    pred_path = args.pred_path
    # json.load() needs an open file object, not a path. Fall back to JSON
    # Lines when the file is not a single JSON document.
    try:
        with open(pred_path, encoding='utf-8') as pred_file:
            pred_contents = json.load(pred_file)
    except ValueError:
        pred_contents = read_jsonl(pred_path)
    if isinstance(pred_contents, dict):
        # A one-line JSON Lines file parses as a single JSON object.
        pred_contents = [pred_contents]

    # NOTE: the sample's own 'video_name' is not consulted; every sample
    # shares the constant video id 1 and is told apart by its running index,
    # giving the ids "1_0", "1_1", ...
    video_id = 1
    for index, sample in enumerate(pred_contents):
        sample['video_name'] = f"{video_id}_{index}"

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in pred_contents:
        sample_id = sample['video_name']
        prediction_set[sample_id] = {
            "q1": sample['prompt_1'],
            "q2": sample['prompt_2'],
            "a": sample['answer'],
            "pred1": sample['text_1'],
            "pred2": sample['text_2'],
        }

    # Generating list of corresponding output files
    caption_files = [f"{sample_id}.json" for sample_id in prediction_set]

    # Generate output directory if not exists.
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A split count below 1 would make the chunking below fail on every pass.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)
        except Exception as e:
            # Keep retrying: the next pass only picks up what is still missing.
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    scores = [int(result[0]['score']) for result in combined_contents.values()]
    if not scores:
        # Nothing was scored (e.g. empty prediction file); avoid dividing by zero.
        print("No evaluation results found; cannot compute an average score.")
        return
    average_score = sum(scores) / len(scores)
    # Scores are on a 0-5 scale; multiplying by 20 maps them to 0-100.
    print("Average score for consistency:", average_score * 20)


if __name__ == "__main__":
    main()

ChatUniVi/eval/evaluate/evaluate_gpt_review_visual.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import argparse
+import json
+import os
+import requests
+import openai
+import time
# Pause, in seconds, before retrying a failed OpenAI request.
NUM_SECONDS_TO_SLEEP = 0.5


def get_eval(content: str, max_tokens: int):
    """
    Ask the OpenAI chat API to review ``content`` and return the reply text.

    The request is repeated until it succeeds: rate-limit errors are retried
    silently, any other exception is printed first. Each failed attempt is
    followed by a pause of ``NUM_SECONDS_TO_SLEEP`` seconds.

    Args:
        content: Text sent as the user message.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        The message content of the first choice in the API response.
    """
    conversation = [{
        'role': 'system',
        'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
    }, {
        'role': 'user',
        'content': content,
    }]
    while True:
        try:
            reply = openai.ChatCompletion.create(
                model='gpt-4-0314',  # gpt-4-0314
                messages=conversation,
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
        except openai.error.RateLimitError:
            # Rate limited: wait and try again without logging.
            pass
        except Exception as err:
            print(err)
        else:
            return reply['choices'][0]['message']['content']
        time.sleep(NUM_SECONDS_TO_SLEEP)
def parse_score(review):
    """Extract the pair of scores from the first line of a judge review.

    The first line is expected to hold exactly two numbers separated by
    whitespace and/or commas (``"8 9"``, ``"8,9"`` or ``"8, 9"``).

    Args:
        review: Full text returned by the judge model.

    Returns:
        ``[score_1, score_2]`` as floats, or ``[-1, -1]`` when the first line
        cannot be parsed (the review is printed in that case).
    """
    try:
        first_line = review.split('\n')[0]
        # split() without an argument collapses runs of whitespace, so a
        # comma followed by a space does not produce an empty token.
        tokens = first_line.replace(',', ' ').split()
        if len(tokens) == 2:
            return [float(tokens[0]), float(tokens[1])]
        print('error', review)
        return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]
if __name__ == '__main__':
    # Pairwise review of two answer files: for every question the judge model
    # receives the image context (captions + boxes), the question and both
    # answers, and its review is appended to the output JSONL file. Reviews
    # already present in the output file are not requested again.
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    # Expand "~" once so that the existence check and the append below refer
    # to the same file.
    output_path = os.path.expanduser(args.output)

    with open(os.path.expanduser(args.rule), 'r') as f_rule:
        rule_dict = json.load(f_rule)

    if os.path.isfile(output_path):
        with open(output_path) as f_prev:
            cur_reviews = [json.loads(line) for line in f_prev]
    else:
        cur_reviews = []

    with open(os.path.expanduser(args.context)) as f_ctx:
        context_list = [json.loads(line) for line in f_ctx]
    image_to_context = {context['image']: context for context in context_list}

    with open(os.path.expanduser(args.question)) as f_q, \
            open(os.path.expanduser(args.answer_list[0])) as f_ans1, \
            open(os.path.expanduser(args.answer_list[1])) as f_ans2, \
            open(output_path, 'a') as review_file:
        # The three files are read in lockstep: line i of each one is assumed
        # to belong to the same question.
        for idx, (ques_js, ans1_js, ans2_js) in enumerate(zip(f_q, f_ans1, f_ans2)):
            ques = json.loads(ques_js)
            ans1 = json.loads(ans1_js)
            ans2 = json.loads(ans2_js)

            inst = image_to_context[ques['image']]
            cap_str = '\n'.join(inst['captions'])
            box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

            category = ques['category']
            if category not in rule_dict:
                raise ValueError(f"Visual QA category not found in rule file: {category}.")
            rule = rule_dict[category]
            prompt = rule['prompt']
            role = rule['role']
            content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
                       f'[Question]\n{ques["text"]}\n\n'
                       f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                       f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                       f'[System]\n{prompt}\n\n')
            cur_js = {
                'id': idx + 1,
                'question_id': ques['question_id'],
                # Fall back to the question id only when no answer id exists.
                'answer1_id': ans1['answer_id'] if 'answer_id' in ans1 else ans1['question_id'],
                'answer2_id': ans2['answer_id'] if 'answer_id' in ans2 else ans2['question_id'],
                'category': category
            }
            if idx >= len(cur_reviews):
                review = get_eval(content, args.max_tokens)
                scores = parse_score(review)
                cur_js['content'] = review
                cur_js['tuple'] = scores
                review_file.write(json.dumps(cur_js) + '\n')
                # Flush after each review so an interrupted run can resume.
                review_file.flush()
            else:
                print(f'Skipping {idx} as we already have it.')
            print(idx + 1)

ChatUniVi/eval/evaluate/evaluate_science_qa.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import argparse
+import json
+import os
+import re
+import random
+import numpy as np
def get_args():
    """Parse the command-line options of the ScienceQA scoring script.

    Returns:
        The ``argparse.Namespace`` with ``base_dir``, ``result_file``,
        ``output_file``, ``output_result``, ``split`` and ``options``.
    """
    parser = argparse.ArgumentParser()
    # Plain string options without a default value.
    for flag in ('--base-dir', '--result-file', '--output-file', '--output-result'):
        parser.add_argument(flag, type=str)
    parser.add_argument('--split', type=str, default='test')
    # type=list turns a command-line string into a list of its characters.
    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    parsed = parser.parse_args()
    return parsed
def convert_caps(results):
    """Convert prediction records into caption-style records.

    Each input record must provide ``question_id`` and ``text``; the output
    record stores them as an integer ``image_id`` and a ``caption``.
    """
    def as_caption(entry):
        question_id, text = entry['question_id'], entry['text']
        return {"image_id": int(question_id), "caption": text}

    return [as_caption(entry) for entry in results]
def get_pred_idx(prediction, choices, options):
    """
    Map a predicted option letter (e.g. 'C') to its index (e.g. 2).

    Only the first ``len(choices)`` entries of ``options`` are accepted; for
    anything else a random index into ``choices`` is returned.
    """
    accepted = options[:len(choices)]
    if prediction not in accepted:
        # Unusable prediction: fall back to a random guess.
        return random.choice(range(len(choices)))
    return options.index(prediction)
if __name__ == "__main__":
    # Score ScienceQA predictions: match each problem of the chosen split with
    # its prediction, derive the chosen option index, and report the overall
    # accuracy plus break-downs by subject, context modality and grade.
    args = get_args()

    base_dir = args.base_dir
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[args.split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)
    with open(args.result_file) as f:
        predictions = [json.loads(line) for line in f]
    predictions = {pred['question_id']: pred for pred in predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    results = {'correct': [], 'incorrect': []}
    sqa_results = {}
    sqa_results['acc'] = None
    sqa_results['correct'] = None
    sqa_results['count'] = None
    sqa_results['results'] = {}
    sqa_results['outputs'] = {}
    # Per-group lists of 0/1 outcomes.
    for group in ('NAT', 'SOC', 'LAN', 'TXT', 'IMG', 'NO', 'G1-6', 'G7-12'):
        sqa_results[group] = []

    # Compiled once instead of once per problem.
    # NOTE(review): the trailing "." is unescaped and therefore matches any
    # character, not only a full stop; left as-is to keep scores comparable.
    pattern = re.compile(r'The answer is ([A-Z]).')

    for prob_id, prob in split_problems.items():
        if prob_id not in predictions:
            continue
        pred = predictions[prob_id]
        pred_text = pred['text']

        res = pattern.findall(pred_text)
        if len(res) == 1:
            answer = res[0]  # 'A', 'B', ...
        else:
            answer = pred['pred']

        # get_pred_idx may fall back to a random guess, so it is called once
        # and the same index is used for both scoring and the saved results.
        pred_idx = get_pred_idx(answer, prob['choices'], args.options)

        analysis = {
            'question_id': prob_id,
            'parsed_ans': answer,
            'ground_truth': args.options[prob['answer']],
            'question': pred['prompt'],
            'pred': pred_text,
            'is_multimodal': '<image>' in pred['prompt'],
        }

        sqa_results['results'][prob_id] = pred_idx
        sqa_results['outputs'][prob_id] = pred_text

        if pred_idx == prob['answer']:
            results['correct'].append(analysis)
            cur_result = 1
        else:
            results['incorrect'].append(analysis)
            cur_result = 0

        if prob['subject'] == 'natural science':
            sqa_results['NAT'].append(cur_result)
        elif prob['subject'] == 'social science':
            sqa_results['SOC'].append(cur_result)
        elif prob['subject'] == 'language science':
            sqa_results['LAN'].append(cur_result)

        if prob['hint']:
            sqa_results['TXT'].append(cur_result)
        if prob['image']:
            sqa_results['IMG'].append(cur_result)
        if not prob['hint'] and not prob['image']:
            sqa_results['NO'].append(cur_result)

        if prob['grade'] in ['grade1', 'grade2', 'grade3', 'grade4', 'grade5', 'grade6']:
            sqa_results['G1-6'].append(cur_result)
        elif prob['grade'] in ['grade7', 'grade8', 'grade9', 'grade10', 'grade11', 'grade12']:
            sqa_results['G7-12'].append(cur_result)

    correct = len(results['correct'])
    total = len(results['correct']) + len(results['incorrect'])
    if total == 0:
        print('Warning: no prediction matched a problem of the selected split.')
    # Avoid a ZeroDivisionError when nothing was scored.
    accuracy = correct / total * 100 if total else 0.0
    print(f'Total: {total}, Correct: {correct}, Accuracy: {accuracy:.2f}%')

    breakdown = [
        ('Subject NAT', 'NAT'),
        ('Subject SOC', 'SOC'),
        ('Subject LAN', 'LAN'),
        ('Context Modality TXT', 'TXT'),
        ('Context Modality IMG', 'IMG'),
        ('Context Modality NO', 'NO'),
        ('Grade G1-6', 'G1-6'),
        ('Grade G7-12', 'G7-12'),
    ]
    for label, key in breakdown:
        outcomes = sqa_results[key]
        # An empty group is reported as nan, without numpy's empty-mean warning.
        group_acc = np.mean(outcomes) * 100 if outcomes else float('nan')
        print(f'{label}: {len(outcomes)}, Correct: {sum(outcomes)}, Accuracy: {group_acc:.2f}%')

    sqa_results['acc'] = accuracy
    sqa_results['correct'] = correct
    sqa_results['count'] = total

    with open(args.output_file, 'w') as f:
        json.dump(results, f, indent=2)
    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)

ChatUniVi/eval/evaluate/evaluate_video_qa.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import openai
+import os
+import argparse
+import json
+import jsonlines
+import ast
+from multiprocessing.pool import Pool
def read_jsonl(file):
    """Read a UTF-8 JSON Lines file and return its records as a list."""
    with open(file, encoding='utf-8') as handle:
        return list(jsonlines.Reader(handle))
def parse_args():
    """Parse the command-line options of the video QA evaluation script.

    All options are mandatory; ``--num_tasks`` is converted to ``int``.
    """
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    required_strings = (
        ("--pred_path", "The path to file containing prediction."),
        ("--output_dir", "The path to save annotation json files."),
        ("--output_json", "The path to save annotation final combined json file."),
        ("--api_key", "OpenAI API key."),
    )
    for flag, help_text in required_strings:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    return parser.parse_args()
def annotate(prediction_set, caption_files, output_dir):
    """
    Grade each listed question/answer/prediction triple with gpt-3.5-turbo.

    For every name in ``caption_files`` the matching entry of
    ``prediction_set`` (keyed by the name without its ".json" suffix) is sent
    to the model, the reply is parsed as a Python dictionary literal, and
    ``[reply_dict, qa_set]`` is written to ``{output_dir}/{key}.json``.
    Failures are printed and the entry is skipped (no file is written).
    """
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
        "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
        "- Consider synonyms or paraphrases as valid matches.\n"
        "- Evaluate the correctness of the prediction compared to the answer."
    )
    for file_name in caption_files:
        key = file_name[:-5]  # drop the ".json" suffix
        qa_set = prediction_set[key]
        question, answer, pred = qa_set['q'], qa_set['a'], qa_set['pred']
        try:
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answer: {answer}\n"
                f"Predicted Answer: {pred}\n\n"
                "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is  a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
            )
            # Ask the model for the correctness verdict.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            )
            # The reply is expected to be a Python dictionary literal.
            reply_text = completion["choices"][0]["message"]["content"]
            verdict = ast.literal_eval(reply_text)
            # Persist the verdict next to the graded triple.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump([verdict, qa_set], f)
        except Exception as e:
            print(f"Error processing file '{key}': {e}")
def main():
    """
    Run the GPT-based video QA evaluation end to end.

    Steps: load the predictions, give every sample a unique id, grade all
    samples in parallel with ``annotate`` (repeating until every sample has a
    result file in ``--output_dir``), merge the result files into
    ``--output_json`` and print accuracy and average score.
    """
    # Parse arguments.
    args = parse_args()

    # The prediction file may be a single JSON document or JSON Lines.
    pred_path = args.pred_path
    try:
        with open(pred_path, encoding='utf-8') as pred_file:
            pred_contents = json.load(pred_file)
    except json.JSONDecodeError:
        pred_contents = read_jsonl(pred_path)
    if isinstance(pred_contents, dict):
        # A one-line JSON Lines file parses as a single record.
        pred_contents = [pred_contents]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        # NOTE(review): every sample is given the same video name, so the ids
        # below become "1_0", "1_1", ...; any 'video_name' present in the
        # input is discarded. Kept as-is; confirm that this is intended.
        sample['video_name'] = 1
        video_id = sample['video_name']
        # First occurrence gets suffix 0, the next one 1, and so on.
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # The sample is modified in place and re-used under its new id.
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{sample_id}.json" for sample_id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['prompt'], "a": sample['answer'], "pred": sample['text']}
        for sample in new_pred_contents
    }

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    # A value below 1 would make the split below divide by zero on every
    # pass of the retry loop, which would then never terminate.
    num_tasks = max(1, args.num_tasks)

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed (set: O(1) membership).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                combined_contents[file_name[:-5]] = json.load(json_file)

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score and accuracy
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for result in combined_contents.values():
        # Computing score
        count += 1
        score_sum += int(result[0]['score'])

        # Computing accuracy
        verdict = result[0]['pred'].lower()
        if "yes" in verdict:
            yes_count += 1
        elif "no" in verdict:
            no_count += 1

    # Guard the divisions: there may be no results or no yes/no verdicts.
    average_score = score_sum / count if count else 0.0
    judged = yes_count + no_count
    accuracy = yes_count / judged if judged else 0.0
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)


if __name__ == "__main__":
    main()

ChatUniVi/eval/evaluate/summarize_gpt_review.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import json
+import os
+from collections import defaultdict
+import numpy as np
+import argparse
def parse_args():
    """Parse the command-line options of the review summary script.

    Returns:
        Namespace with ``dir`` (string or None) and the optional lists
        ``files`` and ``ignore`` (None when the option is not given).
    """
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-d', '--dir', default=None)
    # Both list options accept zero or more values.
    for short_flag, long_flag in (('-f', '--files'), ('-i', '--ignore')):
        parser.add_argument(short_flag, long_flag, nargs='*', default=None)
    parsed = parser.parse_args()
    return parsed
if __name__ == '__main__0':
    # The guard compares against '__main__0', so this variant never runs when
    # the file is executed as a script. It summarises several review files,
    # printing per category the second mean score relative to the first.
    args = parse_args()

    if args.ignore is not None:
        args.ignore = list(map(int, args.ignore))

    if args.files:
        review_files = args.files
    else:
        known_prefixes = ('gpt4_text', 'reviews_', 'review_')
        review_files = [
            name for name in os.listdir(args.dir)
            if name.endswith('.jsonl') and name.startswith(known_prefixes)
        ]

    for review_file in sorted(review_files):
        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
        scores = defaultdict(list)
        print(config)
        review_path = review_file if args.dir is None else os.path.join(args.dir, review_file)
        with open(review_path) as f:
            for review_str in f:
                review = json.loads(review_str)
                if args.ignore is not None and review['question_id'] in args.ignore:
                    continue
                if 'category' in review:
                    scores[review['category']].append(review['tuple'])
                    scores['all'].append(review['tuple'])
                elif 'tuple' in review:
                    scores['all'].append(review['tuple'])
                else:
                    scores['all'].append(review['score'])
        for k in sorted(scores):
            mean_pair = np.asarray(scores[k]).mean(0).tolist()
            mean_pair = [round(x, 3) for x in mean_pair]
            print(k, round(mean_pair[1]/mean_pair[0]*100, 1))
        print('=================================')
if __name__ == '__main__':
    # Summarise one review file (passed through -d/--dir): per category and
    # overall, print the mean score pair and the second score as a percentage
    # of the first.
    args = parse_args()

    scores = defaultdict(list)
    print(f'GPT-4 vs. {args.dir}')
    with open(args.dir) as f:
        for review_str in f:
            review = json.loads(review_str)
            for bucket in (review['category'], 'all'):
                scores[bucket].append(review['tuple'])

    for category, pairs in scores.items():
        mean_pair = [round(x, 3) for x in np.asarray(pairs).mean(0).tolist()]
        relative = round(mean_pair[1]/mean_pair[0]*100, 1)
        print(category, mean_pair, relative)
    print('=================================')

ChatUniVi/eval/model_coco_vqa.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import *
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+from abc import ABC
+import numpy as np
+import jsonlines
def get_acc(file):
    """Compute yes/no accuracy statistics for an answers JSONL file.

    Every record must provide ``text`` (the prediction) and ``label`` (the
    ground truth). A prediction counts as "yes" when it contains "Yes" or
    "yes", otherwise as "no" when it contains "No" or "no", otherwise as a
    failure. Blank lines are skipped.

    Args:
        file: Path of the JSON Lines file to score.

    Returns:
        A dict with the keys ``acc``, ``yes``, ``no``, ``fail``,
        ``precision``, ``recall`` and ``F1-score``. A ratio whose denominator
        is zero is reported as 0.0 instead of raising ``ZeroDivisionError``.
        The dict is also printed together with the file name.
    """
    def mentions(text, word):
        # Case handling mirrors the two spellings the model can emit.
        return word.capitalize() in text or word in text

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    acc, num = 0, 0
    yes, no, fail = 0, 0, 0
    tp, fp, fn = 0, 0, 0
    with open(file, "r", encoding="utf8") as f:
        for raw_line in f:
            if not raw_line.strip():
                continue
            item = json.loads(raw_line)
            num += 1
            if mentions(item["text"], "yes"):
                yes += 1
                if mentions(item["label"], "yes"):
                    acc += 1
                    tp += 1
                else:
                    fp += 1
            elif mentions(item["text"], "no"):
                no += 1
                if mentions(item["label"], "no"):
                    acc += 1
                else:
                    fn += 1
            else:
                fail += 1

    result = {
        "acc": ratio(acc, num),
        "yes": ratio(yes, num),
        "no": ratio(no, num),
        "fail": ratio(fail, num),
        "precision": ratio(tp, tp + fp),
        "recall": ratio(tp, tp + fn),
    }
    result["F1-score"] = ratio(2 * result["precision"] * result["recall"],
                               result["precision"] + result["recall"])
    print("\n========================================================================")
    print(file)
    print(result)
    print("========================================================================\n")
    return result
def split_list(lst, n):
    """Split a list into at most n contiguous chunks of (roughly) equal size.

    Args:
        lst: The sequence to split.
        n: Requested number of chunks; fewer chunks are returned when the
            sequence is too short to fill all of them.

    Returns:
        A list of slices of ``lst``; an empty list when ``lst`` is empty.
    """
    if len(lst) == 0:
        # A zero chunk size would be an invalid range() step below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: the last chunk may be shorter
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
    """Return the chunk at index ``k`` of ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
class LogitsProcessor(ABC):
    """Base class for logit processors applied during generation.

    Subclasses are expected to override ``__call__``; calling the base
    implementation raises ``NotImplementedError``.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Process the logits; the base class only reports that it is abstract."""
        message = f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
        raise NotImplementedError(message)
def eval_model(args):
    """Answer yes/no image questions with the model and score the answers.

    The question file (JSON Lines with ``question_id``, ``image``, ``text``
    and ``label``) is read, restricted to the chunk selected by
    ``args.num_chunks`` / ``args.chunk_idx``, and every question is answered.
    The final answer is whichever of "yes"/"Yes"/"no"/"No" has the highest
    score at the first generated position. One JSON record per question is
    written to ``args.answers_file``, after which ``get_acc`` is run on it.

    With ``args.answer_prompter`` set, a free-form answer is generated first
    and appended to the prompt, followed by ' The answer is ', before the
    yes/no decision is taken.

    A question that raises is reported and skipped.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = "ChatUniVi"
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    vision_tower = model.get_vision_tower()
    if not vision_tower.is_loaded:
        vision_tower.load_model()
    image_processor = vision_tower.image_processor

    with open(os.path.expanduser(args.question_file), "r") as question_file:
        questions = [json.loads(q) for q in question_file]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.expanduser(args.answers_file)
    # A bare file name (such as the default "answer.jsonl") has no directory
    # part, and os.makedirs('') would raise.
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)

    candidates = ["yes", "Yes", "no", "No"]
    # The last token id of each candidate word; does not depend on the question.
    candidate_ids = [tokenizer.encode(can)[-1] for can in candidates]

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            # Bound before the try so the error message below never refers to
            # an undefined name or to the previous question's image.
            image_file = None
            try:
                idx = line["question_id"]
                image_file = line["image"]
                qs = line["text"]
                label = line["label"]
                cur_prompt = qs

                # Use the guarded flag read above instead of accessing the
                # config attribute directly, which may not exist.
                if mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                prompt = conv.get_prompt()

                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

                with Image.open(os.path.join(args.image_folder, image_file)) as image:
                    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]

                stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
                keywords = [stop_str]
                stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

                if args.answer_prompter:
                    # First pass: free-form answer that is fed back as context.
                    with torch.inference_mode():
                        output_ids = model.generate(
                            input_ids,
                            images=image_tensor.unsqueeze(0).half().cuda(),
                            do_sample=True,
                            temperature=args.temperature,
                            top_p=args.top_p,
                            num_beams=args.num_beams,
                            max_new_tokens=1024,
                            use_cache=True,
                            stopping_criteria=[stopping_criteria]
                        )

                    input_token_len = input_ids.shape[1]
                    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
                    if n_diff_input_output > 0:
                        print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
                    outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
                    outputs = outputs.strip()
                    if outputs.endswith(stop_str):
                        outputs = outputs[:-len(stop_str)]
                    outputs = outputs.strip()
                    outputs_reasoning = outputs
                    input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' The answer is ', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
                else:
                    outputs_reasoning = ""

                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).half().cuda(),
                        do_sample=True,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        output_scores=True,
                        return_dict_in_generate=True,
                        max_new_tokens=1024,
                        use_cache=True,
                        stopping_criteria=[stopping_criteria]
                    )

                # Scores of the first generated position for the first sequence.
                scores = output_ids.scores[0][0].to(torch.float32)
                label_score = [scores[can_id].item() for can_id in candidate_ids]
                outputs = candidates[np.argmax(label_score)]

                ans_id = shortuuid.uuid()
                ans_file.write(json.dumps({"question_id": idx,
                                           "prompt": cur_prompt,
                                           "outputs_reasoning": outputs_reasoning + ' The answer is ' + outputs,
                                           "text": outputs,
                                           "label": label,
                                           "answer_id": ans_id,
                                           "model_id": model_name,
                                           "metadata": {}}) + "\n")
                ans_file.flush()
            except Exception as e:
                print(f"Error processing image file '{image_file}': {e}")

    get_acc(answers_file)
if __name__ == "__main__":
    # Command-line entry point: collect the options and run the evaluation.
    parser = argparse.ArgumentParser()
    # (flag, type, default) for every value-carrying option, in help order.
    option_table = [
        ("--model-path", str, "facebook/opt-350m"),
        ("--model-base", str, None),
        ("--image-folder", str, ""),
        ("--question-file", str, "tables/question.jsonl"),
        ("--answers-file", str, "answer.jsonl"),
        ("--conv-mode", str, "simpleqa"),
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--temperature", float, 0.2),
        ("--top_p", float, None),
        ("--num_beams", int, 1),
        ("--model_use", str, "BASE"),
    ]
    for flag, value_type, fallback in option_table:
        parser.add_argument(flag, type=value_type, default=fallback)
    parser.add_argument("--answer-prompter", action="store_true")
    args = parser.parse_args()

    eval_model(args)

ChatUniVi/eval/model_video_consistency.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import *
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+from decord import VideoReader, cpu
+import numpy as np
def split_list(lst, n):
    """Split a list into at most n contiguous chunks of (roughly) equal size.

    Args:
        lst: The sequence to split.
        n: Requested number of chunks; fewer chunks are returned when the
            sequence is too short to fill all of them.

    Returns:
        A list of slices of ``lst``; an empty list when ``lst`` is empty.
    """
    if len(lst) == 0:
        # A zero chunk size would be an invalid range() step below.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: the last chunk may be shorter
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
def get_chunk(lst, n, k):
    """Return the chunk at index ``k`` of ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
    """Decode a video with decord and return its sampled, preprocessed frames.

    Frames are taken every ``round(fps / video_framerate)`` positions inside
    the requested window; when more than ``max_frames`` positions result,
    ``max_frames`` of them are picked at evenly spaced indices.

    Args:
        video_path: Path of the video file.
        image_processor: Object whose ``preprocess(img, return_tensors='pt')``
            returns a mapping with a ``'pixel_values'`` entry.
        max_frames: Maximum number of frames to return.
        image_resolution: Side length used only for the all-zero fallback.
        video_framerate: Target number of sampled frames per second.
        s: Optional start of the window in seconds (``None`` = whole video).
        e: End of the window in seconds; read only when ``s`` is given.

    Returns:
        ``(frames, num_frames)`` where ``frames`` is the stacked tensor of
        preprocessed frames. If the window contains no frame, an all-zero
        ``(max_frames, 3, image_resolution, image_resolution)`` tensor and an
        all-zero mask of length ``max_frames`` are returned instead.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
    """
    # speed up video decode via decord.
    if s is None:
        start_time, end_time = None, None
    else:
        start_time = int(s)
        end_time = int(e)
        start_time = start_time if start_time >= 0. else 0.
        end_time = end_time if end_time >= 0. else 0.
        if start_time > end_time:
            start_time, end_time = end_time, start_time
        elif start_time == end_time:
            # Keep a window of at least one second.
            end_time = start_time + 1

    if os.path.exists(video_path):
        vreader = VideoReader(video_path, ctx=cpu(0))
    else:
        print(video_path)
        raise FileNotFoundError(video_path)

    fps = vreader.get_avg_fps()
    f_start = 0 if start_time is None else int(start_time * fps)
    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
    num_frames = f_end - f_start + 1
    if num_frames > 0:
        # T x 3 x H x W
        sample_fps = int(video_framerate)
        # Never let the stride round down to 0, which is an invalid range() step.
        t_stride = max(1, int(round(float(fps) / sample_fps)))

        all_pos = list(range(f_start, f_end + 1, t_stride))
        if len(all_pos) > max_frames:
            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
        else:
            sample_pos = all_pos

        patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
        patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
        slice_len = patch_images.shape[0]
        # The frames are returned directly; they are no longer copied into a
        # fixed-size buffer that was never returned and whose shape could
        # mismatch the processor's output resolution.
        return patch_images, slice_len

    print("video path: {} error.".format(video_path))
    # Fallback for an empty window: zero frames and an all-zero mask.
    video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
    video_mask = np.zeros(max_frames, dtype=np.int64)
    return torch.from_numpy(video), video_mask
+def eval_model(args):
+    """Answer both questions of every consistency sample and write one JSON line per sample.
+
+    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``,
+    ``answers_file``, ``video_folder``, ``conv_mode``, ``temperature``,
+    ``top_p`` and ``num_beams``. Each sample of the question file must provide
+    ``video_name``, ``Q1``, ``Q2`` and ``A``. Samples whose video cannot be
+    found under ``args.video_folder`` are reported and skipped.
+    """
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = "ChatUniVi"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    # The processor of the (now loaded) vision tower replaces the one returned by the loader.
+    image_processor = vision_tower.image_processor
+    if model.config.config["use_cluster"]:
+        for _, module in model.named_modules():
+            module.to(dtype=torch.bfloat16)
+
+    def _answer(question, video_frames, slice_len):
+        # Build the prompt for one question, generate, and return the cleaned answer text.
+        # One image placeholder per sampled frame precedes the question.
+        if mm_use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + question
+        else:
+            qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + question
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                # video_frames is already a torch tensor; it needs no numpy conversion.
+                images=video_frames.half().cuda(),
+                do_sample=True,
+                temperature=args.temperature,
+                top_p=args.top_p,
+                num_beams=args.num_beams,
+                max_new_tokens=1024,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria])
+        input_token_len = input_ids.shape[1]
+        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+        if n_diff_input_output > 0:
+            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        return outputs.strip()
+
+    # Load the ground truth file
+    with open(args.question_file) as file:
+        gt_contents = json.load(file)
+    answers_file = os.path.expanduser(args.answers_file)
+    answers_dir = os.path.dirname(answers_file)
+    if answers_dir:
+        # A bare file name has no directory part, and os.makedirs("") raises.
+        os.makedirs(answers_dir, exist_ok=True)
+    video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+    with open(answers_file, "w") as ans_file:
+        # Iterate over each sample in the ground truth file
+        for sample in tqdm(gt_contents):
+            video_name = sample['video_name']
+            question_1 = sample['Q1']
+            question_2 = sample['Q2']
+            # Reset per sample so a missing video never reuses the previous sample's path.
+            video_path = None
+            for fmt in video_formats:
+                temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
+                if os.path.exists(temp_path):
+                    video_path = temp_path
+                    break
+            if video_path is None:
+                print(f"Video file '{video_name}' not found in '{args.video_folder}', skipping.")
+                continue
+            video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH)
+            outputs_1 = _answer(question_1, video_frames, slice_len)
+            outputs_2 = _answer(question_2, video_frames, slice_len)
+            ans_id = shortuuid.uuid()
+            ans_file.write(json.dumps({"video_name": sample['video_name'],
+                                        "prompt_1": question_1,
+                                        "text_1": outputs_1,
+                                        "prompt_2": question_2,
+                                        "text_2": outputs_2,
+                                        "answer_id": ans_id,
+                                        "model_id": model_name,
+                                        "answer": sample['A'],
+                                        "metadata": {}}) + "\n")
+            ans_file.flush()
+if __name__ == "__main__":
+    # Command-line entry point: every option is declared from the table below,
+    # in order, and the parsed namespace is handed to eval_model.
+    # NOTE(review): eval_model in this file does not appear to read
+    # num_chunks, chunk_idx or model_use.
+    parser = argparse.ArgumentParser()
+    for flag, kind, fallback in (
+        ("--model-path", str, "facebook/opt-350m"),
+        ("--model-base", str, None),
+        ("--video-folder", str, ""),
+        ("--question-file", str, "tables/question.jsonl"),
+        ("--answers-file", str, "answer.jsonl"),
+        ("--conv-mode", str, "v1"),
+        ("--num-chunks", int, 1),
+        ("--chunk-idx", int, 0),
+        ("--temperature", float, 0.2),
+        ("--top_p", float, None),
+        ("--num_beams", int, 1),
+        ("--model_use", str, "BASE"),
+    ):
+        parser.add_argument(flag, type=kind, default=fallback)
+    args = parser.parse_args()
+    eval_model(args)

ChatUniVi/eval/model_video_general.py ADDED Viewed

	@@ -0,0 +1,213 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import *
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+from decord import VideoReader, cpu
+import numpy as np
+def split_list(lst, n):
+    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.
+
+    Every chunk except possibly the last holds ``ceil(len(lst) / n)`` items, so
+    fewer than ``n`` chunks may be returned. An empty ``lst`` yields ``[]``.
+    """
+    if not lst:
+        # ceil(0 / n) is 0, which range() rejects as a step.
+        return []
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+def get_chunk(lst, n, k):
+    """Return the ``k``-th chunk produced by ``split_list(lst, n)``."""
+    return split_list(lst, n)[k]
+def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
+    """Decode a video with decord and preprocess a bounded number of frames.
+
+    Frame positions are taken every ``round(fps / video_framerate)`` source
+    frames; when that yields more than ``max_frames`` positions, ``max_frames``
+    of them are picked at evenly spaced indices.
+
+    Args:
+        video_path: Path of the video file to read.
+        image_processor: Object whose ``preprocess(img, return_tensors='pt')``
+            result is indexed with ``['pixel_values'][0]`` for each PIL frame.
+        max_frames: Upper bound on the number of frames returned.
+        image_resolution: Side length used only for the zero-filled fallback.
+        video_framerate: Target number of sampled frames per second.
+        s: Optional clip start in seconds; when ``None`` the whole video is
+            used and ``e`` is ignored.
+        e: Clip end in seconds (read only when ``s`` is given).
+
+    Returns:
+        ``(frames, slice_len)``: the stacked preprocessed frames and their
+        count. If the requested range holds no frame, the legacy fallback
+        ``(zeros, video_mask)`` is returned instead: a zero tensor of shape
+        ``(max_frames, 3, image_resolution, image_resolution)`` and an all-zero
+        ``int64`` array of length ``max_frames``.
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+    """
+    if s is None:
+        start_time, end_time = None, None
+    else:
+        # Clamp negative bounds to zero, order them, and keep the clip non-empty.
+        start_time = max(int(s), 0)
+        end_time = max(int(e), 0)
+        if start_time > end_time:
+            start_time, end_time = end_time, start_time
+        elif start_time == end_time:
+            end_time = start_time + 1
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"video file not found: {video_path}")
+    # speed up video decode via decord.
+    vreader = VideoReader(video_path, ctx=cpu(0))
+    fps = vreader.get_avg_fps()
+    f_start = 0 if start_time is None else int(start_time * fps)
+    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
+    if f_end - f_start + 1 <= 0:
+        # Legacy fallback, kept for compatibility. NOTE: the second item is a
+        # mask array here, not a frame count as on the success path.
+        print("video path: {} error.".format(video_path))
+        video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
+        video_mask = np.zeros(max_frames, dtype=np.int64)
+        return torch.from_numpy(video), video_mask
+    # A stride that rounds to zero (very low fps) would make range() raise;
+    # fall back to visiting every frame.
+    t_stride = max(1, int(round(float(fps) / int(video_framerate))))
+    all_pos = list(range(f_start, f_end + 1, t_stride))
+    if len(all_pos) > max_frames:
+        sample_pos = [all_pos[i] for i in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+    else:
+        sample_pos = all_pos
+    frames = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
+    # T x 3 x H x W, at whatever resolution the processor emits. No fixed-size
+    # staging buffer is filled, so processors that do not output
+    # image_resolution x image_resolution frames work as well.
+    patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in frames])
+    return patch_images, patch_images.shape[0]
+def eval_model(args):
+    """Answer the question of every sample about its video and write one JSON line per answer.
+
+    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``,
+    ``answers_file``, ``video_folder``, ``conv_mode``, ``temperature``,
+    ``top_p`` and ``num_beams``. Each sample must provide ``video_name``, ``Q``
+    and ``A``. Samples whose video cannot be found are reported and skipped;
+    errors raised while prompting or generating are printed and the loop
+    moves on to the next sample.
+    """
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = "ChatUniVi"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    # The processor of the (now loaded) vision tower replaces the one returned by the loader.
+    image_processor = vision_tower.image_processor
+    if model.config.config["use_cluster"]:
+        for _, module in model.named_modules():
+            module.to(dtype=torch.bfloat16)
+    # Load the ground truth file
+    with open(args.question_file) as file:
+        gt_contents = json.load(file)
+    answers_file = os.path.expanduser(args.answers_file)
+    answers_dir = os.path.dirname(answers_file)
+    if answers_dir:
+        # A bare file name has no directory part, and os.makedirs("") raises.
+        os.makedirs(answers_dir, exist_ok=True)
+    video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+    with open(answers_file, "w") as ans_file:
+        # Iterate over each sample in the ground truth file
+        for sample in tqdm(gt_contents):
+            video_name = sample['video_name']
+            cur_prompt = sample['Q']
+            # Reset per sample so a missing video never reuses the previous sample's path.
+            video_path = None
+            for fmt in video_formats:
+                temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
+                if os.path.exists(temp_path):
+                    video_path = temp_path
+                    break
+            if video_path is None:
+                print(f"Video file '{video_name}' not found in '{args.video_folder}', skipping.")
+                continue
+            video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH)
+            try:
+                # One image placeholder per sampled frame precedes the question.
+                if mm_use_im_start_end:
+                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + cur_prompt
+                else:
+                    qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + cur_prompt
+                conv = conv_templates[args.conv_mode].copy()
+                conv.append_message(conv.roles[0], qs)
+                conv.append_message(conv.roles[1], None)
+                prompt = conv.get_prompt()
+                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+                stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+                stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
+                with torch.inference_mode():
+                    output_ids = model.generate(
+                        input_ids,
+                        images=video_frames.half().cuda(),
+                        do_sample=True,
+                        temperature=args.temperature,
+                        top_p=args.top_p,
+                        num_beams=args.num_beams,
+                        max_new_tokens=1024,
+                        use_cache=True,
+                        stopping_criteria=[stopping_criteria])
+                input_token_len = input_ids.shape[1]
+                n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+                if n_diff_input_output > 0:
+                    print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+                outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+                outputs = outputs.strip()
+                if outputs.endswith(stop_str):
+                    outputs = outputs[:-len(stop_str)]
+                outputs = outputs.strip()
+                ans_id = shortuuid.uuid()
+                ans_file.write(json.dumps({'video_name': sample['video_name'],
+                                           "prompt": cur_prompt,
+                                           "text": outputs,
+                                           "answer_id": ans_id,
+                                           "model_id": model_name,
+                                           "answer": sample['A'],
+                                           "metadata": {}}) + "\n")
+                ans_file.flush()
+            except Exception as e:
+                # Best effort: one failing sample must not abort the whole run.
+                print(f"Error processing video file '{video_name}': {e}")
+if __name__ == "__main__":
+    # Command-line entry point: every option is declared from the table below,
+    # in order, and the parsed namespace is handed to eval_model.
+    # NOTE(review): eval_model in this file does not appear to read
+    # num_chunks, chunk_idx or model_use.
+    parser = argparse.ArgumentParser()
+    for flag, kind, fallback in (
+        ("--model-path", str, "facebook/opt-350m"),
+        ("--model-base", str, None),
+        ("--video-folder", str, ""),
+        ("--question-file", str, "tables/question.jsonl"),
+        ("--answers-file", str, "answer.jsonl"),
+        ("--conv-mode", str, "v1"),
+        ("--num-chunks", int, 1),
+        ("--chunk-idx", int, 0),
+        ("--temperature", float, 0.2),
+        ("--top_p", float, None),
+        ("--num_beams", int, 1),
+        ("--model_use", str, "BASE"),
+    ):
+        parser.add_argument(flag, type=kind, default=fallback)
+    args = parser.parse_args()
+    eval_model(args)

ChatUniVi/eval/model_video_qa.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import *
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+from decord import VideoReader, cpu
+import numpy as np
+def read_json(file):
+    """Parse and return the JSON document stored at path ``file`` (read as UTF-8)."""
+    with open(file, mode="r", encoding="utf-8") as handle:
+        return json.load(handle)
+def split_list(lst, n):
+    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.
+
+    Every chunk except possibly the last holds ``ceil(len(lst) / n)`` items, so
+    fewer than ``n`` chunks may be returned. An empty ``lst`` yields ``[]``.
+    """
+    if not lst:
+        # ceil(0 / n) is 0, which range() rejects as a step.
+        return []
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+def get_chunk(lst, n, k):
+    """Return the ``k``-th chunk produced by ``split_list(lst, n)``."""
+    return split_list(lst, n)[k]
+def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
+    """Decode a video with decord and preprocess a bounded number of frames.
+
+    Frame positions are taken every ``round(fps / video_framerate)`` source
+    frames; when that yields more than ``max_frames`` positions, ``max_frames``
+    of them are picked at evenly spaced indices.
+
+    Args:
+        video_path: Path of the video file to read.
+        image_processor: Object whose ``preprocess(img, return_tensors='pt')``
+            result is indexed with ``['pixel_values'][0]`` for each PIL frame.
+        max_frames: Upper bound on the number of frames returned.
+        image_resolution: Side length used only for the zero-filled fallback.
+        video_framerate: Target number of sampled frames per second.
+        s: Optional clip start in seconds; when ``None`` the whole video is
+            used and ``e`` is ignored.
+        e: Clip end in seconds (read only when ``s`` is given).
+
+    Returns:
+        ``(frames, slice_len)``: the stacked preprocessed frames and their
+        count. If the requested range holds no frame, the legacy fallback
+        ``(zeros, video_mask)`` is returned instead: a zero tensor of shape
+        ``(max_frames, 3, image_resolution, image_resolution)`` and an all-zero
+        ``int64`` array of length ``max_frames``.
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+    """
+    if s is None:
+        start_time, end_time = None, None
+    else:
+        # Clamp negative bounds to zero, order them, and keep the clip non-empty.
+        start_time = max(int(s), 0)
+        end_time = max(int(e), 0)
+        if start_time > end_time:
+            start_time, end_time = end_time, start_time
+        elif start_time == end_time:
+            end_time = start_time + 1
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"video file not found: {video_path}")
+    # speed up video decode via decord.
+    vreader = VideoReader(video_path, ctx=cpu(0))
+    fps = vreader.get_avg_fps()
+    f_start = 0 if start_time is None else int(start_time * fps)
+    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
+    if f_end - f_start + 1 <= 0:
+        # Legacy fallback, kept for compatibility. NOTE: the second item is a
+        # mask array here, not a frame count as on the success path.
+        print("video path: {} error.".format(video_path))
+        video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
+        video_mask = np.zeros(max_frames, dtype=np.int64)
+        return torch.from_numpy(video), video_mask
+    # A stride that rounds to zero (very low fps) would make range() raise;
+    # fall back to visiting every frame.
+    t_stride = max(1, int(round(float(fps) / int(video_framerate))))
+    all_pos = list(range(f_start, f_end + 1, t_stride))
+    if len(all_pos) > max_frames:
+        sample_pos = [all_pos[i] for i in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+    else:
+        sample_pos = all_pos
+    frames = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
+    # T x 3 x H x W, at whatever resolution the processor emits. No fixed-size
+    # staging buffer is filled, so processors that do not output
+    # image_resolution x image_resolution frames work as well.
+    patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in frames])
+    return patch_images, patch_images.shape[0]
+def eval_model(args):
+    """Answer the question of every video-QA sample and write one JSON line per answer.
+
+    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``,
+    ``answers_list``, ``answers_file``, ``video_folder``, ``conv_mode``,
+    ``temperature``, ``top_p``, ``num_beams`` and ``max_frames``. Each sample
+    must provide ``video_name``, ``question`` and ``answer``. A video is looked
+    up as ``<video_name><ext>`` and then ``v_<video_name><ext>`` for each known
+    extension. Samples whose video cannot be found are reported and skipped;
+    errors raised while prompting or generating are printed and the loop moves
+    on to the next sample.
+    """
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = "ChatUniVi"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    # The processor of the (now loaded) vision tower replaces the one returned by the loader.
+    image_processor = vision_tower.image_processor
+    if model.config.config["use_cluster"]:
+        for _, module in model.named_modules():
+            module.to(dtype=torch.bfloat16)
+    # Load the ground truth file
+    with open(args.question_file) as file:
+        gt_contents = json.load(file)
+    # NOTE(review): the answers list is loaded (so a bad path still fails early)
+    # but nothing below reads it.
+    answers_list = read_json(args.answers_list)
+    answers_file = os.path.expanduser(args.answers_file)
+    answers_dir = os.path.dirname(answers_file)
+    if answers_dir:
+        # A bare file name has no directory part, and os.makedirs("") raises.
+        os.makedirs(answers_dir, exist_ok=True)
+    video_formats = ['.mp4', '.avi', '.mov', '.mkv']
+    # A falsy --max_frames (None or 0) selects the library default.
+    max_frames = args.max_frames if args.max_frames else MAX_IMAGE_LENGTH
+    with open(answers_file, "w") as ans_file:
+        # Iterate over each sample in the ground truth file
+        for sample in tqdm(gt_contents):
+            video_name = sample['video_name']
+            cur_prompt = sample['question']
+            # Reset per sample so a missing video never reuses the previous sample's path.
+            video_path = None
+            for fmt in video_formats:
+                for candidate in (video_name, "v_" + video_name):
+                    temp_path = os.path.join(args.video_folder, f"{candidate}{fmt}")
+                    if os.path.exists(temp_path):
+                        video_path = temp_path
+                        break
+                if video_path is not None:
+                    break
+            if video_path is None:
+                print(f"Video file '{video_name}' not found in '{args.video_folder}', skipping.")
+                continue
+            video_frames, slice_len = _get_rawvideo_dec(video_path, image_processor, max_frames=max_frames)
+            try:
+                # One image placeholder per sampled frame precedes the question.
+                if mm_use_im_start_end:
+                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + cur_prompt
+                else:
+                    qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + cur_prompt
+                conv = conv_templates[args.conv_mode].copy()
+                conv.append_message(conv.roles[0], qs)
+                conv.append_message(conv.roles[1], None)
+                prompt = conv.get_prompt()
+                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+                stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+                stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
+                with torch.inference_mode():
+                    generation = model.generate(
+                        input_ids,
+                        images=video_frames.half().cuda(),
+                        do_sample=True,
+                        temperature=args.temperature,
+                        top_p=args.top_p,
+                        num_beams=args.num_beams,
+                        output_scores=True,
+                        return_dict_in_generate=True,
+                        max_new_tokens=1024,
+                        use_cache=True,
+                        stopping_criteria=[stopping_criteria])
+                output_ids = generation.sequences
+                input_token_len = input_ids.shape[1]
+                n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+                if n_diff_input_output > 0:
+                    print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+                outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+                outputs = outputs.strip()
+                if outputs.endswith(stop_str):
+                    outputs = outputs[:-len(stop_str)]
+                outputs = outputs.strip()
+                ans_id = shortuuid.uuid()
+                ans_file.write(json.dumps({"video_name": sample['video_name'],
+                                           "prompt": cur_prompt,
+                                           "text": outputs,
+                                           "answer_id": ans_id,
+                                           "model_id": model_name,
+                                           "answer": sample['answer'],
+                                           "metadata": {}}) + "\n")
+                ans_file.flush()
+            except Exception as e:
+                # Best effort: one failing sample must not abort the whole run.
+                print(f"Error processing video file '{video_name}': {e}")
+if __name__ == "__main__":
+    # Command-line entry point: every option is declared from the table below,
+    # in order, and the parsed namespace is handed to eval_model.
+    # NOTE(review): eval_model in this file does not appear to read
+    # num_chunks or chunk_idx.
+    parser = argparse.ArgumentParser()
+    for flag, kind, fallback in (
+        ("--model-path", str, "facebook/opt-350m"),
+        ("--model-base", str, None),
+        ("--video-folder", str, ""),
+        ("--question-file", str, "tables/question.jsonl"),
+        ("--answers-list", str, "tables/answers_list.json"),
+        ("--answers-file", str, "answer.jsonl"),
+        ("--conv-mode", str, "v1"),
+        ("--num-chunks", int, 1),
+        ("--chunk-idx", int, 0),
+        ("--temperature", float, 0.2),
+        ("--top_p", float, None),
+        ("--num_beams", int, 1),
+        ("--max_frames", int, None),
+    ):
+        parser.add_argument(flag, type=kind, default=fallback)
+    args = parser.parse_args()
+    eval_model(args)

ChatUniVi/eval/model_vqa.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import *
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+def split_list(lst, n):
+    """Split ``lst`` into at most ``n`` contiguous chunks of near-equal size.
+
+    Every chunk except possibly the last holds ``ceil(len(lst) / n)`` items, so
+    fewer than ``n`` chunks may be returned. An empty ``lst`` yields ``[]``.
+    """
+    if not lst:
+        # ceil(0 / n) is 0, which range() rejects as a step.
+        return []
+    chunk_size = math.ceil(len(lst) / n)  # ceiling division
+    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+def get_chunk(lst, n, k):
+    """Return the ``k``-th chunk produced by ``split_list(lst, n)``."""
+    return split_list(lst, n)[k]
+def eval_model(args):
+    """Answer every image question of the selected chunk and write one JSON line per answer.
+
+    Reads from ``args``: ``model_path``, ``model_base``, ``question_file``
+    (JSON lines), ``num_chunks``, ``chunk_idx``, ``answers_file``,
+    ``image_folder``, ``conv_mode``, ``temperature``, ``top_p``, ``num_beams``
+    and ``max_new_tokens``. Each question record must provide ``question_id``,
+    ``image`` and ``text``.
+    """
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = "ChatUniVi"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    # The processor of the (now loaded) vision tower replaces the one returned by the loader.
+    image_processor = vision_tower.image_processor
+    with open(os.path.expanduser(args.question_file), "r") as question_fh:
+        # Blank lines (e.g. a trailing newline) are not JSON records and are skipped.
+        questions = [json.loads(q) for q in question_fh if q.strip()]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    answers_dir = os.path.dirname(answers_file)
+    if answers_dir:
+        # A bare file name has no directory part, and os.makedirs("") raises.
+        os.makedirs(answers_dir, exist_ok=True)
+    with open(answers_file, "w") as ans_file:
+        for line in tqdm(questions):
+            idx = line["question_id"]
+            image_file = line["image"]
+            cur_prompt = line["text"]
+            # A single image placeholder precedes the question.
+            if mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + cur_prompt
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt
+            conv = conv_templates[args.conv_mode].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+            # Preprocess while the file is open so the handle is released right after.
+            with Image.open(os.path.join(args.image_folder, image_file)) as image:
+                image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+            stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)
+            with torch.inference_mode():
+                output_ids = model.generate(
+                    input_ids,
+                    images=image_tensor.unsqueeze(0).half().cuda(),
+                    do_sample=True,
+                    temperature=args.temperature,
+                    top_p=args.top_p,
+                    num_beams=args.num_beams,
+                    max_new_tokens=args.max_new_tokens,
+                    use_cache=True,
+                    stopping_criteria=[stopping_criteria])
+            input_token_len = input_ids.shape[1]
+            n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+            if n_diff_input_output > 0:
+                print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+            outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+            outputs = outputs.strip()
+            if outputs.endswith(stop_str):
+                outputs = outputs[:-len(stop_str)]
+            outputs = outputs.strip()
+            ans_id = shortuuid.uuid()
+            ans_file.write(json.dumps({"question_id": idx,
+                                       "prompt": cur_prompt,
+                                       "text": outputs,
+                                       "answer_id": ans_id,
+                                       "model_id": model_name,
+                                       "metadata": {}}) + "\n")
+            ans_file.flush()
+if __name__ == "__main__":
+    # Command-line entry point: every option is declared from the table below,
+    # in order, and the parsed namespace is handed to eval_model.
+    # NOTE(review): eval_model in this file does not appear to read model_use.
+    parser = argparse.ArgumentParser()
+    for flag, kind, fallback in (
+        ("--model-path", str, "facebook/opt-350m"),
+        ("--model-base", str, None),
+        ("--image-folder", str, ""),
+        ("--question-file", str, "tables/question.jsonl"),
+        ("--answers-file", str, "answer.jsonl"),
+        ("--conv-mode", str, "simple"),
+        ("--num-chunks", int, 1),
+        ("--chunk-idx", int, 0),
+        ("--temperature", float, 0.2),
+        ("--top_p", float, None),
+        ("--num_beams", int, 1),
+        ("--model_use", str, "BASE"),
+        ("--max_new_tokens", int, 1024),
+    ):
+        parser.add_argument(flag, type=kind, default=fallback)
+    args = parser.parse_args()
+    eval_model(args)

ChatUniVi/eval/model_vqa_scienceqa.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+from ChatUniVi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from ChatUniVi.conversation import conv_templates, SeparatorStyle
+from ChatUniVi.model.builder import load_pretrained_model
+from ChatUniVi.utils import disable_torch_init
+from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+from PIL import Image
+import math
+import numpy as np
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # integer division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = "ChatUniVi"
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    image_processor = vision_tower.image_processor
+    questions = json.load(open(os.path.expanduser(args.question_file), "r"))
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
+    ans_file = open(answers_file, "w")
+    for i, line in enumerate(tqdm(questions)):
+        idx = line["id"]
+        question = line['conversations'][0]
+        gt_ans = line["conversations"][1]
+        qs = question['value'].replace('<image>', '').strip()
+        cur_prompt = qs
+        if 'image' in line:
+            image_file = line["image"].replace("\\", "/")
+            image = Image.open(os.path.join(args.image_folder, image_file))
+            image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            images = image_tensor.unsqueeze(0).half().cuda()
+            if getattr(model.config, 'mm_use_im_start_end', False):
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+            cur_prompt = '<image>' + '\n' + cur_prompt
+        else:
+            images = None
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=images,
+                do_sample=True,
+                temperature=0.2,
+                max_new_tokens=1024,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria])
+        input_token_len = input_ids.shape[1]
+        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+        if n_diff_input_output > 0:
+            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        outputs = outputs.strip()
+        outputs_reasoning = outputs
+        input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=images,
+                do_sample=True,
+                temperature=0.2,
+                max_new_tokens=64,
+                use_cache=True,
+                output_scores=True,
+                return_dict_in_generate=True,
+                stopping_criteria=[stopping_criteria])
+        scores = output_ids.scores[0][0].to(torch.float32)
+        label_score = []
+        candidates = []
+        answers_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+        for i in answers_list:
+            if "(" + i + ")" in cur_prompt:
+                candidates.append(i)
+        for can in candidates:
+            can_id = tokenizer.encode(can)[-1]
+            label_score.append(scores[can_id].item())
+        outputs_answer = candidates[np.argmax(label_score)]
+        output_ids = output_ids.sequences
+        input_token_len = input_ids.shape[1]
+        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+        if n_diff_input_output > 0:
+            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        outputs = outputs.strip()
+        outputs = outputs_reasoning + '\n The answer is ' + outputs
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "prompt": cur_prompt,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "pred": outputs_answer,
+                                   "metadata": {}}) + "\n")
+        ans_file.flush()
+    ans_file.close()
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.json")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="simple")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    args = parser.parse_args()
+    eval_model(args)

ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl ADDED Viewed

	@@ -0,0 +1,90 @@

+{"question_id": 0, "text": "The colors of the two suitcases in the image are black and brown with yellow details.", "category": "conv"}
+{"question_id": 1, "text": "The image features two antique suitcases made of leather, stacked one on top of the other. The top suitcase is black, while the bottom one has a combination of brown and yellow colors. Both suitcases are prominently displayed side by side, taking up a large portion of the image. They appear to be vintage luggage, possibly available for purchase with price tags attached, showcasing their classic design and craftsmanship.", "category": "detail"}
+{"question_id": 2, "text": "Several factors could make the two stacked suitcases valuable. In the image, they are described as antique, which suggests that they are potentially rare or have some historical significance. Their age and rarity could contribute to their value. Additionally, they are made of leather, which is a material known for its durability and, in some cases, luxury. The suitcases also differ in color, featuring black, brown, and yellow tones, which may attract buyers due to their aesthetic appeal or the desire to own a unique piece of luggage. Furthermore, the presence of price tags on the luggage indicates that they are being sold, drawing attention to their perceived value in the market. Collectors or enthusiasts of vintage items may find these suitcases valuable due to their material, design, age, and distinct appearance.", "category": "complex"}
+{"question_id": 3, "text": "The main objects on the table in the image are plates, glasses, eating utensils, and possibly a bottle of something.", "category": "conv"}
+{"question_id": 4, "text": "The image showcases a dining table filled with various dirty dishes, eating utensils, and a bottle. The table has plates and glasses scattered across it. There are at least two cups on the table, with one positioned on the left and another on the right side. \n\nTwo forks can be seen, one resting on a small white plate and the other located towards the left side of the table. A spoon is also present, placed near the left corner of the table. Additionally, there is a bottle located in the upper central area of the table.", "category": "detail"}
+{"question_id": 5, "text": "A meal or gathering might have recently taken place around this table, as evidenced by the variety of dirty dishes, utensils, and drinkware present on it. The aftermath of this meal or event could involve the need to clean up and wash the dirty dishes, glasses, and eating utensils. Items on the table include plates, forks, spoons, cups, and a bottle, which are typically used during such occasions. Participants might have engaged in conversation, shared food and beverages, and enjoyed each other's company at this table. Once the gathering is complete, disposing of any leftovers and cleaning the table area would be necessary to prepare it for the next use.", "category": "complex"}
+{"question_id": 6, "text": "The main focus of the image is a cat or kitten sitting on or next to a laptop computer.", "category": "conv"}
+{"question_id": 7, "text": "The image features a cat sitting on a wooden desk, intently staring at a black laptop computer that is open in front of it. The desk is quite organized, with several books spread around, both on the left and right sides of the laptop. Some books are neatly piled and others are laid out individually. The scene could be interpreted as the cat appearing to use the laptop or showing interest in what's on the screen.", "category": "detail"}
+{"question_id": 8, "text": "The cat may be interested in the laptop due to the light coming from the screen, the movement of the cursor, or the sounds produced by the laptop, such as keystrokes or notifications. Cats are known to be naturally curious creatures and are often attracted to objects that stimulate their senses. The visual display, warmth emanating from the device, and interactive nature of the laptop could be appealing to the cat, prompting it to sit on the desk and face the open laptop. Additionally, the cat may simply be seeking its owner's attention or wanting to be close to them while they use the laptop.", "category": "complex"}
+{"question_id": 9, "text": "The dog in the image is black.", "category": "conv"}
+{"question_id": 10, "text": "The image features a large black dog with a red collar laying down on a kitchen floor. The tiled floor provides a comfortable space for the dog to relax. The dog is positioned towards the middle of the scene. Next to the dog, there are two bowls placed on the floor, one around the top-left side and the other slightly further to the right. These could be the dog's food and water bowls. The scene depicts a cozy domestic environment for the dog.", "category": "detail"}
+{"question_id": 11, "text": "The dog may have chosen to lay down on the kitchen floor in this location for several reasons. First, the tiled floor can provide a cool surface, which could be comforting compared to a warm or soft surface. This is particularly true if the dog has a thick coat or if the temperature in the room is high. Second, the dog is positioned next to its bowls and a toy, which suggests that the location is familiar and associated with food, water, and playtime. The dog could be resting near these resources for convenience and ease of access. Lastly, the location could be a central spot in the home, and dogs often prefer to stay near their owners or members of the household. By laying on the kitchen floor, the dog might be able to keep an eye on the activities happening around it and remain close to its family.", "category": "complex"}
+{"question_id": 12, "text": "The cart with luggage bags is located in a lobby.", "category": "conv"}
+{"question_id": 13, "text": "The scene is set in a lobby at a commercial business, where a wheeled luggage cart is filled with various suitcases and bags. There are a number of luggage bags, including two backpacks, one near the front of the cart and another near the back, as well as multiple suitcases placed on different sections of the cart. \n\nA handbag is also visible, placed near the front of the cart. Two people are present in the scene: one person standing near the left side and another person on the right side, possibly engaging in conversation or waiting to transport their luggage to their rooms.", "category": "detail"}
+{"question_id": 14, "text": "In the image, a cart is filled with numerous pieces of luggage, including suitcases, bags, and backpacks, in the lobby of a commercial business. The staff responsible for handling the luggage might face several challenges such as:\n\n1. Efficiently organizing and placing the luggage on the cart to prevent any damages or items from falling off during transportation.\n2. Identifying and keeping track of each guest's belongings to avoid misplacement or mixing up the luggage.\n3. Maneuvering the heavily loaded cart without causing hindrances to other guests in the lobby while ensuring their safety.\n4. Carrying or transporting the luggage to the guests' rooms in a timely manner to maintain customer satisfaction and efficient use of staff resources.\n\nTo overcome these challenges, staff might need proper training in luggage handling, organization, and customer service skills. Maintaining clear luggage tags and taking note of each item could help them ensure accurate delivery to the corresponding guests' rooms.", "category": "complex"}
+{"question_id": 15, "text": "The image features a large, placid lake.", "category": "conv"}
+{"question_id": 16, "text": "The image depicts a serene and picturesque landscape with a large, placid lake surrounded by beautiful tree-covered mountains. Cliffs rise majestically on the edge of the lake, showcasing the grandeur of nature. \n\nIn the distance, near the waterfront, a train can be seen traveling on a track. The train adds a touch of human presence to the otherwise untouched landscape and further enhances the scenic view.", "category": "detail"}
+{"question_id": 17, "text": "There could be several reasons for the train to be traveling on this scenic route surrounded by mountains, water, and trees. First, the train route might have been built to provide transportation services to local communities or businesses located in this remote or picturesque area. It may facilitate travel, trade, or tourism in the region. Second, the scenic route could be intentionally designed to attract tourists who seek leisure train journeys that showcase natural beauty and captivating landscapes. These types of train rides offer a unique and immersive experience for people who enjoy exploring breathtaking scenery and discovering new destinations while traveling comfortably. Ultimately, the train route could serve both practical and recreational purposes, fostering connectivity and promoting tourism for the area.", "category": "complex"}
+{"question_id": 18, "text": "The main object in the image is a colorful striped umbrella standing on the beach.", "category": "conv"}
+{"question_id": 19, "text": "The image features a colorful striped umbrella on the beach, with its vibrant appearance attracting attention. The umbrella is firmly positioned in the sand, providing shade and comfort for visitors. The umbrella sits near the ocean, offering a lovely view of the water and contributing to the pleasant beach atmosphere.", "category": "detail"}
+{"question_id": 20, "text": "The colorful umbrella set up on the beach serves several purposes. First, it provides shade from the sun, which helps protect people from direct sunlight and the associated harmful UV rays, reducing the risk of sunburn and heatstroke. Second, it offers a comfortable spot to relax, rest, and enjoy the beach ambiance without being exposed to the intense sun or heat. Finally, the umbrella creates a designated area for beach-goers to keep their belongings and have a visible landmark to locate their spot quickly when in the crowded beach environment.", "category": "complex"}
+{"question_id": 21, "text": "The house is in the beginning stages of its construction, with the wooden structure, studs, and staircase partially built but not yet completed.", "category": "conv"}
+{"question_id": 22, "text": "The image shows the beginning stages of a house under construction. The wooden structure is still being built, and there are lots of wooden studs visible throughout the scene, indicating work in progress. Additionally, a partially-built staircase can be seen within the house, giving an idea of the floor plan layout.\n\nDespite the house still being under construction, a bed and a TV are already present in the area. The bed is located near the center of the image, with the TV placed to its left. This shows that some household items have been brought in, probably in anticipation of the house's completion.", "category": "detail"}
+{"question_id": 23, "text": "In the current state of the house construction with wooden studs and a partially-built staircase, a potential obstacle that might occur is the incomplete structure being exposed to external elements like rain, snow, or strong winds. These conditions can potentially damage the building materials, delay construction work, and cause additional unexpected costs for repairing or replacing materials. It is essential for the builders to properly cover the construction site and store materials accordingly to protect them from potential damage when not in use.", "category": "complex"}
+{"question_id": 24, "text": "There are three pizzas in the image.", "category": "conv"}
+{"question_id": 25, "text": "The image displays three open pizza boxes with pizzas inside. The first pizza box, situated to the left, contains a smaller pizza. The second pizza box, positioned in the middle, features a larger pizza topped with tortilla chips. The third pizza box, located to the right, contains another smaller pizza. \n\nBeside the large pizza, there are two side orders of sauce in bowls, accompanying the delicious meal. The arrangement showcases a delicious spread of pizzas, inviting everyone to enjoy their different flavors and sizes.", "category": "detail"}
+{"question_id": 26, "text": "When dividing these pizzas equitably among a group of people, one might face several challenges due to the different sizes and potential toppings on the pizzas. In the image, there are three pizzas sitting in their delivery boxes, with one pizza being larger than the other two. This size difference may complicate equal distribution of pizza slices among the group, as some people might want more of the larger pizza, while others may prefer the smaller ones.\n\nFurthermore, one of the pizzas is topped with tortilla chips, which could lead to preference differences among the group, as some people might like this topping while others may not. Lastly, alongside one of the pizzas, there are two side orders of sauce, which may need to be shared among those who enjoy sauce with their pizza. All these factors make it a challenge to ensure that everyone in the group has an equal and satisfying share of the pizzas and accompanying sauces. One possible solution is to have a group discussion and decide on rules or a plan for dividing the pizzas, taking into account individual preferences and dietary restrictions if any.", "category": "complex"}
+{"question_id": 27, "text": "There are four doughnuts in the box.", "category": "conv"}
+{"question_id": 28, "text": "The image features a box filled with four different kinds of doughnuts sitting on a table. Each doughnut appears to have a unique design or topping. One of the doughnuts is a cake-style doughnut, while another has nuts and coconut on top. The third doughnut appears to be a glazed one, and the last doughnut also has various toppings. The doughnuts are positioned inside the box in close proximity to one another, showcasing their delicious variety.", "category": "detail"}
+{"question_id": 29, "text": "The box contains a variety of doughnuts with unique taste combinations. Among the four doughnuts, there are cake doughnuts, a glazed doughnut, and one with a mix of nuts and coconut as toppings. These different toppings and textures provide a diverse selection for those who want to try various flavors and types of doughnuts. Combining the flavors of nuts and coconut on one doughnut, in particular, offers a unique taste experience that blends the richness of nuts with the tropical sweetness of coconut. Thus, the box offers a selection that caters to different preferences and invites people to discover new taste combinations.", "category": "complex"}
+{"question_id": 30, "text": "In the image, the man is holding up clothes on hangers, possibly in some hanger bags, and appears to be preparing for travel, as a suitcase is also in front of him.", "category": "conv"}
+{"question_id": 31, "text": "The image depicts a man standing near a window, holding two clothing hangers with garments in both hands, one red and one white. In front of him, there is a suitcase, possibly indicating that he is preparing for travel or organizing his belongings. The man appears to be showing or comparing these clothes while packing his luggage.", "category": "detail"}
+{"question_id": 32, "text": "The man might be holding the clothes on hangers because he is preparing to pack his suitcase for a trip or has just arrived at his destination and is unpacking his belongings. By holding the clothes on hangers, he can avoid wrinkles and ensure that the garments remain in good condition for wearing during his journey or at his destination. The presence of a suitcase in front of him supports this idea, as it is commonly used for transportation and storage of personal belongings when traveling.", "category": "complex"}
+{"question_id": 33, "text": "There are four giraffes in the image.", "category": "conv"}
+{"question_id": 34, "text": "The image showcases a group of four giraffes standing in their natural habitat, a grassy field with some dry brush. They are dispersed across the field, with one giraffe positioned more towards the left, and the other three clustered closer together on the right side. The giraffes appear to be grazing peacefully or engaging in social interactions. A tree can be seen in the scene, providing shade and a point of interest for the giraffes to gather around.", "category": "detail"}
+{"question_id": 35, "text": "In the image, there are four giraffes gathered near the same tree in their natural habitat, a grass field with dry brush. Giraffes may gather near the same tree for various reasons. One common reason is to feed, as their long necks enable them to reach high branches for fresh leaves and foliage, which is a primary part of their diet. Giraffes are also social animals, so they may gather around the same tree to interact with each other and maintain their social bonds. Furthermore, the tree can provide shade in the hot African sun, offering a place for the giraffes to rest and regulate their body temperature.", "category": "complex"}
+{"question_id": 36, "text": "The main focus of the image is the head of a giraffe, shown in a close-up headshot.", "category": "conv"}
+{"question_id": 37, "text": "The image is a clear and colorful close up of a giraffe's head, showcasing its beautiful features. The giraffe is standing tall among several bright green trees, likely in its natural habitat. It appears to be chewing on some leaves, while its face is turned to the side, providing a detailed view of its profile.", "category": "detail"}
+{"question_id": 38, "text": "From the image, it can be inferred that the giraffe's habitat is likely a natural environment with an abundance of greenery. The giraffe is standing tall among several bright green trees, which suggests that the area is rich in vegetation and provides plenty of food sources for the giraffe, such as leaves. This natural habitat is likely suitable for giraffes to thrive and maintain their unique characteristics, such as their long necks, which are adapted for reaching leaves on tall trees.", "category": "complex"}
+{"question_id": 39, "text": "The art installation includes a display of large artworks, such as an oversized bench, painted feline heads, and potentially other artistic sculptures or images.", "category": "conv"}
+{"question_id": 40, "text": "The image features an art installation placed on a city street. The display consists of a large, oversized wooden bench and two massive animal art sculptures, which appear to be painted feline heads. \n\nThere are several people in the scene, with two people sitting near the art display, one person standing nearby, and another person further to the left. A parked car is visible behind the bench, close to the edge of the street. Additionally, a backpack can be seen on the ground, closer to the left side of the bench.", "category": "detail"}
+{"question_id": 41, "text": "The art installation, featuring an oversized wooden bench and two massive animal art sculptures, encourages interaction among people by providing a visually intriguing and inviting space to sit or gather. People are naturally drawn to the unique and whimsical nature of these installations, sparking curiosity and conversation among those who encounter them. The oversized bench provides seating for multiple individuals, prompting people to sit down, rest, or socialize while admiring the giant painted feline heads and other artworks on display. Additionally, the installations capture the attention of passersby, creating a shared experience and a sense of community among those who visit the area. The artistic environment stimulates discussion and appreciation of the art, facilitating social interactions and connections among those who gather near the sculptures and oversized bench.", "category": "complex"}
+{"question_id": 42, "text": "The main focus of the image is a car parked on the side of the road, next to a parking meter.", "category": "conv"}
+{"question_id": 43, "text": "The scene features a black car parked on the side of the road next to a parking meter. There is another parking meter slightly further ahead of the car. The car is parked in front of a building, which seems to be the destination for the driver. \n\nAdditionally, there are other vehicles in the image. A gray car is situated further behind the black car, and a truck can be seen in the background on the right side. Moreover, there is another vehicle barely visible on the left side of the scene.", "category": "detail"}
+{"question_id": 44, "text": "The most plausible reason for the car to be parked on the side of the road is that it is utilizing the available parking spot with parking meters. In the image, the car is parked next to two parking meters, which indicates that the driver has parked the car to follow parking regulations and likely paid for their parking session. The car might be parked there so the driver or passengers can go to a nearby building, shop, or participate in any other activities in the area. Parking meters are commonly used in urban and commercial areas to manage and regulate limited street parking spaces, and it's reasonable to assume the driver is abiding by these parking rules.", "category": "complex"}
+{"question_id": 45, "text": "The unusual aspect of the elderly man's facial appearance in the image is that he has the head of a toothbrush under his nose, which mimics a mustache, specifically a toothbrush mustache.", "category": "conv"}
+{"question_id": 46, "text": "The image features an elderly man humorously wearing the head of a toothbrush under his nose, creating the illusion of a toothbrush mustache. Since the toothbrush head is positioned like a mustache, it is reminiscent of a toothbrush mustache caricature often associated with Hitler. The man takes up most of the frame, appearing front and center in the scene.", "category": "detail"}
+{"question_id": 47, "text": "The elderly man wearing the head of a toothbrush under his nose as a moustache could be conveying a satirical or humorous message. By mimicking a toothbrush mustache, he may be making light of or criticizing a specific historical figure or style associated with the toothbrush mustache, such as Adolf Hitler. Using a toothbrush instead of an actual mustache allows him to dramatically emphasize the point and create a lighthearted or comical effect. However, it's essential to consider cultural and social contexts when interpreting such unconventional expressions, as the message may not always be clear or universally accepted.", "category": "complex"}
+{"question_id": 48, "text": "There are two large passenger airplanes visible in the image.", "category": "conv"}
+{"question_id": 49, "text": "The scene captures a picturesque view of two large passenger jets, one in the air and one on the ground. They are flying over a beautiful beach filled with various birds. Many birds can be seen scattered across the scene, both near the water and around the area. The blue water in the background adds a calming touch to the image.", "category": "detail"}
+{"question_id": 50, "text": "The presence of two large passenger jets flying in close proximity to the beach filled with birds can have a negative impact on the avian population. The noise generated by the engines can be stressful and disruptive for the birds, potentially resulting in altered behaviors, flight patterns, or even relocation from the area. In addition, the airplanes could pose a risk of bird strikes, in which birds collide with the planes, leading to injuries or fatalities for the birds and damage to the aircraft. This highlights the importance of maintaining safe distances between airplanes and wildlife habitats in order to minimize the risk of adverse effects on both the environment and aviation.", "category": "complex"}
+{"question_id": 51, "text": "The cat is curled up and sleeping on the red couch.", "category": "conv"}
+{"question_id": 52, "text": "A tan and black cat is curled up and comfortably asleep on a vibrant red couch. The cat is situated towards the center of the plush seat. The red couch itself is prominent in the image, occupying a large portion of the frame. The peaceful sleeping cat adds a sense of warmth and coziness to the scene.", "category": "detail"}
+{"question_id": 53, "text": "The cat may have chosen to sleep on the red couch for several reasons. The couch appears to be plush and comfortable, providing a soft surface for the cat to curl up and maintain its body heat. Cats also prefer quiet and low-traffic areas to rest. The couch may be located in such an area, providing a sense of security for the cat while sleeping. Furthermore, felines are known to be attracted to soft and warm materials, making the red velvet seat an appealing spot for relaxation. It's also common for cats to sleep in locations familiar to them or near their human companions, so the couch might be a place where the cat often spends time or feels safe.", "category": "complex"}
+{"question_id": 54, "text": "An old Volkswagen Bug is parked next to the stop sign in the image.", "category": "conv"}
+{"question_id": 55, "text": "In the scene, a woman is standing in front of a window, observing the street outside. On the street, a vintage Volkswagen Bug is parked near a stop sign, which is located towards the center of the image. Several other cars occupy the street, including one positioned to the left of the Volkswagen and two others in the background. On the far left and far right sides of the image, there are additional cars partially visible. The street also features a tree growing near the stop sign, adding a touch of greenery to the urban setting.", "category": "detail"}
+{"question_id": 56, "text": "From the image, it can be inferred that the traffic situation in the area is relatively light or calm. There is an old VW Bug parked near a stop sign, and a few other cars are also visible on the street. However, there doesn't seem to be any congestion or significant traffic. The presence of the stop sign indicates that there is some traffic regulation and order in place to manage vehicle movement, contributing to the overall calmness of the area. Moreover, no people are seen near the stop sign, which further suggests that there might be low pedestrian activity at the time the picture was taken.", "category": "complex"}
+{"question_id": 57, "text": "The man in the image is using his cell phone and giving a thumbs up, possibly posing for a picture.", "category": "conv"}
+{"question_id": 58, "text": "The image depicts a group of people gathered inside a room, creating a lively social atmosphere. A man is prominently featured in the scene, holding a cell phone to his ear and smiling with a thumbs-up gesture, possibly posing for a picture or simply enjoying the conversation.\n\nThere are multiple other people in the room, adding to the gathering and engaging in various conversations. The scene also has two cups visible, one on the left side of the room and another on the right side, indicating that the gathering may be a casual social event.", "category": "detail"}
+{"question_id": 59, "text": "It is likely that the man is giving a thumbs-up gesture while talking on the phone to express a positive emotion or to signal agreement, approval, or satisfaction with the conversation he is having. This non-verbal communication can be conveyed to the people around him or to the person taking the picture of him while on the phone call. The man might be receiving good news, agreeing with the person on the phone, or sharing his excitement with the group of people surrounding him in the room. The specific reason cannot be precisely determined from the image alone, but it does indicate a positive sentiment associated with the phone call.", "category": "complex"}
+{"question_id": 60, "text": "The dog in the image is a husky.", "category": "conv"}
+{"question_id": 61, "text": "The image shows a large, adorable husky dog sleeping peacefully on a dog bed in a room. The room has a somewhat dark ambiance, making the scene feel cozy and comfortable. \n\nIn the same room, there are two chairs, one positioned towards the center and another one closer to the right side. Additionally, there are two potted plants, one situated slightly behind and to the right of the dog and the other placed further to the right in the room. The presence of these elements gives the room a warm, inviting atmosphere.", "category": "detail"}
+{"question_id": 62, "text": "The dog owner seems to have taken several measures to make their dog feel comfortable in the room. Based on the image, an adorable husky dog has a designated dog bed to sleep in, providing cushioning and a cozy space for rest. The room is also dimly lit or dark, which contributes to a calmer atmosphere that promotes sleep and relaxation. There might be other considerations, such as room temperature or air circulation, that aid in creating a comfortable environment for the dog. It's essential for pet owners to ensure that their pets have a secure and comfortable space to rest in order to maintain their well-being and happiness.", "category": "complex"}
+{"question_id": 63, "text": "The color of the elephant in the image is grey.", "category": "conv"}
+{"question_id": 64, "text": "The image depicts a baby elephant standing alone inside an enclosure. The elephant is positioned towards the center of the pen with a large sand dune, or dirt mount, nearby. In the background, there is a brick building that serves as part of the exhibit. The grey elephant stands calmly, occupying the majority of the scene in this pen.", "category": "detail"}
+{"question_id": 65, "text": "A potential concern for the baby elephant standing alone inside the pen could be the lack of social interaction with other elephants. Elephants are highly social creatures and form strong bonds within their family units. The absence of other elephants in the enclosure might lead to feelings of loneliness, stress, and anxiety for the baby elephant. It is crucial for the mental and emotional well-being of elephants, especially young ones, to have regular opportunities to interact with their peers and family members in a naturalistic, supportive environment. By addressing this concern, the caretakers of the baby elephant can help ensure its psychological well-being and overall health.", "category": "complex"}
+{"question_id": 66, "text": "On the orange plate, there are a burger or sandwich, a bowl of broccoli, and a pickle.", "category": "conv"}
+{"question_id": 67, "text": "The image features a dining table with an orange plate in the center. On the plate, there is a half of a sandwich, a pickle, and a bowl of broccoli, creating a visually appealing and mouth-watering combination. A fork is also placed alongside the plate, ready to be used for enjoying the meal. Several individual pieces of broccoli can be seen around the bowl, adding variety and color to the scene.", "category": "detail"}
+{"question_id": 68, "text": "Based on the image, it can be inferred that the individual has a balanced approach to their dietary choices. The meal on the table consists of a sandwich, a bowl of broccoli, and a pickle on an orange plate. The sandwich likely provides carbohydrates and proteins, while the broccoli offers essential vitamins, minerals, and fiber. The pickle adds some flavor and tanginess to the meal. This combination of foods illustrates that the person is interested in having a diverse and nutritious diet, incorporating both proteins and vegetables in their meal. Additionally, the presence of a fork suggests that they prefer using cutlery while eating, which can be seen as a more cultured and organized approach to dining.", "category": "complex"}
+{"question_id": 69, "text": "A white duck is pictured in the image.", "category": "conv"}
+{"question_id": 70, "text": "The image features a single white bird, likely a duck or a goose, walking along a paved road. The bird appears to be closely examining the ground as it walks. The road runs adjacent to a patch of grass, providing a contrasting backdrop for the scene. The bird occupies a predominant part of the image, offering a close-up view of its action.", "category": "detail"}
+{"question_id": 71, "text": "While walking along the paved road, the duck may face various challenges. Firstly, the duck could be at risk of encountering vehicles, which might not notice or expect to see wildlife on the road. This could put the duck in danger of being hit or injured by passing traffic. Secondly, the duck might struggle to find appropriate food sources on or near the paved road, as opposed to its natural habitat, such as a body of water or a grassy area, where it would have easier access to its regular diet.\n\nFurthermore, the duck might be exposed to potential predators, like domestic pets or other animals that live in urban environments, increasing the risk of harm for the duck. Lastly, walking on the paved concrete surface could be less comfortable for the duck than walking on softer natural ground, causing discomfort over time. Overall, the duck's safety and well-being may be compromised by navigating this environment that is not adapted to its natural behaviors and needs.", "category": "complex"}
+{"question_id": 72, "text": "Yes, the little boy is wearing a helmet to ensure his safety while riding his bike.", "category": "conv"}
+{"question_id": 73, "text": "The image depicts a little boy riding his bike on a road. He is practicing safety by wearing a yellow helmet while cycling. The boy is visible from a side angle, capturing both him and his bicycle in motion.", "category": "detail"}
+{"question_id": 74, "text": "The little boy's precaution of wearing a helmet is noteworthy because it shows that the child or their guardian is aware of and strictly adhering to safety precautions. Wearing a helmet while riding a bicycle is crucial for reducing the risk of severe head injuries in the event of a crash, fall, or any other unforeseen incident. It is especially important for children, who are often more prone to accidents due to their limited experience with balancing and negotiating obstacles. By wearing a helmet, the boy is demonstrating responsible behavior and setting a positive example for others to follow, promoting a culture of safety awareness.", "category": "complex"}
+{"question_id": 75, "text": "There are three people visible in the image - two women and one man.", "category": "conv"}
+{"question_id": 76, "text": "The image shows a group of people at a celebration, likely participating in a wine tasting event. There are a total of six people, with three young adults prominently in the foreground. A man wearing glasses stands in the middle, flanked by a couple of women on either side. They are all holding wine glasses, enjoying the event and smiling.\n\nAt the table, there are several wine bottles dispersed along its surface, showing the variety of options available for tasting. Additionally, there is a handbag placed on the table, possibly belonging to one of the women attending the event. \n\nTwo more people are visible in the background. They are smiling and making conversation while engaging in the celebration with the others.", "category": "detail"}
+{"question_id": 77, "text": "It appears that the group of people are gathered for a wine tasting or celebration in which they are enjoying various types of wine. They are seen standing and sitting around a table with several wine bottles and holding wine glasses. Their smiles and the friendly atmosphere indicate that it's a social event where they have gathered to indulge in sampling different wines, engage in conversations, and enjoy each other's company.", "category": "complex"}
+{"question_id": 78, "text": "The main object featured in the image is the clock mounted on a pole or a short tower, standing on the sidewalk next to a building.", "category": "conv"}
+{"question_id": 79, "text": "The image features a quiet street with a tall clock mounted on a pole at the side of the pavement, near a building. The pole stands prominently on the empty sidewalk, offering a convenient time-keeping source for pedestrians.\n\nThere are a few potted plants along the sidewalk, adding a touch of greenery to the scene. Bicycles can also be seen parked nearby, and a couple of cars are parked further down the street. A person is walking in the background, adding some life to the otherwise calm environment.", "category": "detail"}
+{"question_id": 80, "text": "As you walk down this quiet street, you can easily tell the time by looking at the tall clock that is mounted on a pole along the sidewalk. This clock stands out significantly, making it easily noticeable. Regarding the direction of the sidewalk, the presence of potted plants, bicycles, and cars suggest that this is a well-maintained and organized street. As you follow the sidewalk further, you will see that it is lined with trees, potted plants, and parked bicycles and cars, guiding you in the direction it leads. Additionally, the building next to the clock also gives you context for the orientation and direction of the street.", "category": "complex"}
+{"question_id": 81, "text": "The person is riding a motorbike, which can also be referred to as a motorcycle or a moped. In this case, the terms motorcycle, scooter, and moped are used interchangeably to describe the same vehicle the woman is riding.", "category": "conv"}
+{"question_id": 82, "text": "The scene showcases a dirt road, with a woman riding a motorcycle in the middle of the road. She is accompanied by a man walking alongside her, with another man further down the road. Various animals, including three cows, are also present in the scene, suggesting a rural setting. The main cow is positioned to the right of the motorcycle while two smaller cows can be seen on the left side of the road. The people and animals all appear to be going in the same direction down the dirt road.", "category": "detail"}
+{"question_id": 83, "text": "The woman on the motorcycle might face challenges navigating the dirt road, especially because of the presence of multiple objects and living beings on the road. In the image, a cow is nearby, along with two other smaller cows, two men, and a dog. Dirt roads can be uneven, dusty, and unpredictable, making it important for the woman to maneuver cautiously to avoid any potential accidents. Additionally, she needs to be aware of the animals and people around her to ensure both her safety and the safety of others. This situation requires her to be vigilant, slow down when necessary, and carefully navigate through the environment.", "category": "complex"}
+{"question_id": 84, "text": "The skateboard in the image is in an upside-down position, with its wheels pointing up and laying on the ground.", "category": "conv"}
+{"question_id": 85, "text": "In the image, a person wearing camo shorts is standing on a cement surface next to an overturned skateboard. The skateboard is laying upside down with its wheels pointing up, giving a sense of it having been flipped or potentially used for a trick. The boy appears to be standing in front of the skateboard, engaging with it or possibly preparing for his next move.", "category": "detail"}
+{"question_id": 86, "text": "It's plausible that prior to this moment, the person standing next to the overturned skateboard might have attempted a trick or maneuver on the skateboard, which resulted in either a failed attempt or a wipeout. As a consequence, the skateboard ended up upside down on the ground, with its wheels pointed upward. The person, possibly the skateboarder, is standing next to it, possibly pondering their next move, contemplating a retry or assessing any potential damage to the skateboard.", "category": "complex"}
+{"question_id": 87, "text": "The man and woman are wearing umbrella hats, which are designed to provide protection from the rain.", "category": "conv"}
+{"question_id": 88, "text": "The image depicts a man and a woman sitting on top of a bridge, both wearing umbrella hats. The quirky umbrella hats provide them with some cover from the rainy weather. A dog is accompanying the couple, taking a break with them on the bridge. \n\nThe surrounding area has a view of the ocean, and there are boats visible in the water. One boat can be found close to the right edge of the scene, while two other boats are in the middle and the far left side of the frame. Additional umbrellas can be spotted in the area, signifying that other pedestrians have also sought shelter from the rain.", "category": "detail"}
+{"question_id": 89, "text": "On this rainy day, the umbrella hats provide the couple several benefits as they sit on top of a bridge near the ocean with their dog. First, the umbrella hats enable them to keep their hands free, allowing them to hold their dog's leash, interact with each other, and perform other tasks without holding a traditional umbrella. Second, the umbrella hats protect their heads and upper bodies from getting wet due to rain, making their outdoor experience more comfortable. Additionally, the novel and fun nature of the umbrella hats can be a conversation starter and bring some humor to their day despite the gloomy weather.", "category": "complex"}

ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_questions.jsonl ADDED Viewed

	@@ -0,0 +1,90 @@

+{"question_id": 0, "image": "COCO_val2014_000000441147.jpg", "text": "What is the color of the two suitcases in the image?", "category": "conv"}
+{"question_id": 1, "image": "COCO_val2014_000000441147.jpg", "text": "Analyze the image in a comprehensive and detailed manner.", "category": "detail"}
+{"question_id": 2, "image": "COCO_val2014_000000441147.jpg", "text": "What potential factors could make these suitcases valuable?", "category": "complex"}
+{"question_id": 3, "image": "COCO_val2014_000000353536.jpg", "text": "What are the main objects on the table in the image?", "category": "conv"}
+{"question_id": 4, "image": "COCO_val2014_000000353536.jpg", "text": "Describe the following image.", "category": "detail"}
+{"question_id": 5, "image": "COCO_val2014_000000353536.jpg", "text": "What activity might have recently taken place around this table and what could be its aftermath?", "category": "complex"}
+{"question_id": 6, "image": "COCO_val2014_000000506095.jpg", "text": "What is the main focus of the image?", "category": "conv"}
+{"question_id": 7, "image": "COCO_val2014_000000506095.jpg", "text": "What is this photo about'?", "category": "detail"}
+{"question_id": 8, "image": "COCO_val2014_000000506095.jpg", "text": "What could be a reason for the cat's interest in the laptop?", "category": "complex"}
+{"question_id": 9, "image": "COCO_val2014_000000514915.jpg", "text": "What color is the dog in the image?", "category": "conv"}
+{"question_id": 10, "image": "COCO_val2014_000000514915.jpg", "text": "Describe the following image.", "category": "detail"}
+{"question_id": 11, "image": "COCO_val2014_000000514915.jpg", "text": "Why might the dog have chosen to lay down on the floor in this location?", "category": "complex"}
+{"question_id": 12, "image": "COCO_val2014_000000056013.jpg", "text": "Where is the cart with luggage bags located?", "category": "conv"}
+{"question_id": 13, "image": "COCO_val2014_000000056013.jpg", "text": "Describe the following image.", "category": "detail"}
+{"question_id": 14, "image": "COCO_val2014_000000056013.jpg", "text": "What challenges might the staff encounter with handling luggage in the lobby?", "category": "complex"}
+{"question_id": 15, "image": "COCO_val2014_000000408439.jpg", "text": "What is the body of water seen in the image?", "category": "conv"}
+{"question_id": 16, "image": "COCO_val2014_000000408439.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
+{"question_id": 17, "image": "COCO_val2014_000000408439.jpg", "text": "What are potential reasons for the train to be traveling in such a scenic route?", "category": "complex"}
+{"question_id": 18, "image": "COCO_val2014_000000164255.jpg", "text": "What is the main object in the image?", "category": "conv"}
+{"question_id": 19, "image": "COCO_val2014_000000164255.jpg", "text": "What do you think is going on in this snapshot?", "category": "detail"}
+{"question_id": 20, "image": "COCO_val2014_000000164255.jpg", "text": "What purpose does the umbrella serve on the beach?", "category": "complex"}
+{"question_id": 21, "image": "COCO_val2014_000000034096.jpg", "text": "What stage is the house in regarding its construction?", "category": "conv"}
+{"question_id": 22, "image": "COCO_val2014_000000034096.jpg", "text": "What is this photo about'?", "category": "detail"}
+{"question_id": 23, "image": "COCO_val2014_000000034096.jpg", "text": "What may be a potential obstacle for the house construction in its current state?", "category": "complex"}
+{"question_id": 24, "image": "COCO_val2014_000000385873.jpg", "text": "How many pizzas are there in the image?", "category": "conv"}
+{"question_id": 25, "image": "COCO_val2014_000000385873.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
+{"question_id": 26, "image": "COCO_val2014_000000385873.jpg", "text": "What are the possible challenges one might face when dividing these pizzas equitably among a group of people?", "category": "complex"}
+{"question_id": 27, "image": "COCO_val2014_000000367571.jpg", "text": "How many doughnuts are in the box?", "category": "conv"}
+{"question_id": 28, "image": "COCO_val2014_000000367571.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
+{"question_id": 29, "image": "COCO_val2014_000000367571.jpg", "text": "What unique taste combinations can be found in this box of doughnuts?", "category": "complex"}
+{"question_id": 30, "image": "COCO_val2014_000000319432.jpg", "text": "What is the man doing in the image?", "category": "conv"}
+{"question_id": 31, "image": "COCO_val2014_000000319432.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
+{"question_id": 32, "image": "COCO_val2014_000000319432.jpg", "text": "Why might the man be holding the clothes on hangers?", "category": "complex"}
+{"question_id": 33, "image": "COCO_val2014_000000225738.jpg", "text": "How many giraffes are depicted in the image?", "category": "conv"}
+{"question_id": 34, "image": "COCO_val2014_000000225738.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
+{"question_id": 35, "image": "COCO_val2014_000000225738.jpg", "text": "Why might these giraffes be gathering near the same tree?", "category": "complex"}
+{"question_id": 36, "image": "COCO_val2014_000000092109.jpg", "text": "What is the main focus of the image?", "category": "conv"}
+{"question_id": 37, "image": "COCO_val2014_000000092109.jpg", "text": "What is this photo about'?", "category": "detail"}
+{"question_id": 38, "image": "COCO_val2014_000000092109.jpg", "text": "What can be inferred about the giraffe's habitat from this image?", "category": "complex"}
+{"question_id": 39, "image": "COCO_val2014_000000506483.jpg", "text": "What kind of objects are included in the art installation?", "category": "conv"}
+{"question_id": 40, "image": "COCO_val2014_000000506483.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
+{"question_id": 41, "image": "COCO_val2014_000000506483.jpg", "text": "How might the art installation encourage interaction among people?", "category": "complex"}
+{"question_id": 42, "image": "COCO_val2014_000000097131.jpg", "text": "What is the main focus of the image?", "category": "conv"}
+{"question_id": 43, "image": "COCO_val2014_000000097131.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
+{"question_id": 44, "image": "COCO_val2014_000000097131.jpg", "text": "What might be the reason for the car to be parked on the side of the road?", "category": "complex"}
+{"question_id": 45, "image": "COCO_val2014_000000052312.jpg", "text": "What is unusual about the elderly man's facial appearance in the image?", "category": "conv"}
+{"question_id": 46, "image": "COCO_val2014_000000052312.jpg", "text": "What's happening in the scene?", "category": "detail"}
+{"question_id": 47, "image": "COCO_val2014_000000052312.jpg", "text": "What message could be conveyed by the man's unusual facial adornment?", "category": "complex"}
+{"question_id": 48, "image": "COCO_val2014_000000258285.jpg", "text": "How many airplanes are visible in the image?", "category": "conv"}
+{"question_id": 49, "image": "COCO_val2014_000000258285.jpg", "text": "What do you see happening in this image?", "category": "detail"}
+{"question_id": 50, "image": "COCO_val2014_000000258285.jpg", "text": "What environmental impact might these planes' activity have on the birds?", "category": "complex"}
+{"question_id": 51, "image": "COCO_val2014_000000081552.jpg", "text": "What is the cat doing on the red couch?", "category": "conv"}
+{"question_id": 52, "image": "COCO_val2014_000000081552.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
+{"question_id": 53, "image": "COCO_val2014_000000081552.jpg", "text": "What factors may have led the cat to choose this location for sleep?", "category": "complex"}
+{"question_id": 54, "image": "COCO_val2014_000000066144.jpg", "text": "What type of car is parked next to the stop sign in the image?", "category": "conv"}
+{"question_id": 55, "image": "COCO_val2014_000000066144.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
+{"question_id": 56, "image": "COCO_val2014_000000066144.jpg", "text": "What can be inferred about the traffic situation in the area?", "category": "complex"}
+{"question_id": 57, "image": "COCO_val2014_000000203629.jpg", "text": "What is the man doing in the image?", "category": "conv"}
+{"question_id": 58, "image": "COCO_val2014_000000203629.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
+{"question_id": 59, "image": "COCO_val2014_000000203629.jpg", "text": "What might be the reason behind the man's thumbs-up gesture while on the phone?", "category": "complex"}
+{"question_id": 60, "image": "COCO_val2014_000000109532.jpg", "text": "What breed is the dog in the image?", "category": "conv"}
+{"question_id": 61, "image": "COCO_val2014_000000109532.jpg", "text": "What are the key elements in this picture?", "category": "detail"}
+{"question_id": 62, "image": "COCO_val2014_000000109532.jpg", "text": "What measures may be taken by the dog's owner to make the dog feel comfortable in the room?", "category": "complex"}
+{"question_id": 63, "image": "COCO_val2014_000000431165.jpg", "text": "What is the color of the elephant in the image?", "category": "conv"}
+{"question_id": 64, "image": "COCO_val2014_000000431165.jpg", "text": "What do you think is going on in this snapshot?", "category": "detail"}
+{"question_id": 65, "image": "COCO_val2014_000000431165.jpg", "text": "What could be a potential concern for the baby elephant in this setting?", "category": "complex"}
+{"question_id": 66, "image": "COCO_val2014_000000020650.jpg", "text": "What type of food items are present on the orange plate?", "category": "conv"}
+{"question_id": 67, "image": "COCO_val2014_000000020650.jpg", "text": "What are the key elements in this picture?", "category": "detail"}
+{"question_id": 68, "image": "COCO_val2014_000000020650.jpg", "text": "What can be inferred about the individual's dietary choices?", "category": "complex"}
+{"question_id": 69, "image": "COCO_val2014_000000205183.jpg", "text": "What type of bird is pictured in the image?", "category": "conv"}
+{"question_id": 70, "image": "COCO_val2014_000000205183.jpg", "text": "What do you see happening in this image?", "category": "detail"}
+{"question_id": 71, "image": "COCO_val2014_000000205183.jpg", "text": "What challenges could the duck face while walking along the paved road?", "category": "complex"}
+{"question_id": 72, "image": "COCO_val2014_000000210299.jpg", "text": "Is the little boy wearing any safety gear while riding his bike?", "category": "conv"}
+{"question_id": 73, "image": "COCO_val2014_000000210299.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
+{"question_id": 74, "image": "COCO_val2014_000000210299.jpg", "text": "Why is the little boy's precaution noteworthy?", "category": "complex"}
+{"question_id": 75, "image": "COCO_val2014_000000515716.jpg", "text": "How many people are visible in the image?", "category": "conv"}
+{"question_id": 76, "image": "COCO_val2014_000000515716.jpg", "text": "What do you see happening in this image?", "category": "detail"}
+{"question_id": 77, "image": "COCO_val2014_000000515716.jpg", "text": "What might be the reason behind this gathering?", "category": "complex"}
+{"question_id": 78, "image": "COCO_val2014_000000460149.jpg", "text": "What is the main object featured in the image?", "category": "conv"}
+{"question_id": 79, "image": "COCO_val2014_000000460149.jpg", "text": "Describe the following image.", "category": "detail"}
+{"question_id": 80, "image": "COCO_val2014_000000460149.jpg", "text": "Imagine you are walking down this street; how would you know the time or which direction the sidewalk leads?", "category": "complex"}
+{"question_id": 81, "image": "COCO_val2014_000000293505.jpg", "text": "Is the person riding a motorbike or scooter?", "category": "conv"}
+{"question_id": 82, "image": "COCO_val2014_000000293505.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
+{"question_id": 83, "image": "COCO_val2014_000000293505.jpg", "text": "What challenges might the woman on the motorcycle face in this situation?", "category": "complex"}
+{"question_id": 84, "image": "COCO_val2014_000000525439.jpg", "text": "What is the position of the skateboard in the image?", "category": "conv"}
+{"question_id": 85, "image": "COCO_val2014_000000525439.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
+{"question_id": 86, "image": "COCO_val2014_000000525439.jpg", "text": "What might have happened prior to this moment?", "category": "complex"}
+{"question_id": 87, "image": "COCO_val2014_000000534270.jpg", "text": "What type of hats are the man and woman wearing?", "category": "conv"}
+{"question_id": 88, "image": "COCO_val2014_000000534270.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
+{"question_id": 89, "image": "COCO_val2014_000000534270.jpg", "text": "What benefits do umbrella hats provide to the couple on this rainy day?", "category": "complex"}

ChatUniVi/eval/questions/coco_pope/coco_pope_adversarial.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/coco_pope/coco_pope_popular.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/coco_pope/coco_pope_random.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/scienceqa/pid_splits.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/scienceqa/test_QCM-LEA.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/video_qa/activitynet_a_list.json ADDED Viewed

	@@ -0,0 +1,2105 @@

+[
+  "no",
+  "yes",
+  "day",
+  "outdoor",
+  "good looking",
+  "bit dangerous",
+  "secondary",
+  "simple",
+  "much simpler",
+  "more difficult",
+  "difficulty",
+  "low",
+  "unsafe",
+  "sunny",
+  "cloudy",
+  "2",
+  "1",
+  "5",
+  "3",
+  "4",
+  "6",
+  "11",
+  "7",
+  "8",
+  "female",
+  "male",
+  "indoor",
+  "shopkeepers and customers",
+  "friend",
+  "brother",
+  "bath towel",
+  "tie rope",
+  "combat training",
+  "play football on table",
+  "chipping potatoes",
+  "dishwashing",
+  "skateboard with water",
+  "defoliation",
+  "brush ones teeth",
+  "baton tennis",
+  "play ball",
+  "performing martial arts",
+  "play",
+  "dance",
+  "gargle",
+  "slider",
+  "wine dressing",
+  "rowing",
+  "pacify horse",
+  "drum",
+  "cook dish",
+  "do floor exercises",
+  "match",
+  "gymnastics in parallel bars",
+  "be referee",
+  "dishware",
+  "skin care",
+  "train",
+  "hammer throw",
+  "draw eyes shadow",
+  "wash horse",
+  "car wash",
+  "play musical instrument",
+  "kayaking",
+  "skiing",
+  "make cake",
+  "sweep snow for cars",
+  "paracentesis",
+  "drink",
+  "practice martial arts",
+  "make cold dishes",
+  "contain water",
+  "put shot",
+  "bathe horse",
+  "making cocktails",
+  "clean room",
+  "wipe glass",
+  "shovel snow",
+  "make spaghetti",
+  "dig up soil",
+  "cut tomato",
+  "tattoo",
+  "boxing",
+  "bodybuilding",
+  "sailing competition",
+  "hopping hula hoop",
+  "somersault",
+  "paint fingernails",
+  "make sandwich",
+  "introduce tool",
+  "walk dog",
+  "make ice cream",
+  "manicure",
+  "make hair",
+  "brushing horse",
+  "wrapping paper",
+  "play game",
+  "wear shoe",
+  "play with sand",
+  "chopping wood",
+  "smear sunscreen",
+  "weightlifting",
+  "do some washing",
+  "assemble bicycle",
+  "bathe dog",
+  "shoe polish",
+  "fire",
+  "sharpening",
+  "wear contact lenses",
+  "parade",
+  "brush shoes",
+  "cleaning deciduous leaves",
+  "wash clothes",
+  "play tennis",
+  "clink with dog",
+  "tires dismantling",
+  "snow removal",
+  "squeezed juice",
+  "play table tennis",
+  "run",
+  "diving",
+  "decorating christmas tree",
+  "shooting",
+  "pruning branches",
+  "javelin throw",
+  "shuttlecock",
+  "wallpaper",
+  "practice stick method",
+  "control remote control ship",
+  "assembling tires",
+  "sweeping snow",
+  "sailing",
+  "cut nails for cat",
+  "assembly slate",
+  "push ice pot",
+  "physical exercise",
+  "assemble christmas tree",
+  "hair removal",
+  "surfing",
+  "making lollipops",
+  "bungee jumping",
+  "weed",
+  "handcraft",
+  "clean glass",
+  "loading chain",
+  "play magic cube",
+  "landscaping",
+  "play volleyball",
+  "play harmonica",
+  "cooking",
+  "shave legs",
+  "taking something",
+  "perform",
+  "drinking",
+  "play weeding machine",
+  "clean desk",
+  "make drink",
+  "peeling sweet potatoes",
+  "licensing",
+  "pole vault",
+  "bowling",
+  "skip lattices",
+  "introduction of lawn mower",
+  "hula hoop",
+  "play bounce shoes",
+  "yoga teaching",
+  "throwing discus",
+  "balance beam",
+  "flute",
+  "fly kite",
+  "leaping",
+  "finger guessing game",
+  "blow sax",
+  "comb ones hair",
+  "make face",
+  "weave hair",
+  "chat",
+  "sunscreen",
+  "photograph",
+  "trailing ground",
+  "iron clothes",
+  "chipping",
+  "ride camel",
+  "demonstrating",
+  "riding",
+  "make light",
+  "paint fence",
+  "making sandwiches",
+  "discus throwing",
+  "gymnastics on horizontal bar",
+  "water gliding",
+  "shot put",
+  "kicking sand",
+  "fertilizer application",
+  "strap rope skipping",
+  "play baseball",
+  "interview",
+  "sit on ground",
+  "be pulled forward",
+  "shave",
+  "teaching",
+  "play computer games",
+  "play football",
+  "teaching martial arts",
+  "play jumper",
+  "help to cut hair",
+  "table tennis",
+  "play bumper car",
+  "skateboard",
+  "bike ride",
+  "paddle",
+  "tire repair",
+  "long jump",
+  "swimming",
+  "shine shoes",
+  "broadcast news",
+  "scavenging leaves",
+  "sail",
+  "training dog",
+  "use of vacuum cleaner",
+  "wrist wrestling",
+  "blow ones hair",
+  "field shooting",
+  "ride on bicycle",
+  "rock climbing",
+  "play slide",
+  "high jump",
+  "drive car",
+  "brush wall",
+  "skateboarding",
+  "slippery pulley",
+  "playing harmonica",
+  "cook",
+  "play beach volleyball",
+  "explain ice hockey",
+  "pray",
+  "play basketball",
+  "tug of war",
+  "dismantling gifts",
+  "football match",
+  "laying tile",
+  "explain",
+  "play with smarthphone",
+  "hair care",
+  "spruce up",
+  "gift of gift",
+  "throwing darts",
+  "play tai chi",
+  "play parallel bars",
+  "turn magic cube",
+  "draw",
+  "watch tv",
+  "dog",
+  "dog and cat",
+  "competition archery",
+  "throwing javelin",
+  "bicycle race",
+  "throw hammer",
+  "throwing discuses",
+  "training",
+  "mountain bike ride",
+  "ice curling",
+  "exercise gymnastics",
+  "water polo",
+  "turn somersault",
+  "gymnastics",
+  "throw ball",
+  "wrestling",
+  "enhanced collaboration",
+  "black",
+  "yellow",
+  "dodgeball",
+  "eat ice cream",
+  "haystack",
+  "kindling wood",
+  "smoking",
+  "tree planting",
+  "curly",
+  "make skis",
+  "play violin",
+  "grooming",
+  "shoes polish",
+  "cut persons head",
+  "by cable car",
+  "wire walking",
+  "do yoga",
+  "masonry wall",
+  "clean dog",
+  "push car",
+  "rope skipping",
+  "leather shoes",
+  "measurement length",
+  "car repair tires",
+  "pruning trees",
+  "tread on stilts",
+  "paint chair",
+  "tilling",
+  "dance hip hop",
+  "drink beer",
+  "dive to feed fish",
+  "windsurfing",
+  "boxing practice",
+  "haircut",
+  "ride bicycle",
+  "do shape packing",
+  "play piano",
+  "play hockey",
+  "eat something",
+  "rubbing feet",
+  "sleep",
+  "play with phone",
+  "light roof",
+  "ride bike",
+  "wash hands",
+  "play guitar",
+  "shoe rope",
+  "throw spear",
+  "fight",
+  "play music",
+  "prepare for dive",
+  "singing",
+  "tread",
+  "face",
+  "hockey game",
+  "helmet",
+  "hat",
+  "play billiards",
+  "knit sweater",
+  "make salad",
+  "use stick",
+  "skiing leather ring",
+  "light cigarette",
+  "tree pruning",
+  "tire change",
+  "performing gymnastics",
+  "play roller skating",
+  "wheel change",
+  "dance tai chi sword",
+  "wrists",
+  "be manicure",
+  "make sandwich biscuits",
+  "reaction capacity",
+  "gray",
+  "blue",
+  "green",
+  "red",
+  "white",
+  "black and white",
+  "orange",
+  "pink",
+  "purple",
+  "brown",
+  "golden",
+  "dynamic bicycle",
+  "white glove",
+  "glove",
+  "rest",
+  "lying on ground",
+  "introduce people",
+  "motorcycle jump",
+  "jump up",
+  "take cell phone",
+  "play with mud",
+  "and then drums",
+  "audience left",
+  "audience came to court",
+  "raise hand to celebrate",
+  "cheer",
+  "throw snowball",
+  "to ground",
+  "video end",
+  "audience applauded",
+  "then shovel snow",
+  "jump into sand pit",
+  "talk to passer by",
+  "run up",
+  "serve",
+  "take ball",
+  "go on court",
+  "introduction of products",
+  "drink drink",
+  "clean horse",
+  "lens shaky",
+  "end of demonstration",
+  "wash hair",
+  "start",
+  "add ice",
+  "prepare for start",
+  "shedding rope",
+  "check rope",
+  "take skates",
+  "get ready",
+  "coach guidance",
+  "referee throws the ball",
+  "cat peppermint",
+  "prepare starting",
+  "shoulder shoulder",
+  "showcase wine",
+  "blower",
+  "blow up drum",
+  "squat",
+  "almost hit by car",
+  "production process",
+  "button button",
+  "be interviewed",
+  "go out",
+  "down",
+  "all down",
+  "christmas tree fell down",
+  "hole",
+  "diver",
+  "plain white cloth",
+  "national flag",
+  "hair drier",
+  "white woman",
+  "wall column",
+  "visual acuity chart",
+  "shovel",
+  "wall",
+  "sofa",
+  "living room",
+  "tree",
+  "grassland",
+  "iron block",
+  "house",
+  "automobile",
+  "mirror",
+  "ground",
+  "billboard",
+  "enclosure",
+  "television",
+  "window",
+  "blue plastic film",
+  "hair stylist",
+  "wooden doors",
+  "small house",
+  "table",
+  "shelf",
+  "coloured lights",
+  "small window",
+  "fountain",
+  "audience",
+  "glass",
+  "blue screen",
+  "spectator seats",
+  "guardrail net",
+  "screen",
+  "cabinet",
+  "kitchen",
+  "wood",
+  "door",
+  "vehicle",
+  "christmas tree",
+  "chair",
+  "yellow bag",
+  "glass cabinet",
+  "two pieces of exhibition board",
+  "fish",
+  "goods shelves",
+  "treadmill",
+  "window curtains",
+  "snow mountain",
+  "mountain",
+  "car",
+  "stand",
+  "practice basketball",
+  "experimentalize",
+  "guitar",
+  "table football",
+  "bread",
+  "rope",
+  "men in white",
+  "microphone",
+  "poker",
+  "computer",
+  "disk",
+  "judges",
+  "ball net",
+  "drum kit",
+  "cobra",
+  "ball table",
+  "stone",
+  "sand",
+  "shoal of fish",
+  "white coat man",
+  "pool",
+  "road",
+  "water",
+  "tennis table",
+  "railing",
+  "tea table",
+  "cat",
+  "flower receptacle",
+  "fire pile",
+  "men in blue",
+  "cake",
+  "bowling pin",
+  "floor",
+  "hillside",
+  "white man",
+  "ironing table",
+  "basketball box",
+  "man in blue",
+  "table tennis table",
+  "food",
+  "network",
+  "red clothes",
+  "cruise",
+  "light pole",
+  "sound",
+  "agitation machine",
+  "man in camouflage",
+  "kitchen sink",
+  "two cattles",
+  "court",
+  "football field",
+  "man in white",
+  "wine bottle",
+  "basket",
+  "piano",
+  "beer",
+  "timer",
+  "counter",
+  "wash basin",
+  "bottle",
+  "ball",
+  "wrist watch",
+  "trash",
+  "steel pipe",
+  "model",
+  "row of tires",
+  "bunker",
+  "switch",
+  "stool",
+  "swimming pool",
+  "bracelet",
+  "spoon",
+  "milk",
+  "knife",
+  "surround masses",
+  "auditorium",
+  "silverware cadets",
+  "piano frame",
+  "fence",
+  "man in a suit",
+  "table and chair",
+  "horse",
+  "guardrail",
+  "referee",
+  "glass table",
+  "sea",
+  "wine glass",
+  "material science",
+  "seasoning",
+  "card",
+  "ice cream",
+  "tool",
+  "bowl",
+  "down by sally gardens",
+  "spray painting",
+  "club signs",
+  "help man shave",
+  "relax",
+  "decompression",
+  "physical fitness",
+  "exercise muscles",
+  "relieve stress",
+  "laugh",
+  "put on shoes",
+  "lingual nail",
+  "cut roof tin",
+  "gymnastic mat",
+  "siberian husky",
+  "poodle",
+  "cooking noodles",
+  "make egg roll",
+  "collar exercises",
+  "transparent",
+  "much much safer",
+  "much higher",
+  "christmas",
+  "s",
+  "e",
+  "exercise bike",
+  "skate",
+  "lawn",
+  "cement floor",
+  "horizontal bar",
+  "surf board",
+  "circular site",
+  "woman in white",
+  "water basin",
+  "trampoline",
+  "rim",
+  "volleyball court",
+  "blue dress man",
+  "garden",
+  "carrot",
+  "pads",
+  "vacuum cleaner",
+  "cooking bench",
+  "notes",
+  "juicer",
+  "blanket",
+  "bathing pool",
+  "curling runway",
+  "wheel",
+  "bar counter",
+  "protective net",
+  "opponent",
+  "contestant",
+  "hopping viaduct",
+  "pool table",
+  "curly hair",
+  "pet dog",
+  "hairstylist",
+  "coach",
+  "nail polish",
+  "make up",
+  "take exercise",
+  "play darts",
+  "water spray",
+  "tell about",
+  "phone",
+  "gymnastics competition",
+  "show turns eye",
+  "guard net",
+  "ski",
+  "drawing board",
+  "welding",
+  "buoy",
+  "watch",
+  "hair hoop",
+  "headset",
+  "flying discses",
+  "gambling",
+  "explain games rule",
+  "high",
+  "security",
+  "danger",
+  "volleyball",
+  "long distance run",
+  "twenty one points",
+  "tennis sports",
+  "ice hockey movement",
+  "parallel bars movement",
+  "three step jump",
+  "ice hockey",
+  "705",
+  "mouthpiece",
+  "budweiser",
+  "wrench",
+  "ceiling",
+  "sky",
+  "parallel bars exercise",
+  "boxing match",
+  "wiping cupboard",
+  "seal carpets",
+  "poweriser",
+  "hair rope",
+  "wash socks",
+  "toilet paper",
+  "braided hair",
+  "pastry master",
+  "charlie adams",
+  "decorator",
+  "colleague",
+  "partner",
+  "master and servant",
+  "mother and daughter",
+  "teammate",
+  "whole family",
+  "family",
+  "spouse",
+  "smoke friend",
+  "doctor patient relationship",
+  "teaching relationship",
+  "father and son",
+  "father and daughter",
+  "lovers",
+  "classmate",
+  "dancing partner",
+  "video player",
+  "row of fences",
+  "lower",
+  "low grade",
+  "higher",
+  "much more dangerous",
+  "bit safe",
+  "protect head",
+  "commonly",
+  "winter",
+  "circular",
+  "cylinder",
+  "trapezoid",
+  "square",
+  "rectangle",
+  "t shirt",
+  "long",
+  "snow",
+  "roof",
+  "runway",
+  "introduction equipment",
+  "clear",
+  "preside over program",
+  "hand drums",
+  "fitness equipment",
+  "archery",
+  "relatives",
+  "in repair",
+  "practice ball",
+  "make biscuits",
+  "how to wear contact lenses",
+  "introduction of cutting tools",
+  "how to set up cars",
+  "billiard game",
+  "rules of playing cards",
+  "food program",
+  "yoga mat",
+  "coaster",
+  "board",
+  "paper",
+  "lake",
+  "stage",
+  "inverted reflection in water",
+  "rubber runway",
+  "hovercraft",
+  "necklace",
+  "labrador",
+  "short",
+  "blue jeans",
+  "jeans",
+  "jump",
+  "jumping project",
+  "gym shoes",
+  "boots",
+  "artistic gymnastics",
+  "night",
+  "bar",
+  "home",
+  "basketball court",
+  "gym",
+  "hockey field",
+  "suburb",
+  "barber shop",
+  "beach",
+  "natatorium",
+  "casino",
+  "in street",
+  "snow land",
+  "bedroom",
+  "grass",
+  "parking lot",
+  "toilet",
+  "studio",
+  "park",
+  "hall",
+  "dancing room",
+  "playground",
+  "street",
+  "hotel",
+  "tattoo shop",
+  "shower room",
+  "sandy beach",
+  "room",
+  "rooftop",
+  "cave",
+  "stadium",
+  "plain",
+  "desk",
+  "ski field",
+  "tennis court",
+  "baseball field",
+  "lakeside",
+  "laboratory",
+  "church",
+  "amusement park",
+  "racetrack",
+  "horse yard",
+  "training ground",
+  "supermarket",
+  "office",
+  "market",
+  "tennis room",
+  "bowling room",
+  "manicure shop",
+  "game place",
+  "seaside",
+  "aquatic",
+  "chess and card room",
+  "left",
+  "left hand",
+  "black man",
+  "stimulate",
+  "reduce drag",
+  "give yourself inertia",
+  "increase jump distance",
+  "poor balance",
+  "for beauty",
+  "avoid injury",
+  "for sake of beauty",
+  "beautiful",
+  "anti damage",
+  "avoid hurt somenon",
+  "beauty",
+  "keep health",
+  "entertainment",
+  "convenient",
+  "wonderful performance",
+  "prevent drowning",
+  "woman",
+  "cylindrical",
+  "triangle",
+  "ellipse",
+  "round",
+  "dragon boat",
+  "hard to catch fire",
+  "2 people",
+  "skiing air cushion",
+  "sedan",
+  "good",
+  "wooden house",
+  "scuba diving",
+  "sunglasses",
+  "drifting",
+  "braid of double twist",
+  "out of flatness",
+  "slipper",
+  "stripe",
+  "suit",
+  "lattice",
+  "cleanser",
+  "cutting machine",
+  "air conditioner",
+  "two little funny birds",
+  "sapling",
+  "blonde",
+  "spider man toy",
+  "vest",
+  "tango",
+  "villa",
+  "wooden boat",
+  "springboard",
+  "slide",
+  "cardboard",
+  "protective pad",
+  "tigers head",
+  "plastic bag",
+  "onion",
+  "introduction of bullfighting",
+  "painting",
+  "game",
+  "take step",
+  "do aerobics",
+  "learn diving",
+  "soup spoon",
+  "english",
+  "banana",
+  "sax",
+  "workbench",
+  "drifting air cushion",
+  "pedal",
+  "bridge frame",
+  "sponge pad",
+  "child seat",
+  "bicycle",
+  "earth",
+  "seaweed",
+  "sealing layer",
+  "marble",
+  "bed",
+  "lifting jack",
+  "brand",
+  "sailboat",
+  "ponytail",
+  "dress",
+  "take deep breath",
+  "pommeled horse",
+  "shoulder hair",
+  "butterfly",
+  "peaked cap",
+  "sweet canister",
+  "new type lawn mower",
+  "fixator",
+  "scientific research ship",
+  "sandals",
+  "leather boots",
+  "blue shorts",
+  "funny video",
+  "b",
+  "pvc material",
+  "roast fish",
+  "basketball match",
+  "sitting",
+  "summer",
+  "cheerleading",
+  "spider",
+  "evening",
+  "afternoon",
+  "axe",
+  "movement type",
+  "racing moto",
+  "remote control boat",
+  "2012",
+  "halloween",
+  "casinos",
+  "carpet",
+  "track",
+  "field",
+  "park path",
+  "bread machine",
+  "yard",
+  "workshop",
+  "meadow",
+  "circle",
+  "railway bridge",
+  "river",
+  "restaurant",
+  "doorway",
+  "opera",
+  "track and field ground",
+  "sports meeting",
+  "subway",
+  "bullring",
+  "head",
+  "cudgel court",
+  "bathroom",
+  "bicycle shop",
+  "guide room",
+  "prairie track",
+  "snowfield",
+  "camel",
+  "seabed",
+  "aged",
+  "tropic",
+  "japan",
+  "srb",
+  "left eye",
+  "right eye",
+  "thumb",
+  "ones right hand",
+  "right",
+  "right hand",
+  "ger",
+  "leather gloves",
+  "second dives",
+  "left foot",
+  "her father",
+  "india",
+  "silverware",
+  "red jersey",
+  "blonde man",
+  "ring finger",
+  "catch fish",
+  "dive",
+  "pregnant",
+  "to throw far away",
+  "motion",
+  "sparring",
+  "to throw shot",
+  "be happy with drum",
+  "happy",
+  "get rid of blues",
+  "in washing up",
+  "some water on hands",
+  "wear jump shoes",
+  "in fight",
+  "someones good at skipping rope",
+  "for entertainment",
+  "win",
+  "easy to shave",
+  "make darts",
+  "labor saving",
+  "darts are easy to targets",
+  "express happiness",
+  "score",
+  "win in gambling",
+  "be covered by people",
+  "blindfolded eyes",
+  "to challenge themselves",
+  "prevent skidding",
+  "express joy",
+  "speed up javelin",
+  "more beautiful",
+  "give discus inertia",
+  "to express courtesy",
+  "guide children",
+  "blower is too heavy",
+  "greeting",
+  "cold prevention",
+  "end",
+  "be more beautiful",
+  "decorating house",
+  "trees are not neat",
+  "protect",
+  "protect eye",
+  "hungry",
+  "avoid messing",
+  "thanks to audience",
+  "easy eat",
+  "arm force",
+  "take part",
+  "do public welfare",
+  "reduce friction",
+  "increase difficulty",
+  "keep healthy",
+  "burning fire",
+  "convenient weeding",
+  "express victory",
+  "to dive",
+  "scoop water",
+  "record results",
+  "speed up",
+  "jump far",
+  "unsteadiness of gravity",
+  "curettage",
+  "protect waist",
+  "eatting",
+  "make car light",
+  "make manicure easier",
+  "squeezing apple",
+  "test light bulb",
+  "car dirty",
+  "mouth leaking",
+  "clothing leveling",
+  "be attacked",
+  "interest",
+  "easy blow",
+  "fixed thread",
+  "farther of the jump",
+  "angry",
+  "have personality",
+  "avoid freezing hands",
+  "protect teeth",
+  "pick up speed",
+  "safety guarantee",
+  "you can jump farther",
+  "others car broke down",
+  "because long jump",
+  "to blow sax",
+  "put on hat",
+  "reduce center of gravity",
+  "hamper ball",
+  "decorate",
+  "acne treatment",
+  "encouragement",
+  "balance",
+  "challenging",
+  "bullet dodging",
+  "interesting",
+  "cheerful mood",
+  "bring eggs together",
+  "imbalance of body",
+  "satisfy greed",
+  "thank",
+  "put child down",
+  "drink green liquid",
+  "let people pass through",
+  "pursuit of stimulation",
+  "better growth",
+  "excitement",
+  "to win",
+  "ball inertia",
+  "halloween decoration",
+  "active atmosphere",
+  "keep clean",
+  "protect their heads",
+  "good growth of plants",
+  "almost power",
+  "maintain level",
+  "dinner",
+  "encourage each other",
+  "increase speed of shot",
+  "irritation",
+  "tire broken",
+  "work",
+  "art of display",
+  "healthy",
+  "prevent cramps",
+  "fear tree be frozen",
+  "convenient engraving",
+  "clean",
+  "ranking",
+  "teaching video",
+  "dress up",
+  "test",
+  "record video",
+  "video recording",
+  "improve skills",
+  "recording time",
+  "stick wallpaper",
+  "accident prevention",
+  "clean and beautiful",
+  "convenient imitation",
+  "convenient viewing",
+  "greeting new year",
+  "beautify environment",
+  "well done",
+  "art",
+  "keep weight",
+  "prevent being injury",
+  "to be beautiful",
+  "sport project",
+  "competition",
+  "make cake tasty",
+  "bowl is dirty",
+  "for sunscreen",
+  "for safety",
+  "dirty",
+  "filter out water",
+  "grass is too high",
+  "anger",
+  "to express comfort",
+  "game lost",
+  "strengthen proficiency",
+  "record performance",
+  "fear of hindering people",
+  "curling",
+  "challenge stimulus",
+  "tired out",
+  "more practice",
+  "inertia",
+  "end performance",
+  "brain is damaged",
+  "guessing victory",
+  "escape",
+  "because of smoking",
+  "because of tattoo pain",
+  "large density",
+  "strong body",
+  "because its done",
+  "experience",
+  "goal",
+  "celebrate victory",
+  "express excitement",
+  "because you want to dive",
+  "fixed ceramic tile",
+  "easy understand",
+  "avoid enzyme change",
+  "for eatting",
+  "convenient for walking",
+  "prevent fall",
+  "accelerate",
+  "air rotation",
+  "back somersault",
+  "take oil",
+  "table collapsed",
+  "assembled rear tire",
+  "tie up hair",
+  "open your hands",
+  "shave sideburns",
+  "upside up",
+  "turn head",
+  "step back",
+  "chilli powder",
+  "stride left foot",
+  "straighten back",
+  "cut off",
+  "anjali mudra",
+  "look at effect",
+  "active wrist",
+  "toys have been raised",
+  "waving",
+  "teaching by step",
+  "pulp",
+  "grasp rope",
+  "be pulled down",
+  "nearly fall",
+  "hand in hand",
+  "stand up",
+  "wave",
+  "pine",
+  "return to place",
+  "fell to ground",
+  "wear sunglasses",
+  "skiing down hill",
+  "wash dishes",
+  "and put it back again",
+  "fall on air cushion",
+  "lose ball",
+  "close scissors",
+  "changing right hand",
+  "rotary handlebar",
+  "turn on light",
+  "put sweet potato in bowls",
+  "electrified",
+  "vomit",
+  "fall",
+  "help him cut his hair",
+  "fall into water",
+  "bend",
+  "stretch leg",
+  "take off gloves",
+  "put into water tank",
+  "start dancing",
+  "well cooked",
+  "unable to stand firmly",
+  "slide down slope",
+  "put on leg",
+  "black pants swimmers",
+  "go on running",
+  "kicking doll",
+  "be cuddle",
+  "go out of room",
+  "puff out smoke",
+  "start of the game",
+  "control by foot",
+  "celebrate",
+  "pick up",
+  "stir",
+  "tissue",
+  "bartender",
+  "automative lighting",
+  "audiences stand",
+  "star shaped toy",
+  "water sled",
+  "red cloth",
+  "curtain",
+  "safety net",
+  "pumpkin shed",
+  "oxygen bottle",
+  "big bottle",
+  "spider man",
+  "desert",
+  "personal information",
+  "painted wall",
+  "red curtain",
+  "both hands",
+  "lot of boxes",
+  "steps",
+  "bookcase",
+  "oven",
+  "shed",
+  "telegraph pole",
+  "ornament",
+  "white walls",
+  "performers",
+  "dresser",
+  "water tap",
+  "magic cube",
+  "pot",
+  "toaster oven",
+  "tennis",
+  "bullfighting field",
+  "bet",
+  "grinder machine",
+  "scoreboard",
+  "machine",
+  "pull rope",
+  "launch",
+  "boiler",
+  "potted plant",
+  "photo",
+  "minions",
+  "ring",
+  "blue sneakers",
+  "big tree",
+  "refrigerator",
+  "line",
+  "yellow paint",
+  "iron chain",
+  "patch",
+  "glasses",
+  "dolly",
+  "hu la hoop",
+  "safety hat",
+  "plastic flowers",
+  "lamp",
+  "curling brush",
+  "bushes",
+  "long hair man",
+  "child",
+  "saw",
+  "plate",
+  "yellow basin",
+  "men in black",
+  "street lamp",
+  "spare tire",
+  "tattoo artist",
+  "dry powder plate",
+  "black hair woman",
+  "barbell",
+  "earring",
+  "parallel bar",
+  "large building",
+  "ceramic tile",
+  "person wearing a hat",
+  "pond",
+  "row of clothes",
+  "iron",
+  "masonry",
+  "ladder",
+  "cameraman",
+  "canvas",
+  "shrimp meat",
+  "hand washing box",
+  "awning",
+  "stairs",
+  "american flag",
+  "socket",
+  "cosmetics",
+  "clothes",
+  "romaine lettuce",
+  "make body pliable",
+  "rabbit",
+  "red baffle",
+  "horse running yard",
+  "personnel",
+  "banana tree",
+  "british flag",
+  "very excited",
+  "red ball",
+  "shore",
+  "release pressure",
+  "purify air",
+  "exercise balance",
+  "9",
+  "exercise",
+  "make report",
+  "yelling",
+  "eradicator",
+  "mat cloth",
+  "chopping block",
+  "stone and water grass",
+  "seawater",
+  "crew cut",
+  "cold",
+  "red and black",
+  "wash milk",
+  "height",
+  "clockwise",
+  "anti clockwise",
+  "forward",
+  "up and down",
+  "front",
+  "5000 meters",
+  "oxygen mask",
+  "blue shirt",
+  "very good",
+  "supply lemon juice",
+  "clean and tidy",
+  "fixed hairstyle",
+  "straw",
+  "front rollover",
+  "cry sadly",
+  "very angry",
+  "smile",
+  "more fair",
+  "its pretty",
+  "eager",
+  "fat",
+  "a",
+  "w",
+  "t",
+  "c",
+  "cucumber",
+  "woodiness",
+  "mustache",
+  "chopped wood",
+  "ball frame",
+  "target",
+  "gambling table",
+  "bag",
+  "cross bar",
+  "equip them with modern machines",
+  "rock",
+  "craft",
+  "wood pile",
+  "protective effect",
+  "do horizontal bar gymnastics",
+  "do homework",
+  "main committee",
+  "origami",
+  "asymmetrical bars",
+  "bob",
+  "shoulder length hair",
+  "single cauda",
+  "short hair",
+  "lime",
+  "electromagnetic furnace",
+  "clerk",
+  "athletes",
+  "reporter",
+  "p",
+  "pumpkin",
+  "matador",
+  "barber",
+  "cooker",
+  "fitness coach",
+  "decorating umbrella",
+  "basin",
+  "group of children",
+  "twinkle",
+  "gymnastic performance",
+  "beautiful arched eyebrows",
+  "hosting news",
+  "variety of wine",
+  "sandwich",
+  "skateboard of water skiing",
+  "sing",
+  "hold girl",
+  "exercise weight lifting",
+  "mowing lawn",
+  "drift",
+  "shoeshine",
+  "cross country race",
+  "women in red clothes",
+  "rag",
+  "ride cross country motorcycle",
+  "bullfight",
+  "broadcast",
+  "black hair girl",
+  "1154",
+  "female host",
+  "how to cut potatoes",
+  "very pleased",
+  "ballet dancer",
+  "long rod",
+  "four thousand and six hundred",
+  "first",
+  "korea",
+  "walk",
+  "recreational machines",
+  "29",
+  "1951",
+  "330",
+  "1144",
+  "knock toy",
+  "spinning",
+  "mans head",
+  "cut tree",
+  "standing",
+  "party",
+  "brush",
+  "practice water sled",
+  "package book",
+  "ski slope",
+  "teach hair",
+  "camera",
+  "apple juice",
+  "short hair baby",
+  "half squat",
+  "hold right leg",
+  "squatting",
+  "lead singer",
+  "doctor",
+  "trainer",
+  "safe of car",
+  "life buoy",
+  "sandbag",
+  "woven goods",
+  "pepper",
+  "companion",
+  "partnership",
+  "stranger",
+  "competitor",
+  "shopkeepers and shopkeepers",
+  "brother and sister",
+  "shopkeeper and customer",
+  "masses and journalists",
+  "boss and the customer",
+  "referee and player",
+  "row of cars",
+  "hemp rope",
+  "forest",
+  "green brush",
+  "casement window",
+  "black hat man",
+  "poisoning",
+  "much much higher",
+  "much safer",
+  "removal of bubbles",
+  "fixed hair",
+  "natural lashes",
+  "control direction",
+  "10",
+  "floating ice",
+  "colorful",
+  "red and white",
+  "silver",
+  "ballet",
+  "take off shoes",
+  "lift her up",
+  "shaving cutter head",
+  "scissors",
+  "carding",
+  "take off long rope",
+  "round to other side",
+  "stop",
+  "hand ring",
+  "photo display",
+  "pull dog down",
+  "two circles of rotation",
+  "rolling skin",
+  "feed bread",
+  "drumstick",
+  "pocket knife",
+  "take off your shirt",
+  "appearance of name",
+  "press ceramic tile",
+  "flexibility",
+  "team consciousness",
+  "throw away tissue",
+  "scarf",
+  "basketball",
+  "hairpin",
+  "branches and leaves fell",
+  "release white smoke",
+  "rugby",
+  "prayers",
+  "make phone",
+  "gymnastics suit",
+  "right handed rod",
+  "lift truck",
+  "unload barbell",
+  "feathered pat",
+  "physical energy",
+  "plug igniter",
+  "pen",
+  "straw hat",
+  "training skills",
+  "harmonica",
+  "plastic track",
+  "flat shovel",
+  "enjoy",
+  "life jacket",
+  "pine soil",
+  "movement of hand",
+  "wristwatch",
+  "hands on ability",
+  "needle",
+  "obstacle",
+  "vaulting horse",
+  "podium",
+  "yellow ironing cloth",
+  "achievement is effective",
+  "coordination",
+  "man in green",
+  "avoid opponents ball",
+  "wear ballet shoes",
+  "clap hands",
+  "hold together",
+  "continue down",
+  "embrace",
+  "shoeshine surface",
+  "black container",
+  "put cup down",
+  "vertical thumb",
+  "dog catches the plate",
+  "then training",
+  "running",
+  "clapping",
+  "listen to coach",
+  "twist and a smile",
+  "pinch with perturbed",
+  "fixed with adhesive paper",
+  "boy comes down",
+  "snow fall",
+  "do bike ride",
+  "mans hand",
+  "ball goal net",
+  "need some help to correct",
+  "cup was removed",
+  "ball is bounced back",
+  "camel came over",
+  "pulverized people lead",
+  "fly back to roof",
+  "stop timing",
+  "blow dry",
+  "play with toys",
+  "change tire",
+  "fixed tire",
+  "remove rubber ring",
+  "drink milk tea",
+  "go to black dog",
+  "bow",
+  "shoelace",
+  "powdery base",
+  "cup fell into water",
+  "blue team serve",
+  "get home",
+  "blood flow",
+  "car scrubbing",
+  "take out chips",
+  "smooth and smooth",
+  "treatment of chili",
+  "start training",
+  "discus collision",
+  "perm hand",
+  "tutoring cadets",
+  "eat ball",
+  "dog planer",
+  "wine precipitation",
+  "start skiing",
+  "wash ones face",
+  "clenching fist",
+  "assemble cake",
+  "clip eyelash",
+  "push people",
+  "brace with feet",
+  "take off faucet",
+  "grab ball",
+  "lay by small tree",
+  "stop roller skating",
+  "do edge decoration",
+  "nail clipper",
+  "look at blade",
+  "evade automatically",
+  "snowball",
+  "slide out of site",
+  "punching",
+  "open ones eyes",
+  "go down table",
+  "blonde womens car wash",
+  "splash",
+  "look computer desktop",
+  "board stands up",
+  "inserting needle",
+  "keep shaving",
+  "leave",
+  "take out gun",
+  "turn around",
+  "be pulled by ship",
+  "tapping bucket",
+  "cilping some lemon slices",
+  "bow to lens",
+  "eat up",
+  "tie shoelaces",
+  "throw ball out",
+  "shoes getting clean",
+  "throw it to wall",
+  "regulating nozzle",
+  "start tug of war",
+  "frying eggs",
+  "adhesive tape",
+  "blow up",
+  "go back to walk",
+  "shot switching",
+  "continue to grind knife",
+  "hit each other",
+  "hand down",
+  "grey team celebrates",
+  "draw rubber rafts",
+  "push left dolls",
+  "dog ran away",
+  "wait for",
+  "speak",
+  "block gliding",
+  "walk away",
+  "put slice of lemon",
+  "look javelin distance",
+  "cooling",
+  "eliminated",
+  "smoothly",
+  "water is flooded",
+  "get up to pay",
+  "change pair of shoes",
+  "go ashore",
+  "flushing razor",
+  "emotional excitement",
+  "throw snow on edge",
+  "pour out washing powder",
+  "start bungee jumping",
+  "takeoff",
+  "packing",
+  "got third",
+  "darkened",
+  "drink water",
+  "caterpillar wallpaper",
+  "lose match",
+  "cow fell down",
+  "inside car",
+  "lid pressure",
+  "throw barbell",
+  "screwing nut",
+  "fabric line",
+  "hold child",
+  "cook noodles",
+  "continue to split wood",
+  "clean horses hind leg",
+  "very happy",
+  "man jump down",
+  "cutting wallpaper",
+  "wear good left eye",
+  "start jumping",
+  "punches",
+  "speech",
+  "exhibition",
+  "garden cart",
+  "dismantling light bulb",
+  "go to diver",
+  "step by step",
+  "sit down on your knees",
+  "preparation materials",
+  "get off car",
+  "let curling round legs",
+  "wear gloves",
+  "fermation",
+  "shaving cutter",
+  "put down saw",
+  "pony fell down",
+  "raise ones hands",
+  "turn two laps",
+  "open ones mouth",
+  "explanation usage",
+  "pull hand brake",
+  "floating on water",
+  "open sound",
+  "cut cake",
+  "air pressure",
+  "go to woman",
+  "introduce",
+  "prepare weapon",
+  "sweep floor",
+  "prepare bar",
+  "dipping pigment",
+  "count",
+  "pull hands back",
+  "tidy wire",
+  "hold harmonica",
+  "floating in water",
+  "bet on",
+  "introduction of pang ball",
+  "display ornaments",
+  "explanation skills",
+  "stand straight",
+  "erect canvas",
+  "dipping shoe oil",
+  "fooling around",
+  "measuring distance",
+  "display tools",
+  "nail nail",
+  "rotate",
+  "pre match preparation",
+  "wearing frog shoe",
+  "pose",
+  "smear shaving blister",
+  "send message",
+  "select curler",
+  "stake",
+  "prepare tools",
+  "cement",
+  "become biscuit",
+  "display product",
+  "display front door",
+  "showcase",
+  "water preparation",
+  "open mouth and sing",
+  "fall down",
+  "go fishing",
+  "gargle with fat man",
+  "spray liquid",
+  "on tree",
+  "cobra attack",
+  "feet shaking",
+  "bubble",
+  "use long stick",
+  "with boat overhead",
+  "camel stopped",
+  "replacement",
+  "prepare pigments",
+  "raise your right hand",
+  "blow out cigarette smoke",
+  "hand clenched fist",
+  "support body with two hands",
+  "frictional runway",
+  "scraping",
+  "clap with your teammates",
+  "talk to others",
+  "dry it",
+  "hand over nose",
+  "turn to landscape",
+  "horst hula hoop",
+  "ball hit the knee",
+  "steady and steady",
+  "swing",
+  "release pole",
+  "press button",
+  "wear glasses",
+  "start pruning trees",
+  "roll forward",
+  "spray cleaning solution",
+  "middle of tyres",
+  "selfie",
+  "tease hair",
+  "talk to screen",
+  "standing bow",
+  "climb up",
+  "stalemate",
+  "take down darts",
+  "high five",
+  "tidy",
+  "put down magic cube",
+  "scratching nose",
+  "dispel crowd",
+  "toothpaste",
+  "waving hand",
+  "bright smile",
+  "dogs before him",
+  "fixed toy",
+  "low down barycenter",
+  "warm up exercise",
+  "run forward",
+  "group photo",
+  "throw drumbeat",
+  "land on ground",
+  "lens conversion",
+  "satisfactory smile",
+  "erect violin",
+  "handshake",
+  "orange clothes male clenched fist",
+  "open",
+  "add two kites",
+  "weigh it down",
+  "kite flew high",
+  "pull off tires",
+  "black clothes people waved",
+  "test ball",
+  "rush to red cloth",
+  "hit himself",
+  "get blow out",
+  "kick off cushion",
+  "fish bite",
+  "bite frisbee",
+  "lift dog",
+  "applause",
+  "children shook hands",
+  "handstand",
+  "shake hand",
+  "picture change",
+  "try again",
+  "clap hand to celebrate",
+  "role pause",
+  "do somersault",
+  "pirouette",
+  "raise hand",
+  "carry out performance",
+  "pendulum poss",
+  "eat noodles",
+  "dry powder",
+  "drive to right",
+  "pick up phone",
+  "weeds are gone",
+  "right man stands up",
+  "stop recording",
+  "amuse dog",
+  "adults chipping potatoes",
+  "loose soil",
+  "trot",
+  "ship opened",
+  "put in plate",
+  "jump into pool",
+  "string string",
+  "draw on pumpkin",
+  "white male nodding",
+  "height of grass",
+  "pass ball",
+  "stop rowing",
+  "single hand support",
+  "hold down",
+  "go to machine",
+  "set up shelf",
+  "start weeding",
+  "wear hoop",
+  "tune stringed instrument",
+  "tape",
+  "be blocked",
+  "assembly of steel",
+  "relax with mobile phone",
+  "horse is moving",
+  "swivel",
+  "big mother",
+  "turn and half squat down",
+  "piled to corner",
+  "audience cheered",
+  "others come on field",
+  "lens deflection",
+  "put ball",
+  "in dialogue",
+  "aim",
+  "go through last",
+  "fist",
+  "putting camera",
+  "wiping car with foam",
+  "dance together",
+  "grasp parallel bars",
+  "squat down",
+  "shut down machine",
+  "climb stairs",
+  "washing spoon",
+  "dive preparation",
+  "touch lens",
+  "shake table",
+  "be smashed in drum",
+  "anchor jump rope",
+  "hit masses",
+  "see undersea biology",
+  "fall into river",
+  "cut sandwiches",
+  "horse nodding",
+  "put down flute",
+  "wash face",
+  "put on bread",
+  "display shoes",
+  "touch liquid",
+  "immerse in it",
+  "sit on sofa",
+  "leave court",
+  "to everyone ju gong",
+  "display finger",
+  "throw out again",
+  "turn picture back",
+  "middle finger",
+  "pressing on player",
+  "pick up racket",
+  "get up slowly",
+  "pick up cup",
+  "lose to opponent",
+  "backward dodge",
+  "concealed cards",
+  "get first",
+  "roll in sky",
+  "start car wash",
+  "grab railing",
+  "twist butt",
+  "demo bicycles",
+  "throw long pole",
+  "lift barbell",
+  "cross leg",
+  "wear socks",
+  "wipe hands",
+  "site",
+  "hold black clothes man",
+  "hit ball",
+  "kick ball out",
+  "skater surfers",
+  "continue to weave sweater",
+  "put cards in box",
+  "fiddle with hair",
+  "go away",
+  "refresh",
+  "psychological adjustment",
+  "join noodles",
+  "fall off stage",
+  "skidding",
+  "stir by spatula",
+  "someone is parachuting",
+  "fall off air cushion",
+  "finish up job",
+  "chasing people",
+  "spaghetti",
+  "bump into pot",
+  "continue knocking",
+  "ask for note",
+  "bow hairstyle",
+  "two people dance",
+  "cross hand",
+  "ball landed",
+  "dog chases plate",
+  "dog came back with something",
+  "be gone",
+  "attack",
+  "swing rod",
+  "raise your left hand",
+  "plaything",
+  "touch ones head",
+  "stand on bench",
+  "rouge lips",
+  "original takeoff",
+  "rinse",
+  "straighten rod",
+  "ironing sleeves",
+  "squating",
+  "start riding",
+  "open mouth ventilation",
+  "support yourself",
+  "dark bigger",
+  "light",
+  "mice",
+  "smear oil",
+  "lacquer",
+  "coated hair cream",
+  "throw away",
+  "dog jumped up",
+  "counterclockwise running",
+  "back to lens",
+  "push leaves off",
+  "shallow sea",
+  "long hair",
+  "straight",
+  "short sleeve",
+  "double fold eyelids",
+  "trousers",
+  "ripe",
+  "older",
+  "long sleeves",
+  "take off skirt",
+  "white men fall in water",
+  "marvellous",
+  "leg clip",
+  "tortoise",
+  "vigny",
+  "play drifting air cushion",
+  "parade in desert",
+  "wood brush",
+  "pulley slippery",
+  "musical instrument",
+  "noodle",
+  "skating canoe",
+  "diving in sea",
+  "street performance",
+  "street dance",
+  "tackle rubber rafts",
+  "dunk",
+  "on toilet",
+  "prepare for skiing",
+  "climb escalator",
+  "polo",
+  "uneven bars",
+  "play lacrosse",
+  "pruning shrubs",
+  "clean up garbage",
+  "sports competition",
+  "sliding skateboard",
+  "good afforested environment",
+  "displacement pressure",
+  "braid",
+  "play toy",
+  "chips",
+  "discus",
+  "air cushion",
+  "operation guidance",
+  "riding unicycle",
+  "rt",
+  "bids gun",
+  "gatekeepers",
+  "fork",
+  "paring knife",
+  "touch back of brain",
+  "shave dog",
+  "practice fencing",
+  "fencing training",
+  "press media",
+  "fencing",
+  "play bungee jumping",
+  "need to connect wire",
+  "hair of bun",
+  "sports boat",
+  "shoe polisher",
+  "white person",
+  "decorate some boxes",
+  "pet beauty",
+  "practice ballet",
+  "apple",
+  "many people are looking",
+  "shower",
+  "body coordination",
+  "advertisement",
+  "blue and green",
+  "blue and red",
+  "small stool",
+  "motorcycle",
+  "big",
+  "shirt",
+  "tourism advertisement",
+  "kelp",
+  "sea anemone",
+  "autumn",
+  "harmful",
+  "clever",
+  "very nice",
+  "jump down",
+  "narrow",
+  "shadow",
+  "much more difficult",
+  "ten meters",
+  "6 centimeters",
+  "more proficient",
+  "pretty good",
+  "adept",
+  "13",
+  "two",
+  "sixth",
+  "9.75",
+  "350",
+  "walking",
+  "standard",
+  "superb",
+  "torrential",
+  "skilled",
+  "wear exposure",
+  "sing in earnest",
+  "strong",
+  "cool",
+  "high and low",
+  "lovely",
+  "concise and clear",
+  "they hurt",
+  "skating rink",
+  "2011",
+  "usa",
+  "tie",
+  "small scale"
+]

ChatUniVi/eval/questions/video_qa/activitynet_qa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/video_qa/consistency_qa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/video_qa/generic_qa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/video_qa/msrvtt_a_list.json ADDED Viewed

	@@ -0,0 +1,4175 @@

+[
+  "regard",
+  "show",
+  "explain",
+  "search",
+  "provide",
+  "computer",
+  "guy",
+  "something",
+  "someone",
+  "person",
+  "teach",
+  "exclaim",
+  "stand",
+  "learn",
+  "pose",
+  "move",
+  "pain",
+  "video",
+  "color",
+  "paint",
+  "animation",
+  "cartoon",
+  "talk",
+  "sit",
+  "wear",
+  "discus",
+  "man",
+  "room",
+  "health",
+  "suit",
+  "chair",
+  "fun",
+  "studio",
+  "write",
+  "teacher",
+  "chalkboard",
+  "problem",
+  "math",
+  "whiteboard",
+  "lesson",
+  "board",
+  "stage",
+  "girl",
+  "gameshow",
+  "judge",
+  "perform",
+  "female",
+  "laugh",
+  "entertain",
+  "dance",
+  "shout",
+  "friend",
+  "woman",
+  "demonstration",
+  "boy",
+  "song",
+  "male",
+  "sister",
+  "marble",
+  "play",
+  "put",
+  "mix",
+  "decorate",
+  "help",
+  "easter",
+  "egg",
+  "table",
+  "water",
+  "holder",
+  "polish",
+  "game",
+  "earth",
+  "minecraft",
+  "gamer",
+  "nether",
+  "break",
+  "mine",
+  "hammer",
+  "commentate",
+  "peson",
+  "rap",
+  "picture",
+  "voiceover",
+  "flash",
+  "rapper",
+  "two",
+  "change",
+  "throw",
+  "system",
+  "wall",
+  "office",
+  "camera",
+  "create",
+  "prepare",
+  "construct",
+  "craft",
+  "look",
+  "place",
+  "wrap",
+  "art",
+  "project",
+  "clay",
+  "leaf",
+  "dough",
+  "cake",
+  "player",
+  "baseball",
+  "ball",
+  "announcer",
+  "hit",
+  "call",
+  "team",
+  "outfielder",
+  "fashion",
+  "walk",
+  "fasion",
+  "ailse",
+  "runway",
+  "catwalk",
+  "clothing",
+  "display",
+  "pat",
+  "use",
+  "make",
+  "foundation",
+  "lady",
+  "face",
+  "clean",
+  "bath",
+  "tub",
+  "toy",
+  "brush",
+  "cat",
+  "orange",
+  "scrubber",
+  "child",
+  "relax",
+  "swim",
+  "lake",
+  "field",
+  "dres",
+  "beach",
+  "gras",
+  "car",
+  "vehicle",
+  "introduce",
+  "drive",
+  "describe",
+  "review",
+  "movie",
+  "load",
+  "speak",
+  "watch",
+  "begin",
+  "tap",
+  "space",
+  "bench",
+  "glimpse",
+  "screen",
+  "globe",
+  "planet",
+  "world",
+  "explore",
+  "backstreet",
+  "restaurant",
+  "divider",
+  "asium",
+  "china",
+  "city",
+  "band",
+  "road",
+  "singer",
+  "pick",
+  "phone",
+  "telephone",
+  "taunt",
+  "squidward",
+  "spongebob",
+  "character",
+  "scene",
+  "half",
+  "group",
+  "instructor",
+  "floor",
+  "instruction",
+  "ballroom",
+  "clas",
+  "jog",
+  "tell",
+  "exercise",
+  "three",
+  "circle",
+  "mother",
+  "mom",
+  "family",
+  "jogging",
+  "skill",
+  "hardware",
+  "piece",
+  "detail",
+  "mechanic",
+  "item",
+  "demonstrate",
+  "work",
+  "repair",
+  "side",
+  "way",
+  "front",
+  "body",
+  "coach",
+  "athlete",
+  "angle",
+  "activity",
+  "present",
+  "include",
+  "tos",
+  "baby",
+  "hold",
+  "transition",
+  "youtuber",
+  "segment",
+  "individual",
+  "news",
+  "doll",
+  "subject",
+  "selfie",
+  "leg",
+  "shake",
+  "spread",
+  "blue",
+  "lighting",
+  "beam",
+  "warcraft",
+  "garden",
+  "house",
+  "island",
+  "roof",
+  "record",
+  "building",
+  "football",
+  "glas",
+  "spin",
+  "basket",
+  "music",
+  "program",
+  "basketball",
+  "sport",
+  "food",
+  "kitchenthere",
+  "liquid",
+  "fry",
+  "chef",
+  "crab",
+  "kitchen",
+  "boil",
+  "melt",
+  "stir",
+  "model",
+  "diagram",
+  "draw",
+  "party",
+  "bedroom",
+  "eye",
+  "bos",
+  "bigg",
+  "advertisement",
+  "hug",
+  "blink",
+  "google",
+  "narrator",
+  "classy",
+  "driving",
+  "lean",
+  "newsreporter",
+  "cameraman",
+  "photographer",
+  "shoot",
+  "photography",
+  "burn",
+  "alternate",
+  "fire",
+  "pink",
+  "hang",
+  "blow",
+  "visit",
+  "handbag",
+  "horn",
+  "shopping",
+  "market",
+  "bed",
+  "telecast",
+  "conversation",
+  "wife",
+  "lay",
+  "lie",
+  "scold",
+  "shirtles",
+  "lounge",
+  "saurkraut",
+  "sauerkraut",
+  "dish",
+  "plate",
+  "eat",
+  "serve",
+  "stroller",
+  "fold",
+  "push",
+  "sidewalk",
+  "demo",
+  "trolley",
+  "boner",
+  "narrate",
+  "argue",
+  "startle",
+  "princes",
+  "voice",
+  "leah",
+  "confirm",
+  "animalist",
+  "mirror",
+  "test",
+  "experience",
+  "drag",
+  "embrace",
+  "lion",
+  "react",
+  "enjoy",
+  "fly",
+  "nyancat",
+  "rainbow",
+  "machine",
+  "robot",
+  "issue",
+  "equipment",
+  "astronaut",
+  "try",
+  "attempt",
+  "intercept",
+  "pilot",
+  "observe",
+  "negotiate",
+  "operate",
+  "ask",
+  "landing",
+  "eachother",
+  "matthew",
+  "procedure",
+  "spaceship",
+  "cook",
+  "vest",
+  "stove",
+  "distres",
+  "ground",
+  "emergency",
+  "shoplift",
+  "mob",
+  "crowd",
+  "panic",
+  "wolf",
+  "scream",
+  "bear",
+  "interact",
+  "joke",
+  "animal",
+  "monkey",
+  "shirt",
+  "sign",
+  "bird",
+  "hand",
+  "carry",
+  "rest",
+  "meow",
+  "lift",
+  "parrotlove",
+  "parrot",
+  "top",
+  "film",
+  "cage",
+  "rescue",
+  "introduction",
+  "extinguisher",
+  "flame",
+  "float",
+  "aquarium",
+  "tank",
+  "gold",
+  "plant",
+  "fish",
+  "merchandise",
+  "purse",
+  "wallet",
+  "merchandising",
+  "stephen",
+  "london",
+  "reminisce",
+  "advertise",
+  "product",
+  "sale",
+  "desk",
+  "country",
+  "violin",
+  "childhood",
+  "soccer",
+  "involve",
+  "drift",
+  "preform",
+  "applause",
+  "language",
+  "star",
+  "comment",
+  "paper",
+  "pencil",
+  "drawl",
+  "lap",
+  "mouse",
+  "rugby",
+  "rant",
+  "contract",
+  "quarterback",
+  "webcamera",
+  "professor",
+  "college",
+  "cloth",
+  "microphone",
+  "balance",
+  "bounce",
+  "gym",
+  "dvd",
+  "leaflet",
+  "kid",
+  "pamphlet",
+  "event",
+  "sync",
+  "night",
+  "carriage",
+  "adjust",
+  "carrier",
+  "act",
+  "audience",
+  "concert",
+  "sin",
+  "performance",
+  "rock",
+  "webcam",
+  "trailer",
+  "browse",
+  "download",
+  "youtube",
+  "internet",
+  "page",
+  "mp3",
+  "war",
+  "guitar",
+  "tie",
+  "cheer",
+  "coil",
+  "loop",
+  "fishing",
+  "wire",
+  "turkey",
+  "drink",
+  "grill",
+  "meat",
+  "cooking",
+  "sizzle",
+  "tawa",
+  "pot",
+  "rice",
+  "sequence",
+  "stone",
+  "answer",
+  "incident",
+  "cam",
+  "store",
+  "supermarket",
+  "set",
+  "sail",
+  "cruise",
+  "vacation",
+  "sea",
+  "boat",
+  "ship",
+  "sand",
+  "ocean",
+  "landscape",
+  "mountain",
+  "dog",
+  "deputy",
+  "community",
+  "polouse",
+  "contestant",
+  "idol",
+  "panel",
+  "performer",
+  "reality",
+  "advice",
+  "crossbow",
+  "bow",
+  "arrow",
+  "practice",
+  "patrol",
+  "report",
+  "rail",
+  "mention",
+  "read",
+  "swat",
+  "newscaster",
+  "security",
+  "protection",
+  "crimea",
+  "guard",
+  "area",
+  "polymer",
+  "knife",
+  "bit",
+  "hobby",
+  "point",
+  "touch",
+  "design",
+  "artist",
+  "shape",
+  "poke",
+  "matt",
+  "briefly",
+  "damon",
+  "keyboard",
+  "animator",
+  "history",
+  "type",
+  "clipping",
+  "celebrity",
+  "birthday",
+  "smile",
+  "thor",
+  "shooting",
+  "rain",
+  "outdoor",
+  "pull",
+  "lego",
+  "pajama",
+  "clip",
+  "amna",
+  "interview",
+  "conduct",
+  "feature",
+  "choice",
+  "award",
+  "function",
+  "announce",
+  "seat",
+  "roadway",
+  "travel",
+  "window",
+  "sleep",
+  "whistle",
+  "doghouse",
+  "spawn",
+  "setting",
+  "random",
+  "fence",
+  "skating",
+  "teenager",
+  "skateboard",
+  "skate",
+  "text",
+  "spiral",
+  "shoe",
+  "turn",
+  "rotate",
+  "dna",
+  "information",
+  "strand",
+  "junk",
+  "helix",
+  "explanation",
+  "suv",
+  "fur",
+  "horse",
+  "shanium",
+  "head",
+  "rush",
+  "town",
+  "disney",
+  "four",
+  "barbie",
+  "dancing",
+  "diagnose",
+  "share",
+  "industry",
+  "air",
+  "racer",
+  "monitor",
+  "race",
+  "hood",
+  "patrick",
+  "sponge",
+  "snail",
+  "driveway",
+  "street",
+  "hoop",
+  "practise",
+  "sparrow",
+  "box",
+  "contraption",
+  "chirp",
+  "scratch",
+  "climb",
+  "peck",
+  "plot",
+  "mystery",
+  "detective",
+  "investigate",
+  "doorway",
+  "line",
+  "giggle",
+  "culture",
+  "cover",
+  "send",
+  "knock",
+  "broadcast",
+  "neck",
+  "hide",
+  "father",
+  "cliff",
+  "stewie",
+  "griffen",
+  "feel",
+  "stay",
+  "kis",
+  "breast",
+  "chest",
+  "porch",
+  "door",
+  "walkway",
+  "factory",
+  "development",
+  "span",
+  "logo",
+  "vision",
+  "ryanvision",
+  "motion",
+  "newsanchor",
+  "celebrate",
+  "commentator",
+  "steer",
+  "speed",
+  "driver",
+  "traffic",
+  "commentary",
+  "thing",
+  "extension",
+  "cellphone",
+  "mobile",
+  "chat",
+  "message",
+  "creature",
+  "whale",
+  "giant",
+  "surface",
+  "jump",
+  "approach",
+  "emerge",
+  "breach",
+  "dive",
+  "track",
+  "shot",
+  "starting",
+  "start",
+  "runner",
+  "compare",
+  "busines",
+  "campu",
+  "fist",
+  "school",
+  "book",
+  "reading",
+  "classroom",
+  "amazon",
+  "atmosphere",
+  "cave",
+  "farm",
+  "ice",
+  "frost",
+  "wed",
+  "cream",
+  "murray",
+  "bill",
+  "substance",
+  "park",
+  "salesman",
+  "aman",
+  "sell",
+  "lenin",
+  "education",
+  "bowl",
+  "recipe",
+  "pmy",
+  "time",
+  "nobody",
+  "pretty",
+  "hair",
+  "blonde",
+  "stuffed",
+  "femal",
+  "figure",
+  "goat",
+  "stream",
+  "mod",
+  "spa",
+  "rub",
+  "massage",
+  "wash",
+  "milk",
+  "economy",
+  "leader",
+  "money",
+  "raise",
+  "crawl",
+  "slide",
+  "fall",
+  "stair",
+  "cowboy",
+  "bacon",
+  "barbecue",
+  "flower",
+  "cheif",
+  "meal",
+  "kithen",
+  "treat",
+  "preview",
+  "dubstep",
+  "wave",
+  "view",
+  "background",
+  "batting",
+  "batsman",
+  "tsunami",
+  "bat",
+  "photo",
+  "one",
+  "pirate",
+  "sword",
+  "transportation",
+  "river",
+  "waterfall",
+  "gush",
+  "wrestle",
+  "wrestling",
+  "match",
+  "competition",
+  "fight",
+  "microwave",
+  "oven",
+  "inhome",
+  "heat",
+  "wooden",
+  "mud",
+  "exit",
+  "compete",
+  "moody",
+  "tom",
+  "pokeman",
+  "adventure",
+  "togepus",
+  "pokemon",
+  "hat",
+  "television",
+  "ash",
+  "debate",
+  "podium",
+  "cnn",
+  "speaker",
+  "speechles",
+  "cruz",
+  "recap",
+  "somethimg",
+  "letterman",
+  "leterman",
+  "actor",
+  "host",
+  "couch",
+  "doze",
+  "airplane",
+  "plane",
+  "gray",
+  "beauty",
+  "love",
+  "volleyball",
+  "interest",
+  "cry",
+  "haras",
+  "crow",
+  "attack",
+  "knight",
+  "sky",
+  "catch",
+  "maiden",
+  "soldier",
+  "warrior",
+  "oil",
+  "origami",
+  "cheese",
+  "silver",
+  "home",
+  "software",
+  "application",
+  "discussion",
+  "ring",
+  "sink",
+  "nurse",
+  "wipe",
+  "dip",
+  "deal",
+  "target",
+  "britney",
+  "conference",
+  "metal",
+  "medal",
+  "chi",
+  "die",
+  "daughter",
+  "rinse",
+  "build",
+  "ride",
+  "gondola",
+  "snow",
+  "hill",
+  "rope",
+  "lft",
+  "monorail",
+  "chairlift",
+  "peeple",
+  "member",
+  "harry",
+  "flight",
+  "moon",
+  "mission",
+  "parody",
+  "gangnam",
+  "gun",
+  "cortona",
+  "outline",
+  "pack",
+  "bod",
+  "charachter",
+  "heckle",
+  "tenni",
+  "pong",
+  "end",
+  "cardio",
+  "attend",
+  "rear",
+  "win",
+  "jockey",
+  "macdonald",
+  "wish",
+  "greet",
+  "meet",
+  "shuttle",
+  "iss",
+  "science",
+  "dusk",
+  "couple",
+  "doctor",
+  "tardi",
+  "cape",
+  "salad",
+  "chop",
+  "cut",
+  "gear",
+  "machinery",
+  "court",
+  "pace",
+  "cort",
+  "cool",
+  "bald",
+  "trick",
+  "spec",
+  "communist",
+  "museum",
+  "hangar",
+  "laptop",
+  "lab",
+  "cup",
+  "spuid",
+  "octopu",
+  "bottle",
+  "jellyfish",
+  "squid",
+  "dirge",
+  "lessen",
+  "anthem",
+  "shift",
+  "superhero",
+  "foam",
+  "fit",
+  "instal",
+  "fix",
+  "mat",
+  "cusion",
+  "construction",
+  "cushion",
+  "material",
+  "celeberatey",
+  "trophy",
+  "appreciate",
+  "cute",
+  "pas",
+  "lead",
+  "gameplay",
+  "user",
+  "choose",
+  "enter",
+  "name",
+  "cop",
+  "brittany",
+  "life",
+  "peek",
+  "dark",
+  "strum",
+  "guitarist",
+  "gunfire",
+  "syrium",
+  "firefight",
+  "helicopter",
+  "isi",
+  "army",
+  "smoke",
+  "terror",
+  "bombing",
+  "reporter",
+  "bomb",
+  "engine",
+  "garry",
+  "lightyear",
+  "receipe",
+  "flmy",
+  "tutorial",
+  "smell",
+  "taste",
+  "eubank",
+  "fighter",
+  "recall",
+  "spinach",
+  "lasagna",
+  "object",
+  "ply",
+  "skin",
+  "juice",
+  "tiki",
+  "ferrai",
+  "formula",
+  "racing",
+  "comantery",
+  "boyfriend",
+  "threee",
+  "talent",
+  "hear",
+  "confront",
+  "rubber",
+  "layout",
+  "instruct",
+  "pop",
+  "dinner",
+  "date",
+  "clock",
+  "tick",
+  "round",
+  "earthquake",
+  "scientist",
+  "talkshow",
+  "boob",
+  "drop",
+  "dirt",
+  "mes",
+  "instrument",
+  "service",
+  "foot",
+  "opening",
+  "article",
+  "guest",
+  "homer",
+  "light",
+  "technology",
+  "partner",
+  "ches",
+  "consume",
+  "bankruptcy",
+  "nick",
+  "treehouse",
+  "bike",
+  "anime",
+  "sponngebob",
+  "annoy",
+  "tease",
+  "taxi",
+  "method",
+  "gadget",
+  "gentleman",
+  "customize",
+  "proces",
+  "terrify",
+  "creep",
+  "reach",
+  "claw",
+  "ceiling",
+  "flow",
+  "scroll",
+  "invade",
+  "newspaper",
+  "writer",
+  "document",
+  "gambling",
+  "gamble",
+  "punch",
+  "alert",
+  "letter",
+  "fiskal",
+  "magentum",
+  "block",
+  "pig",
+  "concern",
+  "hilary",
+  "hillary",
+  "clinton",
+  "confederate",
+  "speech",
+  "politician",
+  "struggle",
+  "standing",
+  "vocalist",
+  "playground",
+  "stadium",
+  "scrum",
+  "result",
+  "handle",
+  "goal",
+  "lasso",
+  "mansion",
+  "benson",
+  "operation",
+  "kick",
+  "vlog",
+  "lecture",
+  "lecturer",
+  "biology",
+  "deliver",
+  "meeting",
+  "newsman",
+  "hospital",
+  "package",
+  "patient",
+  "recovery",
+  "recipiesaresimple",
+  "gravy",
+  "soup",
+  "appeal",
+  "scoop",
+  "fill",
+  "conclude",
+  "desert",
+  "smash",
+  "demolish",
+  "truck",
+  "crash",
+  "coat",
+  "homeles",
+  "happiness",
+  "luggage",
+  "think",
+  "part",
+  "tool",
+  "pool",
+  "july",
+  "base",
+  "run",
+  "kneel",
+  "score",
+  "mis",
+  "diamond",
+  "bag",
+  "root",
+  "magnify",
+  "jewel",
+  "heroin",
+  "longingly",
+  "hero",
+  "chapter",
+  "peer",
+  "tshirt",
+  "paperwork",
+  "booty",
+  "comb",
+  "hose",
+  "educate",
+  "pitcher",
+  "gravity",
+  "experiment",
+  "bucket",
+  "freezing",
+  "tray",
+  "sweet",
+  "storage",
+  "pureed",
+  "container",
+  "potaato",
+  "freeze",
+  "brother",
+  "coddle",
+  "pet",
+  "ldy",
+  "yard",
+  "calm",
+  "fortres",
+  "till",
+  "thief",
+  "viewer",
+  "feedback",
+  "contestent",
+  "competitor",
+  "hiliary",
+  "cspan",
+  "hatdoll",
+  "lavigne",
+  "advertiesment",
+  "glouse",
+  "crown",
+  "government",
+  "peformance",
+  "photograph",
+  "towel",
+  "buffalo",
+  "universe",
+  "lense",
+  "cell",
+  "fetu",
+  "nyc",
+  "ramp",
+  "goldfish",
+  "destroy",
+  "videogame",
+  "dig",
+  "hell",
+  "bake",
+  "vinegar",
+  "ambulance",
+  "highway",
+  "beat",
+  "kill",
+  "semolina",
+  "thee",
+  "fart",
+  "phrase",
+  "challenge",
+  "herself",
+  "profanity",
+  "obama",
+  "president",
+  "steve",
+  "depict",
+  "researcher",
+  "horror",
+  "barber",
+  "converse",
+  "sweeny",
+  "chemistry",
+  "state",
+  "reader",
+  "medicine",
+  "lamma",
+  "gress",
+  "alpaca",
+  "enclosure",
+  "pin",
+  "lama",
+  "pen",
+  "pram",
+  "device",
+  "smartphone",
+  "app",
+  "troop",
+  "goop",
+  "offer",
+  "channel",
+  "makeup",
+  "montage",
+  "pole",
+  "slideshow",
+  "designer",
+  "candy",
+  "prank",
+  "finger",
+  "swear",
+  "tmz",
+  "topic",
+  "navigate",
+  "yellow",
+  "garbage",
+  "carpet",
+  "roll",
+  "maintenance",
+  "janitor",
+  "lunchroom",
+  "apple",
+  "cafeterium",
+  "bite",
+  "mop",
+  "stuff",
+  "parade",
+  "scooter",
+  "orchestra",
+  "carnival",
+  "standstill",
+  "vintage",
+  "control",
+  "playing",
+  "backround",
+  "roar",
+  "pretend",
+  "cpr",
+  "wedding",
+  "tangle",
+  "earbud",
+  "lid",
+  "sandhole",
+  "train",
+  "crime",
+  "denial",
+  "restrain",
+  "file",
+  "canal",
+  "applaud",
+  "clap",
+  "dad",
+  "story",
+  "vigil",
+  "newscastor",
+  "newswoman",
+  "volley",
+  "neon",
+  "muppet",
+  "king",
+  "theater",
+  "pan",
+  "form",
+  "mold",
+  "mini",
+  "spoon",
+  "motorcycle",
+  "motorbike",
+  "freeway",
+  "picnic",
+  "finn",
+  "range",
+  "engineering",
+  "encourage",
+  "slam",
+  "gate",
+  "tale",
+  "skier",
+  "represent",
+  "surf",
+  "surfboard",
+  "snowboard",
+  "cloud",
+  "arrange",
+  "boo",
+  "disco",
+  "singing",
+  "correspond",
+  "gogh",
+  "screnn",
+  "bernard",
+  "premiere",
+  "care",
+  "salon",
+  "saloon",
+  "forest",
+  "arm",
+  "treatment",
+  "van",
+  "vanwhile",
+  "msn",
+  "dalmation",
+  "pub",
+  "rumor",
+  "explode",
+  "yell",
+  "firework",
+  "dashboard",
+  "stereo",
+  "honda",
+  "robbie",
+  "advocate",
+  "birth",
+  "rat",
+  "hamster",
+  "rodent",
+  "owner",
+  "sniff",
+  "comfort",
+  "attach",
+  "remodel",
+  "glue",
+  "interior",
+  "umpire",
+  "tumble",
+  "become",
+  "collage",
+  "ipod",
+  "comedy",
+  "snake",
+  "shadow",
+  "collar",
+  "frame",
+  "master",
+  "dinosaur",
+  "dinosor",
+  "print",
+  "carfax",
+  "ralley",
+  "tournament",
+  "kium",
+  "teni",
+  "amaze",
+  "compilation",
+  "spar",
+  "marshal",
+  "order",
+  "camel",
+  "auburn",
+  "mustard",
+  "murrey",
+  "jar",
+  "festival",
+  "firearm",
+  "handgun",
+  "magazine",
+  "insert",
+  "surround",
+  "flirt",
+  "cafe",
+  "cupid",
+  "contain",
+  "footage",
+  "wait",
+  "structure",
+  "left",
+  "portrait",
+  "sun",
+  "revolve",
+  "hallway",
+  "glitter",
+  "command",
+  "officer",
+  "solider",
+  "formation",
+  "computergame",
+  "strategy",
+  "puzzle",
+  "memorize",
+  "broth",
+  "bowel",
+  "chicken",
+  "leveler",
+  "journalist",
+  "pitch",
+  "occur",
+  "routine",
+  "oatmeal",
+  "ballad",
+  "listen",
+  "audition",
+  "toilet",
+  "flush",
+  "commode",
+  "fainting",
+  "needle",
+  "faint",
+  "spell",
+  "chase",
+  "pit",
+  "hole",
+  "scooby",
+  "asteroid",
+  "lunar",
+  "impact",
+  "streamer",
+  "growth",
+  "antelope",
+  "technique",
+  "churro",
+  "countertop",
+  "counter",
+  "steam",
+  "mood",
+  "breakfast",
+  "scowl",
+  "facebook",
+  "research",
+  "examine",
+  "paleontologist",
+  "mascara",
+  "hottie",
+  "camouflage",
+  "atv",
+  "smokey",
+  "zombie",
+  "teenage",
+  "aim",
+  "masala",
+  "frying",
+  "village",
+  "crocodile",
+  "aligator",
+  "alligator",
+  "paste",
+  "screw",
+  "gum",
+  "pipe",
+  "gel",
+  "gumbo",
+  "hop",
+  "complain",
+  "tan",
+  "lotion",
+  "toples",
+  "taboo",
+  "day",
+  "bikini",
+  "skillet",
+  "pancake",
+  "zoom",
+  "destination",
+  "option",
+  "capture",
+  "zooming",
+  "execise",
+  "gymanasium",
+  "integrate",
+  "guide",
+  "ryan",
+  "progress",
+  "crazy",
+  "mike",
+  "russium",
+  "vladmir",
+  "vladimir",
+  "addres",
+  "policy",
+  "gown",
+  "spirit",
+  "shop",
+  "journey",
+  "fog",
+  "hunt",
+  "peak",
+  "checkerd",
+  "fault",
+  "alien",
+  "thank",
+  "bal",
+  "iphone",
+  "guinea",
+  "bathroom",
+  "dry",
+  "button",
+  "chilli",
+  "infusion",
+  "slouse",
+  "fanlala",
+  "airport",
+  "hindu",
+  "marriage",
+  "piccolo",
+  "dragonball",
+  "dome",
+  "landmark",
+  "suggest",
+  "tunisium",
+  "danger",
+  "obstacle",
+  "lot",
+  "simple",
+  "earring",
+  "ear",
+  "nasa",
+  "graph",
+  "microsoft",
+  "spreadsheet",
+  "somebody",
+  "parker",
+  "girlfriend",
+  "focu",
+  "indium",
+  "netrus",
+  "ladle",
+  "aid",
+  "university",
+  "rifle",
+  "flatter",
+  "bulldozer",
+  "excavator",
+  "vampire",
+  "fiddle",
+  "israeli",
+  "isreal",
+  "infiltrate",
+  "amusement",
+  "sketch",
+  "seasoning",
+  "salt",
+  "pepper",
+  "steak",
+  "season",
+  "course",
+  "return",
+  "collect",
+  "superb",
+  "wisconsin",
+  "nature",
+  "valley",
+  "hike",
+  "trouble",
+  "failure",
+  "excise",
+  "vertacle",
+  "basketballer",
+  "volcano",
+  "map",
+  "satellite",
+  "sprinkle",
+  "verde",
+  "braid",
+  "africa",
+  "kenya",
+  "interviewer",
+  "gymnasium",
+  "dodge",
+  "automobile",
+  "beard",
+  "title",
+  "beneath",
+  "circket",
+  "rockstar",
+  "joan",
+  "dave",
+  "jam",
+  "interface",
+  "screencast",
+  "coversation",
+  "five-ing",
+  "mltiple",
+  "smack",
+  "nadium",
+  "stretcher",
+  "overlap",
+  "action",
+  "teaser",
+  "ledge",
+  "overlook",
+  "await",
+  "rooftop",
+  "invite",
+  "bbc",
+  "mall",
+  "arrest",
+  "dhabi",
+  "conflict",
+  "suspect",
+  "critique",
+  "expres",
+  "buy",
+  "diver",
+  "scuba",
+  "trunk",
+  "bolt",
+  "dream",
+  "gymnast",
+  "forward",
+  "bar",
+  "auditorium",
+  "list",
+  "ingredient",
+  "pale",
+  "tackle",
+  "support",
+  "beautiful",
+  "back",
+  "bedsheet",
+  "mincraft",
+  "emerald",
+  "job",
+  "york",
+  "harlow",
+  "tomorrow",
+  "study",
+  "sofa",
+  "furniture",
+  "saleman",
+  "beef",
+  "molecule",
+  "chip",
+  "check",
+  "crew",
+  "director",
+  "warfield",
+  "shaq",
+  "balding",
+  "microstructure",
+  "web",
+  "website",
+  "checkmark",
+  "relevant",
+  "ethopium",
+  "risk",
+  "metallica",
+  "battlefield",
+  "mario",
+  "receive",
+  "anchorwoman",
+  "gay",
+  "hello",
+  "futbol",
+  "hollywood",
+  "gtum",
+  "breakdancing",
+  "unbox",
+  "plastic",
+  "drown",
+  "surfer",
+  "drug",
+  "gain",
+  "flip",
+  "transfer",
+  "reflect",
+  "colorful",
+  "tbale",
+  "bridge",
+  "scenery",
+  "coke",
+  "soda",
+  "cola",
+  "saucepan",
+  "stovetop",
+  "coca",
+  "moras",
+  "dessert",
+  "oniontomato",
+  "materialism",
+  "satan",
+  "televison",
+  "atom",
+  "rectangle",
+  "coupe",
+  "damage",
+  "optimu",
+  "butter",
+  "vegetable",
+  "caramelize",
+  "strut",
+  "stroll",
+  "lingerie",
+  "protrude",
+  "bleed",
+  "bone",
+  "audtioner",
+  "safety",
+  "charge",
+  "sniper",
+  "brunette",
+  "eva",
+  "environment",
+  "thought",
+  "handsom",
+  "foul",
+  "murder",
+  "closet",
+  "wearing",
+  "sticker",
+  "transmit",
+  "tape",
+  "glove",
+  "nose",
+  "stroke",
+  "hairstyle",
+  "actres",
+  "isabella",
+  "underworld",
+  "projector",
+  "shark",
+  "sauce",
+  "executive",
+  "spicy",
+  "plat",
+  "curry",
+  "guidance",
+  "candle",
+  "seance",
+  "hall",
+  "peanut",
+  "chocolate",
+  "toffee",
+  "movement",
+  "wake",
+  "policeman",
+  "investigation",
+  "reel",
+  "fisherman",
+  "skateboarder",
+  "racecar",
+  "racetrack",
+  "banister",
+  "trail",
+  "glider",
+  "sexy",
+  "quest",
+  "captain",
+  "blond",
+  "shampoo",
+  "grooming",
+  "sherrus",
+  "steel",
+  "cros",
+  "tree",
+  "word",
+  "onion",
+  "hairanother",
+  "groom",
+  "eckersley",
+  "magic",
+  "cuisine",
+  "customer",
+  "necklace",
+  "tube",
+  "straighten",
+  "wind",
+  "chain",
+  "benefit",
+  "sentence",
+  "bottom",
+  "chow",
+  "bottl",
+  "juggeling",
+  "juggle",
+  "juggling",
+  "mixer",
+  "processor",
+  "mixture",
+  "whole",
+  "crush",
+  "grind",
+  "melon",
+  "promote",
+  "company",
+  "queen",
+  "mustache",
+  "noise",
+  "goofy",
+  "stomach",
+  "cheek",
+  "blush",
+  "tattooed",
+  "undergarment",
+  "heart",
+  "violent",
+  "strength",
+  "darknes",
+  "creepy",
+  "collision",
+  "jennifer",
+  "converstaion",
+  "nutrition",
+  "sweater",
+  "illustrate",
+  "grapple",
+  "referee",
+  "victory",
+  "expansion",
+  "carpenter",
+  "trancition",
+  "nail",
+  "fisher",
+  "translate",
+  "shoulder",
+  "presentation",
+  "sex",
+  "tea",
+  "sip",
+  "turtle",
+  "corner",
+  "babt",
+  "pond",
+  "modeling",
+  "paddle",
+  "bye",
+  "drench",
+  "cleaning",
+  "ment",
+  "synch",
+  "overhead",
+  "pingpong",
+  "olympic",
+  "exciting",
+  "weekday",
+  "lettering",
+  "harrison",
+  "bay",
+  "flex",
+  "lifting",
+  "weight",
+  "wayne",
+  "aeroplane",
+  "tractor",
+  "rider",
+  "horseback",
+  "lamborghini",
+  "running",
+  "playgrond",
+  "simulate",
+  "otehr",
+  "speach",
+  "barack",
+  "nation",
+  "ohio",
+  "motherand",
+  "affection",
+  "laughing",
+  "ballet",
+  "terrific",
+  "composition",
+  "choreographer",
+  "zipline",
+  "hairdryer",
+  "gelatin",
+  "texture",
+  "dryer",
+  "wrestler",
+  "arena",
+  "fool",
+  "fail",
+  "spaciousnes",
+  "reveal",
+  "corridor",
+  "dermatologist",
+  "charity",
+  "rosacea",
+  "cancer",
+  "teaching",
+  "equation",
+  "factor",
+  "spider",
+  "roleplay",
+  "villain",
+  "blind",
+  "fury",
+  "loki",
+  "patch",
+  "instrumental",
+  "tone",
+  "calender",
+  "alarm",
+  "calendar",
+  "torch",
+  "flashlight",
+  "christmastime",
+  "mannequin",
+  "wander",
+  "brighter",
+  "avatar",
+  "combat",
+  "motionles",
+  "iran",
+  "roam",
+  "steal",
+  "documentary",
+  "hyena",
+  "pester",
+  "anchor",
+  "summer",
+  "jesu",
+  "chant",
+  "mingle",
+  "hotel",
+  "elevator",
+  "joseph",
+  "mic",
+  "widget",
+  "pip",
+  "jeep",
+  "absorber",
+  "staircase",
+  "wood",
+  "improvement",
+  "flooring",
+  "floring",
+  "plywood",
+  "deck",
+  "drool",
+  "toaster",
+  "puree",
+  "simmer",
+  "refer",
+  "tab",
+  "tablet",
+  "library",
+  "youngster",
+  "smart",
+  "six",
+  "purple",
+  "pacify",
+  "motorway",
+  "sedan",
+  "spotlight",
+  "grease",
+  "mosquito",
+  "scope",
+  "announcement",
+  "praise",
+  "sputnik",
+  "degras",
+  "convention",
+  "tyson",
+  "unscrew",
+  "unbolt",
+  "instrunment",
+  "shin",
+  "pony",
+  "choreograph",
+  "fountain",
+  "brown",
+  "harvey",
+  "someine",
+  "drawing",
+  "joy",
+  "matter",
+  "draft",
+  "raft",
+  "pontoon",
+  "reef",
+  "micheal",
+  "cuddle",
+  "vehicale",
+  "latitude",
+  "skincare",
+  "inside",
+  "confession",
+  "youth",
+  "tower",
+  "church",
+  "lover",
+  "step",
+  "rob",
+  "lorry",
+  "firefighter",
+  "firetruck",
+  "webpage",
+  "troubleshoot",
+  "distance",
+  "direction",
+  "maintain",
+  "cube",
+  "rubik",
+  "ruby",
+  "rubix",
+  "solve",
+  "cookery",
+  "guillotine",
+  "mist",
+  "mask",
+  "bing",
+  "position",
+  "tmy",
+  "pastry",
+  "powder",
+  "unicorn",
+  "layup",
+  "middle",
+  "sounding",
+  "sound",
+  "winter",
+  "santum",
+  "christma",
+  "timberlake",
+  "justin",
+  "creek",
+  "tourist",
+  "ram",
+  "ribeye",
+  "foil",
+  "harmonica",
+  "sandwich",
+  "sandwhich",
+  "pregnancy",
+  "brightnes",
+  "fiction",
+  "age",
+  "center",
+  "gathering",
+  "alcohol",
+  "wireles",
+  "radio",
+  "volume",
+  "aguy",
+  "coffee",
+  "buzz",
+  "dial",
+  "ventura",
+  "campaign",
+  "collection",
+  "super",
+  "governor",
+  "christie",
+  "sunday",
+  "meghan",
+  "poll",
+  "chri",
+  "caller",
+  "ski",
+  "snowy",
+  "bull",
+  "clown",
+  "jury",
+  "champion",
+  "programm",
+  "gallery",
+  "booth",
+  "icon",
+  "flowerpot",
+  "bassinet",
+  "krueger",
+  "kruger",
+  "kreuger",
+  "freddy",
+  "fatality",
+  "gladiator",
+  "lipstick",
+  "amore",
+  "brand",
+  "lipglos",
+  "flag",
+  "lawn",
+  "rev",
+  "site",
+  "enemy",
+  "bent",
+  "dash",
+  "riffle",
+  "grow",
+  "cohabit",
+  "tiger",
+  "mouth",
+  "edit",
+  "adobe",
+  "photoshop",
+  "manager",
+  "romanium",
+  "ancient",
+  "vine",
+  "opponent",
+  "dunk",
+  "hipster",
+  "energy",
+  "monster",
+  "tribute",
+  "cabinet",
+  "dresser",
+  "twirl",
+  "sporting",
+  "golf",
+  "golfer",
+  "shine",
+  "choir",
+  "gospel",
+  "partake",
+  "smosh",
+  "timey",
+  "colonial",
+  "multiple",
+  "multiplication",
+  "fruit",
+  "lotu",
+  "potato",
+  "peel",
+  "scrap",
+  "curve",
+  "acceleration",
+  "garage",
+  "cap",
+  "demand",
+  "ghost",
+  "drummer",
+  "breathe",
+  "effect",
+  "image",
+  "frock",
+  "marketing",
+  "hawk",
+  "makeshift",
+  "skeleton",
+  "graveyard",
+  "cemetery",
+  "mulitple",
+  "oeioke",
+  "protest",
+  "reply",
+  "protester",
+  "contest",
+  "cobweb",
+  "dump",
+  "drama",
+  "airliner",
+  "staff",
+  "wing",
+  "wipeout",
+  "toddler",
+  "refresh",
+  "swimming",
+  "ripple",
+  "garlic",
+  "feed",
+  "nuzzle",
+  "swimsuit",
+  "mexico",
+  "succes",
+  "communicate",
+  "asses",
+  "treasure",
+  "stun",
+  "trader",
+  "throat",
+  "perish",
+  "kissing",
+  "revenge",
+  "meatloaf",
+  "stick",
+  "gather",
+  "glistening",
+  "competion",
+  "net",
+  "filter",
+  "spray",
+  "inspect",
+  "gsr",
+  "inclosure",
+  "beside",
+  "mill",
+  "din",
+  "bring",
+  "population",
+  "batter",
+  "midwife",
+  "roadside",
+  "picket",
+  "march",
+  "jacket",
+  "cast",
+  "seleton",
+  "handsome",
+  "force",
+  "abuse",
+  "stomp",
+  "subway",
+  "mimic",
+  "tax",
+  "newscast",
+  "krabby",
+  "plankton",
+  "horizon",
+  "hiker",
+  "downhill",
+  "right",
+  "hoodie",
+  "teen",
+  "style",
+  "goku",
+  "sade",
+  "charger",
+  "carbon",
+  "drain",
+  "kidney",
+  "lettuce",
+  "leafy",
+  "utensil",
+  "punching",
+  "kit",
+  "boxer",
+  "training",
+  "bunching",
+  "deconstruct",
+  "blockchain",
+  "symposium",
+  "jacobson",
+  "resort",
+  "accord",
+  "presenter",
+  "correspondent",
+  "son",
+  "clone",
+  "int",
+  "entertainment",
+  "ordinance",
+  "tha",
+  "audiance",
+  "tennisball",
+  "racket",
+  "snowboarder",
+  "audi",
+  "competation",
+  "ferrarus",
+  "clarkson",
+  "crowbar",
+  "stop",
+  "trainer",
+  "signature",
+  "commando",
+  "riot",
+  "interlace",
+  "swordfight",
+  "butt",
+  "fend",
+  "engage",
+  "battle",
+  "celery",
+  "scavenge",
+  "witch",
+  "motor",
+  "bracket",
+  "wto",
+  "green",
+  "johnny",
+  "sman",
+  "cause",
+  "entail",
+  "potatoe",
+  "careful",
+  "hils",
+  "quaker",
+  "mason",
+  "hostage",
+  "somali",
+  "creation",
+  "execute",
+  "rally",
+  "exchange",
+  "phil",
+  "churchill",
+  "churchiill",
+  "brawl",
+  "scuffle",
+  "cricket",
+  "laksa",
+  "pikachu",
+  "sampling",
+  "vidoe",
+  "glow",
+  "rise",
+  "receptor",
+  "tail",
+  "chid",
+  "babbling",
+  "chew",
+  "paparazzi",
+  "premier",
+  "nothing",
+  "marian",
+  "relationship",
+  "comapny",
+  "donate",
+  "cycle",
+  "bicycle",
+  "bycycling",
+  "sparkling",
+  "ribbon",
+  "suacepan",
+  "peter",
+  "griffin",
+  "connection",
+  "boxing",
+  "beatboxing",
+  "twitter",
+  "macbook",
+  "respond",
+  "watchmojo",
+  "empire",
+  "terrifying",
+  "llama",
+  "sheep",
+  "waterway",
+  "froth",
+  "pie",
+  "aluminum",
+  "alumminum",
+  "roasting",
+  "crust",
+  "aluminium",
+  "rent",
+  "analysi",
+  "professional",
+  "simulation",
+  "land",
+  "descend",
+  "measuring",
+  "measure",
+  "flavor",
+  "inform",
+  "fellow",
+  "gerbil",
+  "tunnel",
+  "wiggle",
+  "peep",
+  "stock",
+  "menu",
+  "divine",
+  "candidate",
+  "trump",
+  "donald",
+  "addiction",
+  "department",
+  "attire",
+  "bread",
+  "grandmother",
+  "fridge",
+  "blood",
+  "brain",
+  "harissa",
+  "compile",
+  "disgust",
+  "cutlet",
+  "prize",
+  "slope",
+  "mountainside",
+  "cannon",
+  "lantern",
+  "jack",
+  "projection",
+  "circulate",
+  "rod",
+  "copper",
+  "flute",
+  "length",
+  "intercourse",
+  "grog",
+  "radicalization",
+  "terrorism",
+  "ticker",
+  "broccoli",
+  "brolic",
+  "ramsey",
+  "gordon",
+  "pres",
+  "gift",
+  "hindi",
+  "rip",
+  "cartoonist",
+  "badminton",
+  "badmitten",
+  "batmanton",
+  "indoor",
+  "excite",
+  "surveillance",
+  "alleyway",
+  "integra",
+  "florida",
+  "backpack",
+  "everything",
+  "bollywood",
+  "acura",
+  "umbrella",
+  "spend",
+  "sausage",
+  "saussage",
+  "straightener",
+  "death",
+  "question",
+  "bagpiper",
+  "muffin",
+  "biscuit",
+  "showroom",
+  "dealer",
+  "glance",
+  "loi",
+  "eyebrow",
+  "reprimand",
+  "release",
+  "trancy",
+  "restaraunt",
+  "hive",
+  "beehive",
+  "woma",
+  "spice",
+  "rack",
+  "sloth",
+  "subscription",
+  "implore",
+  "subscribe",
+  "dominate",
+  "entourage",
+  "randomer",
+  "dollar",
+  "zynga",
+  "sillouette",
+  "rovio",
+  "facility",
+  "kung",
+  "ninja",
+  "karate",
+  "standoff",
+  "square",
+  "sprint",
+  "cart",
+  "aisle",
+  "purchase",
+  "grab",
+  "duck",
+  "refrigerate",
+  "foo",
+  "rowboat",
+  "opeing",
+  "goodbye",
+  "cycling",
+  "harley",
+  "baymax",
+  "mate",
+  "videotape",
+  "skydive",
+  "pluck",
+  "avril",
+  "number",
+  "count",
+  "hummer",
+  "motorcyclist",
+  "pulsate",
+  "sparkle",
+  "past",
+  "trip",
+  "prisoner",
+  "albino",
+  "gage",
+  "example",
+  "while",
+  "piano",
+  "countles",
+  "techno",
+  "cow",
+  "headbutt",
+  "level",
+  "station",
+  "railway",
+  "viewpoint",
+  "courtesy",
+  "george",
+  "receiver",
+  "tuxedo",
+  "stocking",
+  "donkey",
+  "robe",
+  "modify",
+  "cookie",
+  "napkin",
+  "hash",
+  "non",
+  "roast",
+  "tantrum",
+  "pillow",
+  "whine",
+  "freak",
+  "slave",
+  "solomon",
+  "plantation",
+  "northup",
+  "flint",
+  "bane",
+  "batman",
+  "footpath",
+  "kitchena",
+  "laboratory",
+  "gesticulate",
+  "pinnochio",
+  "cheerlead",
+  "self",
+  "cement",
+  "vegetation",
+  "mossy",
+  "sermon",
+  "fear",
+  "pastor",
+  "preach",
+  "notice",
+  "row",
+  "evaluate",
+  "kayak",
+  "competetion",
+  "kaya",
+  "panda",
+  "tend",
+  "inappropriate",
+  "log",
+  "basement",
+  "ladder",
+  "trap",
+  "latter",
+  "motorcyle",
+  "slab",
+  "reallity",
+  "creed",
+  "costume",
+  "handball",
+  "goalie",
+  "colbert",
+  "vacuum",
+  "sort",
+  "launch",
+  "popcorn",
+  "episode",
+  "wop",
+  "wheelchair",
+  "oliver",
+  "steven",
+  "enduro",
+  "unload",
+  "hardcore",
+  "faster",
+  "perfect",
+  "helmet",
+  "skiin",
+  "dirtbike",
+  "admire",
+  "vechile",
+  "expo",
+  "bulb",
+  "holiday",
+  "palace",
+  "crying",
+  "cinderella",
+  "soundtrack",
+  "kind",
+  "godmother",
+  "hitter",
+  "update",
+  "congressman",
+  "angela",
+  "countdown",
+  "leonardo",
+  "bunch",
+  "path",
+  "disease",
+  "battery",
+  "attachment",
+  "iron",
+  "hulk",
+  "drone",
+  "lining",
+  "aircraft",
+  "snack",
+  "household",
+  "establishment",
+  "criticise",
+  "anger",
+  "sunami",
+  "shore",
+  "commissioner",
+  "disagree",
+  "claim",
+  "bia",
+  "republican",
+  "author",
+  "cigarette",
+  "brad",
+  "poster",
+  "escalate",
+  "barn",
+  "crane",
+  "concentration",
+  "ecavator",
+  "selena",
+  "hip",
+  "outfit",
+  "dancehall",
+  "kitten",
+  "diffrerent",
+  "gossip",
+  "taylor",
+  "connect",
+  "concept",
+  "datum",
+  "marker",
+  "rubic",
+  "medieval",
+  "elf",
+  "rocket",
+  "infront",
+  "want",
+  "charlie",
+  "restroom",
+  "bladder",
+  "scatter",
+  "touchdown",
+  "coaster",
+  "fan",
+  "eminem",
+  "sprinter",
+  "tragedy",
+  "awe",
+  "stitch",
+  "sew",
+  "knit",
+  "dock",
+  "harbor",
+  "golfing",
+  "disconnect",
+  "rewire",
+  "cord",
+  "connector",
+  "bob",
+  "bop",
+  "spongue",
+  "spacey",
+  "nod",
+  "pay",
+  "john",
+  "fox",
+  "intolerance",
+  "pile",
+  "speaking",
+  "pant",
+  "yawn",
+  "bark",
+  "mock",
+  "puppy",
+  "argument",
+  "cheetum",
+  "cheetah",
+  "cheetha",
+  "taw",
+  "gee",
+  "duel",
+  "celebraty",
+  "stack",
+  "laundry",
+  "clothe",
+  "sheet",
+  "clothesline",
+  "lil",
+  "biz",
+  "amn",
+  "sample",
+  "hanson",
+  "slip",
+  "deer",
+  "foothold",
+  "raindeer",
+  "nfl",
+  "cello",
+  "living",
+  "forklift",
+  "somone",
+  "clipart",
+  "mobility",
+  "tinker",
+  "click",
+  "weighter",
+  "decoration",
+  "bank",
+  "musician",
+  "recite",
+  "soccerball",
+  "hokey",
+  "hokie",
+  "understand",
+  "eclipse",
+  "moo",
+  "baraka",
+  "actin",
+  "fluid",
+  "takeout",
+  "preparation",
+  "bimbo",
+  "charmander",
+  "osbourne",
+  "ozbourne",
+  "catcher",
+  "anything",
+  "edm",
+  "rop",
+  "swing",
+  "cradle",
+  "flare",
+  "tornado",
+  "solar",
+  "burst",
+  "finish",
+  "cursor",
+  "screenshot",
+  "seafood",
+  "shell",
+  "cleaver",
+  "crap",
+  "highlight",
+  "knead",
+  "masseuse",
+  "palor",
+  "inblue",
+  "column",
+  "miniature",
+  "behavior",
+  "pizza",
+  "pope",
+  "bles",
+  "surge",
+  "rhino",
+  "hippo",
+  "moped",
+  "gasket",
+  "chief",
+  "thaw",
+  "obscure",
+  "port",
+  "visibility",
+  "nemesi",
+  "musc",
+  "backdrop",
+  "bride",
+  "heartbroken",
+  "hakeem",
+  "underwear",
+  "doodlebug",
+  "nolan",
+  "castle",
+  "dungeon",
+  "glassware",
+  "dispener",
+  "avoid",
+  "accessory",
+  "clutch",
+  "steering",
+  "warm",
+  "stretch",
+  "guitor",
+  "ultron",
+  "backflip",
+  "watering",
+  "wise",
+  "lawsuit",
+  "demon",
+  "overdose",
+  "paranormal",
+  "bondage",
+  "veg",
+  "plank",
+  "loading",
+  "buffer",
+  "wildernes",
+  "wwe",
+  "mysterio",
+  "tear",
+  "wool",
+  "fabric",
+  "italy",
+  "volkswagen",
+  "wheel",
+  "passenger",
+  "funnel",
+  "unpack",
+  "carton",
+  "oragami",
+  "advance",
+  "grandfather",
+  "slap",
+  "sir",
+  "student",
+  "classmate",
+  "burger",
+  "stew",
+  "fellowman",
+  "restraunt",
+  "kimchi",
+  "today",
+  "ceremony",
+  "attention",
+  "vistor",
+  "visitor",
+  "mount",
+  "dugout",
+  "mound",
+  "championship",
+  "vedio",
+  "jungle",
+  "statement",
+  "restaurent",
+  "dez",
+  "shield",
+  "eyelid",
+  "shane",
+  "monologue",
+  "oscar",
+  "podcast",
+  "obsitcle",
+  "bin",
+  "donation",
+  "kindnes",
+  "multus",
+  "columbium",
+  "spielberg",
+  "kitty",
+  "emission",
+  "outlet",
+  "coyote",
+  "howl",
+  "pray",
+  "televisoin",
+  "skit",
+  "mifa",
+  "countryside",
+  "belt",
+  "independence",
+  "tuck",
+  "blanket",
+  "blancket",
+  "champagne",
+  "card",
+  "complaining",
+  "albinism",
+  "discvery",
+  "shooter",
+  "sledge",
+  "feminism",
+  "navy",
+  "leak",
+  "sweat",
+  "broccoili",
+  "reciepus",
+  "bakery",
+  "nightime",
+  "commit",
+  "swingset",
+  "suicide",
+  "finishing",
+  "smoking",
+  "tutor",
+  "trance",
+  "grandma",
+  "blast",
+  "punk",
+  "rehearse",
+  "headrest",
+  "showing",
+  "cone",
+  "symbol",
+  "japan",
+  "terrace",
+  "wifi",
+  "bathtub",
+  "straw",
+  "bathwater",
+  "drinking",
+  "suck",
+  "freeman",
+  "morgan",
+  "stork",
+  "poach",
+  "outbreak",
+  "dicing",
+  "dice",
+  "producer",
+  "drum",
+  "percussion",
+  "narration",
+  "inventory",
+  "mushroom",
+  "select",
+  "peephole",
+  "society",
+  "healthcare",
+  "sharpie",
+  "notebook",
+  "packet",
+  "future",
+  "munch",
+  "llamma",
+  "elaborate",
+  "fixing",
+  "description",
+  "giraffe",
+  "mickey",
+  "mousekersize",
+  "walt",
+  "adult",
+  "syringe",
+  "dropper",
+  "condensation",
+  "leash",
+  "dye",
+  "wagon",
+  "camp",
+  "stump",
+  "campsite",
+  "upbeat",
+  "everyone",
+  "limp",
+  "weapon",
+  "idiot",
+  "conductor",
+  "beautician",
+  "colmy",
+  "simplify",
+  "lapse",
+  "trampoline",
+  "jewelry",
+  "muffler",
+  "subframe",
+  "curvy",
+  "celebration",
+  "anniversay",
+  "mariah",
+  "anniversary",
+  "romance",
+  "doorbell",
+  "shrimp",
+  "everest",
+  "five",
+  "seminar",
+  "perry",
+  "solor",
+  "trade",
+  "hogan",
+  "pople",
+  "demontrating",
+  "envy",
+  "groud",
+  "valencium",
+  "womean",
+  "marinate",
+  "comedian",
+  "viral",
+  "spounge",
+  "manipulate",
+  "knot",
+  "helicoptor",
+  "location",
+  "decide",
+  "mountaintop",
+  "skyanother",
+  "2013",
+  "runnig",
+  "winner",
+  "gizmo",
+  "kajak",
+  "extraction",
+  "technician",
+  "tissue",
+  "duggar",
+  "stonehenge",
+  "afar",
+  "branch",
+  "wine",
+  "flub",
+  "mean",
+  "consist",
+  "accapella",
+  "thunder",
+  "lick",
+  "loosen",
+  "blogger",
+  "close",
+  "baking",
+  "curse",
+  "brutality",
+  "weird",
+  "flail",
+  "whip",
+  "manchild",
+  "plunge",
+  "splash",
+  "thrash",
+  "recount",
+  "cheat",
+  "cupboard",
+  "organizer",
+  "refrigerator",
+  "shelf",
+  "accident",
+  "tamato",
+  "scout",
+  "iraq",
+  "surveliance",
+  "engineer",
+  "perfomer",
+  "kenium",
+  "grafitus",
+  "rejuvenate",
+  "moisturizer",
+  "fake",
+  "salada",
+  "series",
+  "tokyo",
+  "weather",
+  "leap",
+  "variety",
+  "watermelon",
+  "pastum",
+  "buffet",
+  "chaser",
+  "recreate",
+  "tire",
+  "bush",
+  "bok",
+  "tibet",
+  "grassland",
+  "beyonce",
+  "laura",
+  "cottage",
+  "strain",
+  "frown",
+  "merry",
+  "broom",
+  "gameboy",
+  "broaden",
+  "content",
+  "knowledge",
+  "trust",
+  "lunch",
+  "atray",
+  "buff",
+  "hitch",
+  "gameone",
+  "daring",
+  "rabbit",
+  "keeper",
+  "ina",
+  "reaper",
+  "killer",
+  "specialist",
+  "psychoanalyst",
+  "counselor",
+  "duet",
+  "melody",
+  "arthur",
+  "emanate",
+  "rpm",
+  "firing",
+  "explosion",
+  "escape",
+  "scenario",
+  "entertainer",
+  "spill",
+  "carve",
+  "casa",
+  "gordan",
+  "btalking",
+  "spectacle",
+  "corn",
+  "mixie",
+  "coloring",
+  "intersection",
+  "gla",
+  "lime",
+  "kapil",
+  "stalk",
+  "handler",
+  "sack",
+  "procol",
+  "puma",
+  "randolph",
+  "peetum",
+  "hunger",
+  "mince",
+  "accros",
+  "therapist",
+  "togethernes",
+  "counciling",
+  "lawyer",
+  "beer",
+  "beverage",
+  "garb",
+  "hoist",
+  "autograph",
+  "possession",
+  "egypt",
+  "engaging",
+  "careear",
+  "proone",
+  "session",
+  "edward",
+  "convince",
+  "pail",
+  "conveyer",
+  "wet",
+  "babality",
+  "defeat",
+  "humiliate",
+  "mortal",
+  "cnbc",
+  "leather",
+  "scarf",
+  "esteban",
+  "ricky",
+  "spmone",
+  "highschool",
+  "extention",
+  "michael",
+  "deandre",
+  "chug",
+  "climbinb",
+  "rating",
+  "tilt",
+  "playstation",
+  "hook",
+  "viedo",
+  "note",
+  "string",
+  "thread",
+  "sense",
+  "orbit",
+  "peruse",
+  "compost",
+  "gardening",
+  "robert",
+  "controller",
+  "theme",
+  "price",
+  "inspector",
+  "friendship",
+  "rolling",
+  "spinning",
+  "mcgeorge",
+  "balcony",
+  "tortoise",
+  "popsicle",
+  "squint",
+  "alongwith",
+  "primate",
+  "expression",
+  "macaque",
+  "wok",
+  "prawn",
+  "resist",
+  "rape",
+  "scale",
+  "gorden",
+  "hiw",
+  "miscarriage",
+  "dealership",
+  "sportsman",
+  "spatula",
+  "gopher",
+  "beaver",
+  "whisk",
+  "moment",
+  "wafer",
+  "complex",
+  "squish",
+  "vollyball",
+  "employee",
+  "dispense",
+  "worker",
+  "icecream",
+  "badmington",
+  "shuttlecock",
+  "tourney",
+  "mistake",
+  "goald",
+  "proficiency",
+  "skip",
+  "ariel",
+  "cupcake",
+  "mermaid",
+  "zipper",
+  "fantasy",
+  "devastate",
+  "compute",
+  "brochure",
+  "booklet",
+  "lobby",
+  "split",
+  "sunda",
+  "terrain",
+  "skiing",
+  "britain",
+  "popularity",
+  "coerce",
+  "coo",
+  "mama",
+  "badmitton",
+  "buttock",
+  "chrissy",
+  "verentum",
+  "railing",
+  "physician",
+  "injection",
+  "inject",
+  "beg",
+  "cock",
+  "victorium",
+  "manufacture",
+  "marburg",
+  "overlaid",
+  "warning",
+  "arch",
+  "entrance",
+  "torque",
+  "faster0",
+  "playlist",
+  "online",
+  "opinion",
+  "marketplace",
+  "goalpost",
+  "proposition",
+  "prostitute",
+  "cctv",
+  "situation",
+  "encounter",
+  "chrocodile",
+  "tempt",
+  "transmission",
+  "temple",
+  "architecture",
+  "travelogue",
+  "demonstrating",
+  "tech",
+  "tes",
+  "starburst",
+  "thailand",
+  "marijuana",
+  "witha",
+  "foreigner",
+  "forehead",
+  "lannister",
+  "dinklage",
+  "lad",
+  "constume",
+  "katty",
+  "katy",
+  "chopter",
+  "hoverbike",
+  "hover",
+  "generation",
+  "apron",
+  "axe",
+  "injury",
+  "safty",
+  "wound",
+  "grande",
+  "ariana",
+  "jason",
+  "honey",
+  "cotton",
+  "stopper",
+  "linu",
+  "intro",
+  "indie",
+  "limit",
+  "personality",
+  "plasticine",
+  "seek",
+  "armor",
+  "controversry",
+  "straddle",
+  "fingernail",
+  "sautee",
+  "patty",
+  "caharacter",
+  "redhead",
+  "alphabet",
+  "christian",
+  "plug",
+  "richkidstv",
+  "resturant",
+  "sweep",
+  "unroll",
+  "dust",
+  "circuit",
+  "jail",
+  "agent",
+  "magician",
+  "interrogate",
+  "tortilla",
+  "hymn",
+  "religion",
+  "miley",
+  "fallon",
+  "jimmy",
+  "decision",
+  "jim",
+  "snoop",
+  "escort",
+  "motocros",
+  "info",
+  "mas",
+  "extract",
+  "multimedium",
+  "scorpion",
+  "furnace",
+  "incinerator",
+  "jackson",
+  "weekend",
+  "cristiano",
+  "suspense",
+  "transform",
+  "vilon",
+  "tamil",
+  "smock",
+  "combine",
+  "ginger",
+  "dhal",
+  "listing",
+  "tomato",
+  "mation",
+  "candlelight",
+  "defend",
+  "doomsday",
+  "malibu",
+  "defending",
+  "power",
+  "mmorpg",
+  "backyard",
+  "illnes",
+  "condition",
+  "papaer",
+  "unwrap",
+  "gameboard",
+  "nutraloaf",
+  "prison",
+  "ease",
+  "feminine",
+  "moar",
+  "solicit",
+  "minion",
+  "auto",
+  "offroad",
+  "dune",
+  "tumbler",
+  "winan",
+  "freezer",
+  "storm",
+  "bbq",
+  "club",
+  "retaurant",
+  "parking",
+  "greg",
+  "paino",
+  "rhyme",
+  "nintendo",
+  "coconut",
+  "catapult",
+  "manufacturing",
+  "warehouse",
+  "pursue",
+  "reassure",
+  "plan",
+  "blouse",
+  "bathrom",
+  "interviewing",
+  "jon",
+  "condolence",
+  "memorial",
+  "homage",
+  "audiomicro",
+  "sugar",
+  "cinnamon",
+  "sponebob",
+  "imitate",
+  "recreation",
+  "pressconference",
+  "antler",
+  "moose",
+  "murry",
+  "headset",
+  "painting",
+  "trench",
+  "globemaster",
+  "stoner",
+  "skyscraper",
+  "cabin",
+  "orient",
+  "stiller",
+  "highdive",
+  "statuesque",
+  "elephant",
+  "bump",
+  "suscribe",
+  "appearance",
+  "limousine",
+  "hovercopter",
+  "hovercraft",
+  "voyage",
+  "jumbping",
+  "surgery",
+  "administer",
+  "nickelodeon",
+  "clicker",
+  "brunnette",
+  "mark",
+  "loarge",
+  "unison",
+  "reverse",
+  "cub",
+  "cartwheel",
+  "saute",
+  "tyga",
+  "heavyset",
+  "puddle",
+  "district",
+  "weigh",
+  "advertising",
+  "promotion",
+  "couh",
+  "charleston",
+  "nidoran",
+  "pokelon",
+  "blend",
+  "drill",
+  "pulp",
+  "watermellon",
+  "vie",
+  "sportscaster",
+  "sunset",
+  "pork",
+  "fair",
+  "tin",
+  "xbox",
+  "jumper",
+  "gol",
+  "spiderman",
+  "batmobile",
+  "coacher",
+  "gymnist",
+  "fastball",
+  "kukumapoo",
+  "saffron",
+  "audio",
+  "idea",
+  "thi",
+  "reason",
+  "isn",
+  "introductory",
+  "planist",
+  "pianist",
+  "anticipation",
+  "legislation",
+  "marijuan",
+  "legalize",
+  "hockey",
+  "morgue",
+  "case",
+  "edge",
+  "kombat",
+  "spew",
+  "workout",
+  "playzone",
+  "wtwo",
+  "flatbread",
+  "vendor",
+  "naan",
+  "pound",
+  "sunlight",
+  "traveller",
+  "capacity",
+  "picturesque",
+  "ropway",
+  "chevy",
+  "balloon",
+  "jet",
+  "dub",
+  "role",
+  "indiana",
+  "quote",
+  "fedora",
+  "paragraph",
+  "trudge",
+  "curtain",
+  "nalepianki",
+  "skrobanki",
+  "desolate",
+  "polo",
+  "badge",
+  "crack",
+  "ineterview",
+  "sneeze",
+  "strobe",
+  "vodka",
+  "burray",
+  "biotechnology",
+  "heal",
+  "criminal",
+  "reenact",
+  "void",
+  "manage",
+  "frog",
+  "kermit",
+  "mazda",
+  "vortex",
+  "liquicd",
+  "elect",
+  "winnie",
+  "writhe",
+  "fact",
+  "tehcno",
+  "fanning",
+  "commentating",
+  "mole",
+  "accept",
+  "loom",
+  "twist",
+  "cadaver",
+  "commuter",
+  "sassy",
+  "makup",
+  "cyber",
+  "restaurand",
+  "doe",
+  "violence",
+  "plumber",
+  "chararacter",
+  "intelligence",
+  "algorithm",
+  "gesture",
+  "sugarflor",
+  "hurdle",
+  "copetision",
+  "iceland",
+  "paraglide",
+  "mustang",
+  "nicki",
+  "nikki",
+  "compliment",
+  "attraction",
+  "pesron",
+  "core",
+  "ghul",
+  "stab",
+  "smith",
+  "moma",
+  "zoo",
+  "calony",
+  "survival",
+  "bee",
+  "minature",
+  "saucer",
+  "iteam",
+  "reviewer",
+  "delevigne",
+  "mountion",
+  "sum",
+  "scooba",
+  "official",
+  "tur",
+  "slash",
+  "racoon",
+  "photgraph",
+  "magnet",
+  "megaphone",
+  "alpasca",
+  "animate",
+  "excitement",
+  "hayatochirus",
+  "shopit",
+  "voilin",
+  "homerun",
+  "puppet",
+  "daycare",
+  "roofer",
+  "adhesive",
+  "behave",
+  "liner",
+  "curly",
+  "headphone",
+  "seduce",
+  "passion",
+  "saree",
+  "mourn",
+  "dismis",
+  "dialysi",
+  "tender",
+  "salmon",
+  "wink",
+  "assessment",
+  "finland",
+  "passi",
+  "avenger",
+  "uniform",
+  "law",
+  "tense",
+  "thrill",
+  "franchise",
+  "laughter",
+  "standup",
+  "paneer",
+  "archor",
+  "newtown",
+  "megan",
+  "faucet",
+  "cus",
+  "eyeshadow",
+  "dragon",
+  "bet",
+  "pattern",
+  "nerf",
+  "fondle",
+  "reckles",
+  "sportscar",
+  "public",
+  "midair",
+  "norton",
+  "soak",
+  "squeeze",
+  "sandra",
+  "crisi",
+  "smashbrother",
+  "supersmash",
+  "wild",
+  "encouragement",
+  "barefoot",
+  "ammunition",
+  "ammo",
+  "workspace",
+  "coma",
+  "scar",
+  "expectancy",
+  "rockband",
+  "sceneary",
+  "glare",
+  "invention",
+  "wardrobe",
+  "antique",
+  "volkswagon",
+  "pair",
+  "huddle",
+  "cold",
+  "putty",
+  "samsung",
+  "muscle",
+  "daytona",
+  "datona",
+  "pocket",
+  "wordgirl",
+  "artillery",
+  "exist",
+  "breathtaking",
+  "cannonball",
+  "beanbag",
+  "pinatum",
+  "boosie",
+  "valence",
+  "nucleu",
+  "electron",
+  "argon",
+  "clarinet",
+  "shiver",
+  "mattres",
+  "moschimo",
+  "togetum",
+  "misty",
+  "impres",
+  "manican",
+  "anaconda",
+  "hoax",
+  "boa",
+  "scorpio",
+  "wheeler",
+  "career",
+  "aldy",
+  "charlotte",
+  "thumb",
+  "trinity",
+  "wag",
+  "shelter",
+  "register",
+  "trapeze",
+  "stopwatch",
+  "trapeeze",
+  "spot",
+  "strike",
+  "maker",
+  "smooth",
+  "hay",
+  "stall",
+  "bang",
+  "emotion",
+  "dragnet",
+  "appoint",
+  "crib",
+  "penguin",
+  "cadillac",
+  "sneak",
+  "classrooom",
+  "pause",
+  "cane",
+  "bbqing",
+  "digging",
+  "com",
+  "graze",
+  "console",
+  "cucumber",
+  "editing",
+  "illustration",
+  "snakeskin",
+  "handmade",
+  "amongst",
+  "locality",
+  "choke",
+  "conception",
+  "custom",
+  "pewd",
+  "pewdiepie",
+  "muck",
+  "monica",
+  "undertake",
+  "voltage",
+  "dehydration",
+  "sponsor",
+  "indicate",
+  "surfing",
+  "cargo",
+  "pimp",
+  "barbeque",
+  "hearthstone",
+  "cherokee",
+  "minnie",
+  "fluffy",
+  "sportswoman",
+  "soul",
+  "curl",
+  "tag",
+  "lure",
+  "gilr",
+  "brazier",
+  "bra",
+  "judgement",
+  "somemovie",
+  "terminator",
+  "extend",
+  "washing",
+  "blank",
+  "athelete",
+  "marshawn",
+  "shove",
+  "fighting",
+  "seal",
+  "sight",
+  "developmet",
+  "ipad",
+  "jerry",
+  "investment",
+  "whitney",
+  "madison",
+  "climate",
+  "aviation",
+  "froot",
+  "hallelujah",
+  "anchorman",
+  "inspection",
+  "switch",
+  "microscope",
+  "hocky",
+  "monopoly",
+  "electricity",
+  "spark",
+  "placement",
+  "sleeping",
+  "rubick",
+  "tattoo",
+  "canva",
+  "snatch",
+  "pitt",
+  "daydream",
+  "disturbing",
+  "hypnotist",
+  "wan",
+  "commander",
+  "jawsome",
+  "forza",
+  "lui",
+  "prey",
+  "burrow",
+  "backside",
+  "saw",
+  "account",
+  "trash",
+  "freeform",
+  "poker",
+  "ochocinco",
+  "espn",
+  "pronounce",
+  "join",
+  "territory",
+  "stunt",
+  "mikey",
+  "tug",
+  "tugowar",
+  "bloom",
+  "breeze",
+  "champaka",
+  "greenery",
+  "cycyle",
+  "goliath",
+  "spit",
+  "cermony",
+  "bestow",
+  "meditate",
+  "alloy",
+  "rim",
+  "wonder",
+  "cihldren",
+  "firehouse",
+  "croquette",
+  "skirt",
+  "sheff",
+  "dungenes",
+  "calico",
+  "mistres",
+  "secssion",
+  "dicaprio",
+  "dhoni",
+  "toyotum",
+  "hope",
+  "unrest",
+  "boyband",
+  "lock",
+  "executioner",
+  "bystander",
+  "execution",
+  "south",
+  "robin",
+  "defense",
+  "protective",
+  "mnay",
+  "chomp",
+  "anatomy",
+  "dissect",
+  "exhibition",
+  "jeremy",
+  "cocktail",
+  "downey",
+  "adjustment",
+  "kickball",
+  "knuckleball",
+  "advise",
+  "server",
+  "entree",
+  "sirector",
+  "amazing",
+  "bonu",
+  "diddy",
+  "tabletop",
+  "sopmeone",
+  "wax",
+  "removal",
+  "forearm",
+  "walkthrough",
+  "biking",
+  "vitamin",
+  "choclate",
+  "refridegerator",
+  "protein",
+  "islamophobium",
+  "pact",
+  "gabriel",
+  "pongal",
+  "downloading",
+  "over-emphasizing",
+  "harmony",
+  "housing",
+  "chubby",
+  "calmly",
+  "oxygen",
+  "underwater",
+  "gorge",
+  "thick",
+  "diner",
+  "suitcase",
+  "breifcase",
+  "submarine",
+  "pressure",
+  "yoga",
+  "carter",
+  "flap",
+  "parkor",
+  "spare",
+  "bowling",
+  "bowler",
+  "comdey",
+  "slopestyle",
+  "section",
+  "promo",
+  "mathew",
+  "spacecraft",
+  "mcconaughy",
+  "bonfire",
+  "bonefire",
+  "brake",
+  "carbreak",
+  "rotor",
+  "recipie",
+  "stimulate",
+  "europe",
+  "groundwater",
+  "outer",
+  "seep",
+  "widow",
+  "johansson",
+  "ten",
+  "vessel",
+  "germany",
+  "labor",
+  "gauge",
+  "garnish",
+  "biker",
+  "pore",
+  "recepie",
+  "mspaint",
+  "furry",
+  "inch",
+  "worm",
+  "caterpillar",
+  "wate",
+  "hula",
+  "footnall",
+  "compose",
+  "saddle",
+  "trot",
+  "backboard",
+  "dwight",
+  "bus",
+  "neighborhood",
+  "schoolbu",
+  "prince",
+  "bambi",
+  "banana",
+  "ulton",
+  "crossbar",
+  "typewriter",
+  "trio",
+  "ufc",
+  "frypan",
+  "consumer",
+  "snuggle",
+  "industrialist",
+  "cramer",
+  "mpv",
+  "spokesman",
+  "toast",
+  "motherboard",
+  "surprise",
+  "improvisation",
+  "perosn",
+  "acrobat",
+  "lobster",
+  "stain",
+  "beige",
+  "year",
+  "blackberry",
+  "storyline",
+  "walking",
+  "beeker",
+  "beaker",
+  "solution",
+  "derrick",
+  "unicycle",
+  "loiter",
+  "bose",
+  "funny",
+  "desktop",
+  "mac",
+  "crossing",
+  "recording",
+  "apology",
+  "bunker",
+  "argentina",
+  "paw",
+  "displaying",
+  "batterie",
+  "propeller",
+  "clockwise",
+  "costar",
+  "decease",
+  "lsd",
+  "mating",
+  "hump",
+  "copulate",
+  "doom",
+  "chemical",
+  "whelm",
+  "proposal",
+  "tabletenni",
+  "breath",
+  "hemsworth",
+  "acting",
+  "helmsworth",
+  "twin",
+  "pertain",
+  "intensity",
+  "fork",
+  "cower",
+  "insect",
+  "nerd",
+  "disappointment",
+  "zip",
+  "network",
+  "collaboration",
+  "anyone",
+  "communication",
+  "hai",
+  "aisain",
+  "raid",
+  "dustbin",
+  "dumpster",
+  "mailbox",
+  "aerobic",
+  "canyon",
+  "cody",
+  "analyze",
+  "vapor",
+  "roacket",
+  "micky",
+  "fairie",
+  "lamb",
+  "nursery",
+  "initiative",
+  "precision",
+  "photoshoot",
+  "genital",
+  "narrarator",
+  "sygornie",
+  "sigourney",
+  "re-enacting",
+  "pikini",
+  "beggar",
+  "lend",
+  "throne",
+  "monarchy",
+  "grace",
+  "daisy",
+  "twain",
+  "produce",
+  "peal",
+  "moto",
+  "wreck",
+  "mcdonald",
+  "roaed",
+  "darken",
+  "dressing",
+  "reno",
+  "len",
+  "groin",
+  "eavesdrop",
+  "eavesdropping",
+  "distraught",
+  "dispenser",
+  "exhilarate",
+  "exhilarating",
+  "designate",
+  "programming",
+  "globalization",
+  "astage",
+  "lip",
+  "foodball",
+  "phrenology",
+  "phremology",
+  "america",
+  "hitler",
+  "discrimination",
+  "racism",
+  "houser",
+  "pragmatism",
+  "swimmer",
+  "hank",
+  "medication",
+  "prescription",
+  "transport",
+  "adulthood",
+  "nightclub",
+  "wig",
+  "overlay",
+  "meter",
+  "800",
+  "footrace",
+  "siphon",
+  "firmware",
+  "seagull",
+  "importance",
+  "accompany",
+  "eight",
+  "husband",
+  "replacement",
+  "machince",
+  "uner",
+  "bronze",
+  "trim",
+  "laser",
+  "restarunt",
+  "stucture",
+  "surgeon",
+  "nanny",
+  "misbehave",
+  "floating",
+  "mule",
+  "hamburger",
+  "iowa",
+  "lowa",
+  "brief",
+  "cleanong",
+  "solo",
+  "tar",
+  "homemade",
+  "winston",
+  "peole",
+  "agreement",
+  "unemployment",
+  "baton",
+  "woody",
+  "costumer",
+  "literature",
+  "campaigning",
+  "presidency",
+  "refuse",
+  "courier",
+  "beat-boxing",
+  "fillet",
+  "relay",
+  "ping",
+  "request",
+  "kircher",
+  "tim",
+  "convulse",
+  "assail",
+  "intrude",
+  "gourmet",
+  "gingerbread",
+  "recode",
+  "foreground",
+  "ditch",
+  "playdoh",
+  "dexter",
+  "char",
+  "daryl",
+  "showcase",
+  "mand",
+  "abot",
+  "cheery",
+  "sunstance",
+  "album",
+  "fireplace",
+  "resistance",
+  "bga",
+  "larry",
+  "tune",
+  "geographic",
+  "spiel",
+  "alternator",
+  "installing",
+  "brag",
+  "berate",
+  "womanb",
+  "bumper",
+  "dent",
+  "prus",
+  "daylight",
+  "natuiralist",
+  "halloween",
+  "sirloin",
+  "pinocchio",
+  "bussaly",
+  "contmy",
+  "foosball",
+  "cocoon",
+  "burqa",
+  "impersonate",
+  "fingure",
+  "primer",
+  "carson",
+  "feeding",
+  "parent"
+]

ChatUniVi/eval/questions/video_qa/msrvtt_qa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ChatUniVi/eval/questions/video_qa/msvd_a_list.json ADDED Viewed

	@@ -0,0 +1,1202 @@

+[
+  "someone",
+  "rifle",
+  "man",
+  "box",
+  "gun",
+  "person",
+  "weapon",
+  "contain",
+  "opine",
+  "show",
+  "sword",
+  "boy",
+  "kid",
+  "fight",
+  "twirl",
+  "put",
+  "demonstrate",
+  "dance",
+  "practice",
+  "two",
+  "gras",
+  "demonstration",
+  "karate",
+  "perform",
+  "path",
+  "wildernes",
+  "trail",
+  "road",
+  "style",
+  "lane",
+  "actor",
+  "walk",
+  "eagle",
+  "motion",
+  "hare",
+  "rabbit",
+  "bird",
+  "bunny",
+  "field",
+  "dodge",
+  "try",
+  "chase",
+  "water",
+  "cliff",
+  "river",
+  "ocean",
+  "rock",
+  "dive",
+  "jump",
+  "crust",
+  "dough",
+  "chef",
+  "cook",
+  "lay",
+  "spread",
+  "unroll",
+  "cut",
+  "roll",
+  "guitar",
+  "play",
+  "cat",
+  "dog",
+  "bite",
+  "break",
+  "burst",
+  "pop",
+  "attack",
+  "girl",
+  "ball",
+  "toddler",
+  "baby",
+  "yard",
+  "animal",
+  "throw",
+  "motorcycle",
+  "ground",
+  "bike",
+  "stunt",
+  "wheel",
+  "motorcyle",
+  "guy",
+  "motorbike",
+  "minibike",
+  "wheelie",
+  "dachshund",
+  "pool",
+  "swim",
+  "stage",
+  "concert",
+  "jeff",
+  "pyramid",
+  "school",
+  "form",
+  "kneel",
+  "climb",
+  "build",
+  "target",
+  "pistol",
+  "machine",
+  "glock",
+  "fire",
+  "shoot",
+  "stand",
+  "woman",
+  "knife",
+  "vegetable",
+  "lady",
+  "leafy",
+  "parsley",
+  "herb",
+  "coriander",
+  "chop",
+  "garlic",
+  "turnip",
+  "bowl",
+  "root",
+  "plant",
+  "pull",
+  "clean",
+  "mushroom",
+  "tapioca",
+  "cauliflower",
+  "food",
+  "dice",
+  "use",
+  "slouse",
+  "mall",
+  "jean",
+  "airport",
+  "onion",
+  "phone",
+  "telephone",
+  "hang",
+  "speak",
+  "talk",
+  "hold",
+  "hand",
+  "arm",
+  "star",
+  "shin",
+  "gaze",
+  "reach",
+  "stretch",
+  "watch",
+  "guinea",
+  "lettuce",
+  "vegetation",
+  "eat",
+  "tree",
+  "rain",
+  "polouse",
+  "vehicle",
+  "car",
+  "highway",
+  "maneuver",
+  "driver",
+  "cop",
+  "evade",
+  "force",
+  "pit",
+  "shrimp",
+  "shell",
+  "peel",
+  "prepare",
+  "prawn",
+  "fish",
+  "seafood",
+  "worm",
+  "tail",
+  "stuff",
+  "seasoning",
+  "starch",
+  "cover",
+  "season",
+  "leek",
+  "fruit",
+  "leak",
+  "somoene",
+  "batter",
+  "substance",
+  "flmy",
+  "flower",
+  "mix",
+  "pmy",
+  "brush",
+  "tempura",
+  "coat",
+  "spice",
+  "butter",
+  "fry",
+  "meat",
+  "dip",
+  "place",
+  "bread",
+  "oil",
+  "pot",
+  "something",
+  "mitsuba",
+  "frying",
+  "tray",
+  "grill",
+  "liquid",
+  "stir",
+  "cilantro",
+  "udon",
+  "pastum",
+  "boil",
+  "foodstuff",
+  "biscuit",
+  "sushi",
+  "wrapper",
+  "cheese",
+  "kamaboko",
+  "package",
+  "unwrap",
+  "wood",
+  "pink",
+  "cake",
+  "purple",
+  "kamakodo",
+  "lemon",
+  "rind",
+  "piece",
+  "skin",
+  "orange",
+  "container",
+  "hydrate",
+  "date",
+  "shiitake",
+  "fig",
+  "cros",
+  "somene",
+  "thing",
+  "score",
+  "broth",
+  "saute",
+  "drive",
+  "violin",
+  "microphone",
+  "instrument",
+  "fiddle",
+  "pluck",
+  "steak",
+  "pork",
+  "poke",
+  "pierce",
+  "pepper",
+  "sprinkle",
+  "plate",
+  "egg",
+  "dish",
+  "drop",
+  "street",
+  "motocycle",
+  "motor",
+  "lie",
+  "pitcher",
+  "soda",
+  "juic",
+  "juice",
+  "mixture",
+  "jug",
+  "jar",
+  "cucumber",
+  "glas",
+  "drinking",
+  "drink",
+  "four",
+  "explosion",
+  "blast",
+  "blow",
+  "burn",
+  "couch",
+  "sit",
+  "sleeping",
+  "groom",
+  "lick",
+  "superman",
+  "superhero",
+  "door",
+  "superhuman",
+  "giant",
+  "character",
+  "move",
+  "push",
+  "lift",
+  "individual",
+  "dirt",
+  "biker",
+  "bump",
+  "air",
+  "cycle",
+  "fly",
+  "bed",
+  "nap",
+  "sleep",
+  "wake",
+  "waken",
+  "hammer",
+  "thumb",
+  "finger",
+  "hit",
+  "smash",
+  "rope",
+  "walkway",
+  "pathway",
+  "house",
+  "skip",
+  "jumproping",
+  "jockey",
+  "horse",
+  "perimeter",
+  "stadium",
+  "ring",
+  "arena",
+  "competition",
+  "wrestle",
+  "cockfight",
+  "trumpet",
+  "club",
+  "song",
+  "menacing",
+  "clown",
+  "band",
+  "menace",
+  "look",
+  "beef",
+  "fat",
+  "roast",
+  "trim",
+  "bowler",
+  "split",
+  "bowling",
+  "remain",
+  "board",
+  "flip",
+  "doctor",
+  "emergency",
+  "chest",
+  "patient",
+  "gurney",
+  "hospital",
+  "die",
+  "bleed",
+  "help",
+  "content",
+  "chili",
+  "tin",
+  "dump",
+  "mixer",
+  "blender",
+  "cream",
+  "whisk",
+  "knead",
+  "whip",
+  "chimpanzee",
+  "cigarette",
+  "smoking",
+  "monkey",
+  "smoke",
+  "room",
+  "christma",
+  "wear",
+  "turtle",
+  "tortoise",
+  "hunt",
+  "ice",
+  "skate",
+  "sandwich",
+  "mayonnaise",
+  "equipment",
+  "mechanic",
+  "speaker",
+  "loudspeaker",
+  "join",
+  "portion",
+  "outer",
+  "clove",
+  "peal",
+  "mince",
+  "capsicum",
+  "bell",
+  "center",
+  "whole",
+  "half",
+  "apple",
+  "wall",
+  "building",
+  "picture",
+  "camera",
+  "photograph",
+  "milk",
+  "sauce",
+  "football",
+  "soccer",
+  "otter",
+  "beaver",
+  "slide",
+  "mango",
+  "avocado",
+  "lengthwise",
+  "squash",
+  "chew",
+  "mes",
+  "tease",
+  "gnaw",
+  "sniff",
+  "grocery",
+  "stroller",
+  "pram",
+  "store",
+  "supermarket",
+  "cart",
+  "bay",
+  "pay",
+  "forest",
+  "deer",
+  "pas",
+  "roam",
+  "head",
+  "side",
+  "stove",
+  "skillet",
+  "sizzle",
+  "machinthi",
+  "kanu",
+  "hole",
+  "drill",
+  "block",
+  "rhino",
+  "rhinocero",
+  "grazing",
+  "graze",
+  "bush",
+  "hide",
+  "spy",
+  "face",
+  "halt",
+  "luggage",
+  "speed",
+  "papaya",
+  "carve",
+  "track",
+  "eight",
+  "race",
+  "compete",
+  "sprint",
+  "faucet",
+  "bath",
+  "sink",
+  "splash",
+  "shower",
+  "playing",
+  "trick",
+  "deck",
+  "card",
+  "display",
+  "marley",
+  "ride",
+  "orangatan",
+  "orangutan",
+  "shirt",
+  "strip",
+  "undres",
+  "banana",
+  "bottom",
+  "start",
+  "snowboarder",
+  "snow",
+  "hill",
+  "slope",
+  "mountain",
+  "skier",
+  "ski",
+  "snowboard",
+  "potato",
+  "potatoe",
+  "grate",
+  "foot",
+  "thumper",
+  "cartoon",
+  "love",
+  "kis",
+  "dres",
+  "costume",
+  "punch",
+  "shake",
+  "underwood",
+  "carrie",
+  "carpet",
+  "bug",
+  "floor",
+  "witha",
+  "paw",
+  "bark",
+  "child",
+  "right",
+  "sofa",
+  "crawl",
+  "infant",
+  "snuggle",
+  "nuzzle",
+  "rub",
+  "cares",
+  "kitten",
+  "shoestring",
+  "pet",
+  "touch",
+  "shih",
+  "settee",
+  "rib",
+  "spare",
+  "bone",
+  "rack",
+  "bbq",
+  "three",
+  "spar",
+  "helicopter",
+  "airplane",
+  "act",
+  "boxer",
+  "audience",
+  "routine",
+  "presentation",
+  "book",
+  "call",
+  "read",
+  "receive",
+  "attend",
+  "carrot",
+  "lori",
+  "sloth",
+  "lorus",
+  "paper",
+  "rectangular",
+  "tear",
+  "bicycle",
+  "primate",
+  "tricycle",
+  "coach",
+  "icing",
+  "cupcake",
+  "five",
+  "pom",
+  "butt",
+  "cheer",
+  "motorcyclist",
+  "rider",
+  "motorcylist",
+  "crash",
+  "fall",
+  "finishing",
+  "artist",
+  "pencil",
+  "sketch",
+  "draw",
+  "essence",
+  "cooking",
+  "corn",
+  "vanilla",
+  "drizzle",
+  "thick",
+  "loaf",
+  "brown",
+  "swat",
+  "beat",
+  "wave",
+  "shotgun",
+  "running",
+  "spoon",
+  "table",
+  "meal",
+  "din",
+  "curve",
+  "course",
+  "motocros",
+  "lake",
+  "swimming",
+  "line",
+  "shallot",
+  "foam",
+  "bathing",
+  "bettie",
+  "edge",
+  "ledge",
+  "singing",
+  "makeup",
+  "eyelid",
+  "shadow",
+  "liner",
+  "indoor",
+  "gear",
+  "scale",
+  "music",
+  "group",
+  "stamp",
+  "tomato",
+  "tomatoe",
+  "mozzarella",
+  "rice",
+  "scoop",
+  "lasagna",
+  "colander",
+  "simmer",
+  "wrestler",
+  "opponent",
+  "fighter",
+  "reporter",
+  "sandcastle",
+  "sand",
+  "castle",
+  "interview",
+  "bar",
+  "fence",
+  "horseback",
+  "obstacle",
+  "while",
+  "beach",
+  "singer",
+  "hitchhiking",
+  "hitchhiker",
+  "hitch",
+  "ask",
+  "hitchhike",
+  "eyeliner",
+  "outline",
+  "goat",
+  "bleat",
+  "tiger",
+  "lion",
+  "albino",
+  "cub",
+  "interact",
+  "carriage",
+  "family",
+  "trot",
+  "transport",
+  "travel",
+  "row",
+  "hedge",
+  "lemur",
+  "furry",
+  "weird",
+  "hug",
+  "gather",
+  "huddle",
+  "panda",
+  "keeper",
+  "bottle",
+  "veterinarian",
+  "feed",
+  "balance",
+  "scooter",
+  "back",
+  "stay",
+  "instruct",
+  "focu",
+  "game",
+  "goal",
+  "kick",
+  "shakira",
+  "metal",
+  "pillar",
+  "steel",
+  "work",
+  "envelope",
+  "xbox360",
+  "headset",
+  "mail",
+  "item",
+  "write",
+  "canoe",
+  "oanoe",
+  "newscaster",
+  "boat",
+  "paddle",
+  "sail",
+  "glove",
+  "coffee",
+  "cup",
+  "peeler",
+  "potoatoe",
+  "scrap",
+  "chair",
+  "kitchen",
+  "keyboard",
+  "synthesizer",
+  "piano",
+  "cookie",
+  "boarding",
+  "skateboard",
+  "cruise",
+  "peddle",
+  "flute",
+  "lawn",
+  "squirrel",
+  "hop",
+  "teenager",
+  "mom",
+  "teenage",
+  "carry",
+  "lounge",
+  "couple",
+  "video",
+  "girlfriend",
+  "conversation",
+  "crying",
+  "cry",
+  "mourn",
+  "weep",
+  "sob",
+  "grieve",
+  "drip",
+  "church",
+  "sunset",
+  "passenger",
+  "bcycle",
+  "crossbar",
+  "juggle",
+  "tos",
+  "soccar",
+  "sportsman",
+  "bat",
+  "cricket",
+  "error",
+  "cricketer",
+  "crowd",
+  "laugh",
+  "smile",
+  "dummy",
+  "mannequin",
+  "run",
+  "leader",
+  "exercise",
+  "chocolate",
+  "chocolaty",
+  "baker",
+  "wind",
+  "duck",
+  "toy",
+  "stream",
+  "creek",
+  "attention",
+  "squirt",
+  "lad",
+  "paste",
+  "salt",
+  "powder",
+  "dust",
+  "cantaloupe",
+  "sweet",
+  "pumpkin",
+  "cantalopue",
+  "sidewalk",
+  "sledge",
+  "sledgehammer",
+  "pick",
+  "tool",
+  "cube",
+  "hair",
+  "braid",
+  "bulldog",
+  "wad",
+  "fetch",
+  "truck",
+  "bumper",
+  "pickup",
+  "gazelle",
+  "baboon",
+  "antelope",
+  "latter",
+  "fawn",
+  "kidnap",
+  "charge",
+  "broccoli",
+  "rodent",
+  "brocoli",
+  "hamster",
+  "eye",
+  "eyeshadow",
+  "garden",
+  "amongst",
+  "stroll",
+  "bench",
+  "swing",
+  "leaf",
+  "bak",
+  "mouth",
+  "bicyclist",
+  "train",
+  "model",
+  "trolley",
+  "strike",
+  "vase",
+  "son",
+  "father",
+  "busines",
+  "mand",
+  "cleaning",
+  "wash",
+  "hoot",
+  "trainer",
+  "leg",
+  "teach",
+  "view",
+  "shield",
+  "necklace",
+  "husband",
+  "marry",
+  "flag",
+  "dinner",
+  "serve",
+  "scene",
+  "herself",
+  "background",
+  "carom",
+  "meadow",
+  "jungle",
+  "bacon",
+  "aluminum",
+  "foil",
+  "square",
+  "lamp",
+  "attract",
+  "mat",
+  "doorway",
+  "mob",
+  "salaryman",
+  "alley",
+  "cooked",
+  "rear",
+  "army",
+  "parade",
+  "formation",
+  "march",
+  "teacher",
+  "whiteboard",
+  "equation",
+  "math",
+  "formula",
+  "problem",
+  "solve",
+  "explain",
+  "circle",
+  "oven",
+  "bake",
+  "waterfall",
+  "island",
+  "creature",
+  "control",
+  "bullet",
+  "yellow",
+  "harbor",
+  "coast",
+  "speeding",
+  "policeman",
+  "officer",
+  "arrest",
+  "tie",
+  "handcuff",
+  "bikini",
+  "fold",
+  "crocodile",
+  "alligator",
+  "nose",
+  "pat",
+  "tap",
+  "curl",
+  "bodybuilder",
+  "weight",
+  "builder",
+  "barbell",
+  "night",
+  "dark",
+  "beater",
+  "spin",
+  "cage",
+  "zoo",
+  "enclosure",
+  "pen",
+  "pace",
+  "mammal",
+  "blanket",
+  "plastic",
+  "inspect",
+  "examine",
+  "bend",
+  "transplant",
+  "harvest",
+  "tend",
+  "station",
+  "subway",
+  "transit",
+  "disembark",
+  "exit",
+  "desk",
+  "rainbow",
+  "meow",
+  "nitrogen",
+  "liguid",
+  "vapor",
+  "steam",
+  "solution",
+  "pester",
+  "grab",
+  "funny",
+  "jerboa",
+  "chinchilla",
+  "mouse",
+  "rat",
+  "scurry",
+  "sergeant",
+  "seargeant",
+  "soldier",
+  "sargent",
+  "troop",
+  "commander",
+  "voice",
+  "tapir",
+  "branch",
+  "aardvark",
+  "search",
+  "forage",
+  "training",
+  "puff",
+  "squeeze",
+  "juicing",
+  "extract",
+  "strum",
+  "pizza",
+  "enjoy",
+  "ukulele",
+  "drunk",
+  "post",
+  "pavement",
+  "sieve",
+  "somone",
+  "strainer",
+  "miniature",
+  "coconut",
+  "drain",
+  "armchair",
+  "shirtles",
+  "richard",
+  "shilpa",
+  "wag",
+  "beachside",
+  "lead",
+  "cheff",
+  "tower",
+  "shape",
+  "pile",
+  "stack",
+  "quadropod",
+  "photo",
+  "photographer",
+  "tripod",
+  "spike",
+  "lug",
+  "mud",
+  "bear",
+  "digging",
+  "dig",
+  "scratch",
+  "garage",
+  "enter",
+  "shopping",
+  "shop",
+  "browse",
+  "toad",
+  "puddle",
+  "town",
+  "neighborhood",
+  "desert",
+  "steer",
+  "float",
+  "floater",
+  "raft",
+  "parrot",
+  "tabletop",
+  "beak",
+  "mug",
+  "discussion",
+  "discus",
+  "summersault",
+  "axe",
+  "bun",
+  "hamburger",
+  "sweeper",
+  "vacuum",
+  "chicken",
+  "shore",
+  "step",
+  "dancing",
+  "elephant",
+  "stick",
+  "twig",
+  "stem",
+  "bamboo",
+  "rest",
+  "dvd",
+  "disc",
+  "disk",
+  "wipe",
+  "dab",
+  "fix",
+  "synchronization",
+  "bedroom",
+  "noise",
+  "frog",
+  "catch",
+  "handle",
+  "armles",
+  "puppet",
+  "string",
+  "marionette",
+  "seat",
+  "leap",
+  "bounce",
+  "zebra",
+  "nudge",
+  "stroke",
+  "bathroom",
+  "nicholson",
+  "attendant",
+  "jacket",
+  "park",
+  "coaster",
+  "jet",
+  "plane",
+  "runway",
+  "land",
+  "slicer",
+  "mandolin",
+  "ionion",
+  "cowboy",
+  "vampire",
+  "monster",
+  "intruder",
+  "poeple",
+  "submarine",
+  "ship",
+  "missile",
+  "shuttle",
+  "space",
+  "bomb",
+  "glide",
+  "beer",
+  "midget",
+  "somebody",
+  "goblet",
+  "guiter",
+  "pole",
+  "fishing",
+  "shave",
+  "mirror",
+  "image",
+  "reflection",
+  "shark",
+  "dolphin",
+  "fisherman",
+  "bait",
+  "steal",
+  "measure",
+  "surprise",
+  "jut",
+  "defecate",
+  "skateboarder",
+  "ramp",
+  "tunnel",
+  "mower",
+  "mover",
+  "mow",
+  "lawnmower",
+  "booty",
+  "lucy",
+  "animator",
+  "butterfly",
+  "switch",
+  "twist",
+  "treadmill",
+  "twisting",
+  "attempt",
+  "halloween",
+  "jaw",
+  "skull",
+  "mimic",
+  "provide",
+  "gravel",
+  "floyd",
+  "gilmmy",
+  "ban",
+  "tiller",
+  "soil",
+  "auger",
+  "till",
+  "bot",
+  "powerboat",
+  "yacht",
+  "sea",
+  "traverse",
+  "bull",
+  "peach",
+  "avoid",
+  "halve",
+  "combine",
+  "wooden",
+  "concrete",
+  "cement",
+  "brick",
+  "stomach",
+  "bust",
+  "boxing",
+  "yoga",
+  "meditate",
+  "pray",
+  "clay",
+  "piggyback",
+  "platform",
+  "body",
+  "weigt",
+  "weightlift",
+  "stenographer",
+  "stenograph",
+  "stenography",
+  "type",
+  "cleaver",
+  "capuchin",
+  "itch",
+  "way",
+  "moose",
+  "sprinkler",
+  "wet",
+  "fore",
+  "moonwalk",
+  "hiking",
+  "reaper",
+  "skeleton",
+  "hike",
+  "welsh",
+  "scuffle",
+  "frolic",
+  "cab",
+  "taxi",
+  "argue",
+  "front",
+  "jumper",
+  "trampoline",
+  "pounce",
+  "cell",
+  "prisoner",
+  "middle",
+  "prison",
+  "jail",
+  "figure",
+  "intimidate",
+  "belly",
+  "bridge",
+  "hanging",
+  "swinging",
+  "hedgehog",
+  "porcupine",
+  "spinner",
+  "bagel",
+  "recite",
+  "slap",
+  "letter",
+  "range",
+  "spray",
+  "retain",
+  "pilot",
+  "handshake",
+  "offer",
+  "hose",
+  "friend",
+  "instructor",
+  "arrange",
+  "oxiclean",
+  "detergent",
+  "cleaner",
+  "fill",
+  "tea",
+  "basket",
+  "suck",
+  "sip",
+  "suckle",
+  "carnival",
+  "coupon",
+  "ticket",
+  "trunk",
+  "log",
+  "golf",
+  "golfing",
+  "bathtub",
+  "tub",
+  "batting",
+  "knock",
+  "stretcher",
+  "stairmaster",
+  "operate"
+]

ChatUniVi/mm_utils.py ADDED Viewed

	@@ -0,0 +1,73 @@

+from PIL import Image
+from io import BytesIO
+import base64
+import torch
+from transformers import StoppingCriteria
+from ChatUniVi.constants import IMAGE_TOKEN_INDEX
def load_image_from_base64(image):
    """Decode a base64-encoded payload and open the resulting bytes with PIL.

    Args:
        image: Base64 data accepted by ``base64.b64decode``.

    Returns:
        The object returned by ``Image.open`` on the decoded bytes.
    """
    raw_bytes = base64.b64decode(image)
    byte_stream = BytesIO(raw_bytes)
    return Image.open(byte_stream)
def process_images(images, image_processor, model_cfg):
    """Run ``image_processor`` on ``images`` and return its ``'pixel_values'`` entry.

    Args:
        images: Passed through unchanged as the first positional argument.
        image_processor: Callable invoked as
            ``image_processor(images, return_tensors='pt')``; its result must
            support ``['pixel_values']`` lookup.
        model_cfg: Accepted for interface compatibility; not used here.

    Returns:
        The ``'pixel_values'`` item of the processor output.
    """
    processed = image_processor(images, return_tensors='pt')
    return processed['pixel_values']
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize ``prompt``, substituting ``image_token_index`` for each ``'<image>'`` marker.

    The prompt is split on ``'<image>'`` and every piece is tokenized on its
    own. If the first piece starts with ``tokenizer.bos_token_id``, that id is
    emitted once up front and the first id of *every* piece is skipped
    (presumably because the tokenizer prepends BOS to each piece -- verify
    against the tokenizer in use).

    Args:
        prompt: Text possibly containing ``'<image>'`` placeholders.
        tokenizer: Callable returning an object with ``input_ids``; must expose
            ``bos_token_id``.
        image_token_index: Id inserted between consecutive pieces.
        return_tensors: ``None`` for a plain list, ``'pt'`` for a
            ``torch.long`` tensor.

    Returns:
        The combined ids as a list, or as a tensor when ``return_tensors='pt'``.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    pieces = [tokenizer(piece).input_ids for piece in prompt.split('<image>')]

    starts_with_bos = (
        len(pieces) > 0
        and len(pieces[0]) > 0
        and pieces[0][0] == tokenizer.bos_token_id
    )
    skip = 1 if starts_with_bos else 0

    token_ids = []
    if starts_with_bos:
        token_ids.append(pieces[0][0])

    # The separator is padded with `skip` extra ids so that slicing it with the
    # same `[skip:]` used for text pieces leaves exactly one image token.
    separator = [image_token_index] * (skip + 1)
    for position, piece in enumerate(pieces):
        if position > 0:
            token_ids.extend(separator[skip:])
        token_ids.extend(piece[skip:])

    if return_tensors is None:
        return token_ids
    if return_tensors == 'pt':
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
def get_model_name_from_path(model_path):
    """Derive a short model name from a ``/``-separated model path.

    Leading and trailing slashes are ignored. When the last path component
    starts with ``'checkpoint-'`` and a parent component exists, the result is
    ``"<parent>_<checkpoint>"``; otherwise it is the last component alone.

    Args:
        model_path: Path string using ``/`` as the separator.

    Returns:
        The derived model name.
    """
    parts = model_path.strip("/").split("/")
    last = parts[-1]
    # A bare "checkpoint-N" has no parent directory to prefix; previously this
    # case raised IndexError on parts[-2].
    if last.startswith('checkpoint-') and len(parts) > 1:
        return parts[-2] + "_" + last
    return last
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when a keyword appears at the end of the output.

    Each keyword is matched in two ways: by comparing its token ids with the
    tail of ``output_ids``, and by searching for the keyword text in the
    decoded last few generated tokens.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """Tokenize the keywords and record the prompt length.

        Args:
            keywords: Iterable of keyword strings.
            tokenizer: Callable returning an object with ``input_ids``; must
                expose ``bos_token_id`` and ``batch_decode``.
            input_ids: Prompt ids; ``input_ids.shape[1]`` is stored as the
                length after which tokens count as generated.
        """
        self.keywords = keywords
        self.keyword_ids = []
        for keyword in keywords:
            cur_keyword_ids = tokenizer(keyword).input_ids
            # Strip a leading BOS so the ids can match the tail of the output.
            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
                cur_keyword_ids = cur_keyword_ids[1:]
            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when any keyword is detected at the end of ``output_ids``.

        Args:
            output_ids: Ids produced so far; only batch size 1 is supported.
            scores: Unused.

        Returns:
            True if generation should stop, False otherwise.
        """
        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
        for keyword_id in self.keyword_ids:
            tail = output_ids[0, -keyword_id.shape[0]:]
            # torch.equal yields a single bool and is False on a length
            # mismatch; the previous `if a == b:` raised for multi-token
            # keywords (ambiguous truth value) and for outputs shorter than
            # the keyword (shape mismatch).
            if torch.equal(tail, keyword_id):
                return True
        offset = min(output_ids.shape[1] - self.start_len, 3)
        if offset <= 0:
            # Nothing generated past the prompt yet. Slicing with `-0:` would
            # decode the whole prompt and could match a keyword inside it.
            return False
        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        return any(keyword in outputs for keyword in self.keywords)

ChatUniVi/utils.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import datetime
+import logging
+import logging.handlers
+import os
+import sys
+import requests
+from ChatUniVi.constants import LOGDIR
+server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+handler = None
def build_logger(logger_name, logger_filename):
    """Configure process-wide logging and return the logger named ``logger_name``.

    Side effects, in order:
      * installs a root handler via ``logging.basicConfig`` if none exists and
        applies the shared format to the first root handler;
      * replaces ``sys.stdout`` and ``sys.stderr`` with ``StreamToLogger``
        wrappers bound to the ``"stdout"`` (INFO) and ``"stderr"`` (ERROR)
        loggers;
      * on the first call only (module-level ``handler`` is ``None``), creates
        ``LOGDIR`` and a ``TimedRotatingFileHandler`` for ``logger_filename``
        and attaches it to every ``logging.Logger`` registered at that moment.

    Args:
        logger_name: Name of the logger to return.
        logger_filename: File name, joined onto ``LOGDIR``, for the file handler.

    Returns:
        The ``logging.Logger`` for ``logger_name``, set to INFO level.
    """
    global handler

    log_format = logging.Formatter(
        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # Make sure the root logger has a handler, then apply the shared format.
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(level=logging.INFO)
    root_logger.handlers[0].setFormatter(log_format)

    # Route stdout then stderr through loggers; the order matters because each
    # wrapper captures the current sys.stdout when it is constructed.
    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
        stream_logger = logging.getLogger(stream_name)
        stream_logger.setLevel(level)
        setattr(sys, stream_name, StreamToLogger(stream_logger, level))

    named_logger = logging.getLogger(logger_name)
    named_logger.setLevel(logging.INFO)

    # The shared file handler is created once and reused afterwards.
    if handler is not None:
        return named_logger

    os.makedirs(LOGDIR, exist_ok=True)
    log_path = os.path.join(LOGDIR, logger_filename)
    handler = logging.handlers.TimedRotatingFileHandler(
        log_path, when='D', utc=True)
    handler.setFormatter(log_format)
    for registered in logging.root.manager.loggerDict.values():
        if isinstance(registered, logging.Logger):
            registered.addHandler(handler)
    return named_logger
class StreamToLogger(object):
    """
    File-like stand-in for a stream that forwards written text to a logger.

    Text is buffered until a newline-terminated line is available; each such
    line is logged (right-stripped) at ``log_level``. Attributes not defined
    here are looked up on the ``sys.stdout`` captured at construction time.
    """

    def __init__(self, logger, log_level=logging.INFO):
        self.terminal = sys.stdout
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def __getattr__(self, attr):
        # Only reached for names missing on this object: defer to the stream
        # that was sys.stdout when the wrapper was created.
        return getattr(self.terminal, attr)

    def write(self, buf):
        """Log every complete line in ``buf``; keep any trailing partial line buffered."""
        pending, self.linebuf = self.linebuf + buf, ''
        for piece in pending.splitlines(True):
            # From the io.TextIOWrapper docs:
            #   On output, if newline is None, any '\n' characters written
            #   are translated to the system default line separator.
            # By default sys.stdout.write() expects '\n' newlines and then
            # translates them so this is still cross platform.
            if piece.endswith('\n'):
                self.logger.log(self.log_level, piece.rstrip())
            else:
                self.linebuf += piece

    def flush(self):
        """Log whatever partial line is still buffered, then clear the buffer."""
        leftover = self.linebuf
        if leftover:
            self.logger.log(self.log_level, leftover.rstrip())
        self.linebuf = ''
def disable_torch_init():
    """
    Replace ``reset_parameters`` on ``torch.nn.Linear`` and ``torch.nn.LayerNorm``
    with a no-op so the default initialization is skipped when models are created.
    """
    import torch

    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
        setattr(layer_cls, "reset_parameters", lambda self: None)
def violates_moderation(text):
    """
    Check whether the text violates OpenAI moderation API.

    Posts ``text`` (with newlines removed) to the moderation endpoint and
    returns the ``flagged`` field of the first result. The check is
    best-effort: request failures and malformed responses yield ``False``.

    Args:
        text: The user input to check.

    Returns:
        The ``flagged`` value from the response, or ``False`` when it could not
        be obtained.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set
            (it is read before the request is attempted).
    """
    import json  # local import so this block does not depend on the file's import list

    url = "https://api.openai.com/v1/moderations"
    headers = {"Content-Type": "application/json",
               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
    text = text.replace("\n", "")
    # Serialize with json.dumps so quotes, backslashes and control characters
    # in the text are escaped; hand-built concatenation produced invalid JSON
    # for such input. ensure_ascii=False keeps the bytes identical to the old
    # payload for text that needed no escaping.
    data = json.dumps({"input": text}, ensure_ascii=False).encode("utf-8")
    try:
        ret = requests.post(url, headers=headers, data=data, timeout=5)
        flagged = ret.json()["results"][0]["flagged"]
    except requests.exceptions.RequestException:
        flagged = False
    except (KeyError, IndexError, TypeError, ValueError):
        # Unexpected response shape (missing key, empty results, non-JSON body):
        # treat as "not flagged", consistent with the other failure paths.
        flagged = False
    return flagged
def pretty_print_semaphore(semaphore):
    """Return a short description of ``semaphore`` for log output.

    Args:
        semaphore: ``None``, or an object exposing ``_value`` and ``locked()``.

    Returns:
        ``"None"`` when no semaphore is given, otherwise a string showing the
        semaphore's ``_value`` and the result of ``locked()``.
    """
    if semaphore is not None:
        return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
    return "None"

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+# SimToken: A Simple Baseline for Referring Audio-Visual Segmentation
+[![TGS](https://img.shields.io/badge/Paper-SimToken-red?logo=arXiv)](https://arxiv.org/abs/2509.17537)
+---
+## 📰 News
+[//]: # (🔥**2026.1.18**: Code are released now！)
+🔥**2026.1.18**: Our paper got accepted to **ICASSP 2026**! Thanks to all co-authors and the anonymous reviewers🎉🎉
+---
+## ⚙️ Setup
+### Datasets
+Download the official Ref-AVSBench dataset from [here](https://github.com/GeWu-Lab/Ref-AVS) and organize the dataset as follows:
+```
+./REFAVS/data
+    - /media
+    - /gt_mask
+    - /metadata.csv
+```
+### Pretrained Backbones
+Download the sam_vit_h_4b8939.pth and put it in ```./models/segment_anything```
+### Checkpoints
+Download our pretrained **[SimToken](https://drive.google.com/file/d/1pargYfFy93rymCANuWV0nt6Lx3Ri406l/view?usp=sharing)** checkpoint.
+### Core Requirements
+This project depends on a small set of core packages. The configuration below has been tested and is recommended for stable execution.
+- `numpy`, `pandas`, `matplotlib`, `opencv`
+- `einops`, `timm`
+- `sentencepiece`
+- `transformers`, `peft`
+Newer versions of `transformers` and `peft` may introduce API changes or naming/registration conflicts that can trigger runtime errors in this project (e.g., custom model/config registration).
+To avoid such compatibility issues, we recommend **not using overly recent versions** and instead pinning the two packages to the versions used during our development:
+- `transformers==4.30.2`
+- `peft==0.2.0`
+We also provide a complete requirements.txt for reference and easier reproduction:
+```
+pip install -r requirements.txt
+```
+---
+## 📌 Getting Started
+### Preparation
+We recommend running the following code to pre-extract audio features and visual features compatible with SAM:
+```
+python save_audio_feats.py --data_dir 'path/to/data'
+python save_sam_feats.py  --data_dir 'path/to/data'
+```
+### Train
+To train our model on Ref-AVS Bench:
+```
+python -W ignore train.py --name 'xxx' \
+    --vision_pretrained 'path/to/segment_anything/sam_vit_h_4b8939.pth' \
+    --vision_tower 'openai/clip-vit-large-patch14' \
+    --mllm 'Chat-UniVi/Chat-UniVi-7B-v1.5' \
+    --data_dir 'path/to/data'\
+    --log_root 'path/to/log_root'\
+    --checkpoint_root 'path/to/checkpoints_root'
+```
+### Test
+To test our pretrained SimToken checkpoint:
+```
+python -W ignore load_model.py  --saved_model 'path/to/checkpoint.pth' \
+    --vision_pretrained 'path/to/segment_anything/sam_vit_h_4b8939.pth' \
+    --vision_tower 'openai/clip-vit-large-patch14' \
+    --mllm 'Chat-UniVi/Chat-UniVi-7B-v1.5' \
+    --data_dir 'path/to/data' \
+    --visualization_root 'path/to/visualization_root'
+```

SEG_LTPO_results.md ADDED Viewed

	@@ -0,0 +1,348 @@

+# SEG-LTPO: Experimental Results and Analysis
+---
+## Method 1: SEG-LTPO-simple (ES-based, zeroth-order)
+### Overview
+SEG-LTPO-simple performs test-time optimization of SimToken's single semantic token **Fseg** using antithetic Evolution Strategies (ES), guided by an internal reward signal that requires no ground-truth masks.
+**Optimization loop** (T=5 steps, 4 anchor frames):
+```
+eps_t ~ N(0, σ_t² I)
+F± = F_curr ± eps_t
+F_curr = F_curr + η_t · (R+ − R−) / (2σ_t²) · eps_t
+best_F = argmax_F R(F) over all evaluated candidates
+```
+**Reward function:**
+```
+R = λ1·R_temp_feat + λ2·R_iou_pred + λ3·R_align_contrast − λ4·R_area
+  = 0.3·R_temp + 0.4·R_iou + 1.0·R_align − 0.3·R_area
+```
+- **R_align_contrast**: cosine(Fseg, z_inside) − β·cosine(Fseg, z_outside); main signal
+- **R_iou_pred**: SAM's internal mask quality head output
+- **R_temp_feat**: feature-space cosine consistency between adjacent anchor frames
+- **R_area**: average foreground ratio (degenerate-mask penalty)
+**Reward gating**: accept optimized Fseg only when R(best_F) > R(F_init) + gate_delta.
+### Results (Unseen split, full 1656 samples)
+| Method | mIoU | F | Δ mIoU |
+|--------|------|---|--------|
+| Baseline | 0.6989 | 0.7927 | — |
+| Best-of-2 Random | 0.7050 (subset) → 0.7030 (full) | 0.7953 | +0.0040 |
+| SEG-LTPO-simple (ES) | **0.7050** | **0.7960** | **+0.0061** |
+> Best-of-2 and LTPO-ES results at full scale confirmed in the q-LTPO evaluation run below.
+### Key Findings
+1. **Reward signal is valid**: both Best-of-2 and ES-LTPO outperform baseline, confirming R_align_contrast provides useful signal.
+2. **ES update is noisy**: in 500-sample ablation, Best-of-2 (0.7235) slightly outperformed iterative ES (0.7228), due to extremely low SNR of single-sample gradient estimation in 256d space. At full scale (1656), ES-LTPO recovers (+0.0065 vs +0.0040), but the margin over Best-of-2 is small.
+3. **Null stability**: Null S metric change negligible (+0.00025), reward gating effectively suppresses false positives.
+---
+## Method 2: q-LTPO-autograd (first-order, Adam maximize)
+### Overview
+**Core insight from LTPO analysis**: optimize the variable that is *directly consumed* by the downstream module, using autograd rather than noisy zeroth-order estimation.
+**Three design decisions borrowed from original LTPO:**
+1. **Optimize q, not Fseg.** In SimToken+SAM, the token that directly enters the mask decoder's cross-attention is `q = sparse_emb = Fseg.unsqueeze(1)` (prompt encoder passes text_embeds through unchanged). We set `q = nn.Parameter(q_init)` and optimize q directly, bypassing the prompt encoder entirely. This requires no invertibility of ε_p — q_best is used directly for final inference.
+2. **Use autograd when reward is differentiable.** The mask decoder (transformer + MLP + matmul) is fully differentiable. With soft masks instead of hard thresholds, all reward terms are differentiable w.r.t. q. Adam maximize replaces the low-SNR score-function estimator.
+3. **Track best_q by task reward (no regularization), gate at the end.** λ_reg penalty is excluded from gating to avoid penalizing solutions that drifted slightly from q_init but achieved better task reward.
+**Stage 0: Gradient connectivity check (verified)**
+```
+grad_norm (step 0): 0.503070
+reward trajectory:  [0.4650, 0.4709, 0.4770, 0.4831, 0.4892]  ← strictly monotone
+gradient_connected: True
+```
+### Optimization loop
+```python
+q = nn.Parameter(q_init.float().detach().clone())
+optimizer = Adam([q], lr=lr_auto, maximize=True)
+best_q, best_reward = q_init.clone(), R_task(q_init)
+for step in range(T=5):
+    R_full = R_task(q) - λ_reg * ||q - q_init||²
+    R_full.backward()
+    optimizer.step()
+    clip_to_L2_ball(q, q_init, max_drift)      # hard norm constraint
+    if R_task(q) > best_reward:
+        best_q = q.clone()
+# gating
+use best_q if R_task(best_q) > R_task(q_init) + gate_delta, else q_init
+```
+**Hyperparameters (auto-scaled from q_init):**
+- `lr = 0.01 × RMS(q_init)`
+- `max_drift = 0.5 × ||q_init||`
+- `λ_reg = 0.01`, `gate_delta = 0.0`
+### Staged reward build-up
+**Stage 1** (R_iou + R_area_soft + λ_reg):
+```
+R_task = 0.6·R_iou_pred − 0.2·sigmoid(mask_logits/τ).mean()
+         where τ=5.0 (temperature to avoid sigmoid saturation)
+```
+**Stage 2** (Stage 1 + R_align_det):
+```
+R_task = 0.4·R_iou_pred + 1.0·R_align_det − 0.3·R_area_soft
+R_align_det = mean_t [ cosine(q, stopgrad(z_in^t)) − 0.5·cosine(q, stopgrad(z_out^t)) ]
+```
+z_in/z_out are stopgrad'd to avoid coupling: q first finds a mask, then moves toward the masked region's semantics.
+### Results (Unseen split)
+#### 200-sample subset (Stage 1 vs Stage 2 fair comparison, same baseline)
+| Method | mIoU | F | Δ mIoU |
+|--------|------|---|--------|
+| Baseline | 0.6749 | 0.7763 | — |
+| Best-of-2 ES | 0.6801 | 0.7803 | +0.0052 |
+| LTPO-ES | 0.6838 | 0.7826 | +0.0089 |
+| q-LTPO Stage 1 | 0.6979 | 0.7802 | +0.0230 |
+| q-LTPO Stage 2 | **0.6989** | **0.7810** | **+0.0240** |
+On 200 samples: Stage 2 marginally better than Stage 1 on both metrics.
+#### Full evaluation (Unseen, 1656 samples)
+| Method | mIoU | F | Δ mIoU vs Baseline |
+|--------|------|---|---------------------|
+| Baseline | 0.6990 | 0.7924 | — |
+| Best-of-2 ES | 0.7030 | 0.7953 | +0.0040 (+0.57%) |
+| LTPO-ES | 0.7055 | 0.7969 | +0.0065 (+0.93%) |
+| **q-LTPO Stage 1** | **0.7285** | **0.8013** | **+0.0295 (+4.22%)** |
+| q-LTPO Stage 2 | 0.7273 | 0.8002 | +0.0283 (+4.04%) |
+**Stage 1 beats Stage 2 on full eval** (opposite of 200-sample trend). R_align_det adds noise at scale: in harder Unseen samples, the initial mask quality is lower, making stopgrad z_in/z_out a less reliable target.
+### Evaluation Status (after e0 fix)
+| Split | Baseline mIoU/S | q-LTPO S1 (no e0) | q-LTPO S1 (e0) | Status |
+|-------|-----------------|-------------------|----------------|--------|
+| Unseen (1656) | 0.6990 | **0.7285** | — | Done (pre-e0) |
+| Seen (200-sample) | 0.7483 | 0.7618 (+0.0136) | **0.7634 (+0.0151)** | Quick-val done |
+| Null (200-sample, S↓) | 0.0619 | 0.0646 (+4.4%) | **0.0634 (+2.4%)** | Quick-val done |
+| Unseen (200-sample) | 0.6761 | — | **0.6929 (+0.0168)** | Quick-val done |
+| Seen (full) | — | — | — | Pending |
+| Null (full, S↓) | 0.0120 | 0.0126 (+5.0%) | — | Pending e0 run |
+| Unseen (full) | — | — | — | Pending |
+---
+## Null Safety Analysis and e0-Modulated Reward
+### Root Cause: R_iou_pred is a Conditional Quality Metric
+The original q-LTPO Stage 1 reward:
+```
+R_task = 0.6·R_iou_pred − 0.2·R_area_soft
+```
+caused Null S metric degradation (+4.4% on 200-sample quick validation, +5.0% on full Null).
+**Root cause**: `R_iou_pred` is SAM's internal mask quality head — it measures *how good the mask is given that segmentation was performed*, not *whether the target exists*. On Null frames, SAM still outputs `R_iou_pred ≈ 0.73–0.74` because it confidently segments the most prominent region (even if no audio target exists). The optimizer sees positive `R_iou_pred` and expands the mask accordingly.
+**Why oracle gating approaches fail methodologically:**
+- **Path A (gate_delta threshold)**: Distribution analysis showed Null reward_gain p50 = +0.0166 ≈ Seen p50 = +0.0181. The two distributions overlap heavily; any threshold that blocks most Null samples also blocks most Seen/Unseen samples.
+- **Path B (area-based reject rule)**: The threshold of 0.02 (area fraction) was derived by observing Null mean_area = 0.0094 vs Seen mean_area = 0.054 in the test distribution. This is benchmark-specific tuning, i.e., test-set overfitting. **Not a valid method.**
+Both oracle approaches are useful for diagnostic analysis only. The principled fix must be structural.
+### Principled Fix: e0-Modulated Reward
+**Key insight**: decouple *existence* from *quality*. Use the initial mask area as a proxy for the prior probability that a real target exists.
+```python
+e0 = stopgrad( sigmoid(lrm_init / area_temp).mean() )   # R_area_soft at q_init
+R_task = λ_iou · e0 · R_iou_pred  −  λ_area · R_area_soft
+```
+**Why stopgrad on e0 is critical:**
+- Without stopgrad: gradients flow through e0 → optimizer first inflates area to increase e0, then uses the higher e0 to justify larger R_iou reward ("area gaming").
+- With stopgrad: e0 is a fixed scalar from the initialization. Gradients only flow through the explicit terms `R_iou_pred` and `R_area_soft`.
+**Effect by split:**
+| Split | mean e0 | Effective λ_iou = 0.6·e0 | Behavior |
+|-------|---------|--------------------------|----------|
+| Null | 0.037 | 0.022 | Area penalty dominates → conservative |
+| Seen | 0.120 | 0.072 | Balanced optimization |
+| Unseen | 0.150 | 0.090 | Full optimization drive |
+The 3.2–4.1× e0 ratio relative to Null (Seen/Null ≈ 3.2×, Unseen/Null ≈ 4.1×) arises naturally from the initial mask size, providing automatic split-specific optimization strength without any threshold tuning.
+**Implementation fix also addressed (best_q tracking bug):**
+Before fix, `q_{N+1}` (post-step) was evaluated using `lrm/iou` from `q_N` (pre-step), corrupting best_q selection. Fixed by adding a fresh `no_grad` forward after each `optimizer.step()`.
+### Quick Validation Results (200 samples each, e0 modulation)
+#### Null split (S metric, lower is better)
+| Method | S metric | Δ relative |
+|--------|----------|-----------|
+| Baseline | 0.0619 | — |
+| q-LTPO S1 (no e0) | 0.0646 | +4.4% |
+| **q-LTPO S1 (e0)** | **0.0634** | **+2.4%** |
+Diagnostic stats with e0:
+```
+acceptance rate      : 1.000
+mean e0              : 0.0372
+reward_gain p10/50/90: 0.0 / 0.0000 / +0.0123   ← p50=0 means at least half of samples frozen
+mean drift           : 0.4962                    ← down from ~0.8 without e0
+area (hard) init→best: 0.0094 → 0.0098           ← minimal area expansion
+reward↑ & area+20%↑  : 0.040                     ← low Null-safety risk
+```
+#### Seen split (mIoU, higher is better)
+| Method | mIoU | F | Δ mIoU |
+|--------|------|---|--------|
+| Baseline | 0.7483 | — | — |
+| q-LTPO S1 (no e0) | 0.7618 | — | +0.0136 |
+| **q-LTPO S1 (e0)** | **0.7634** | — | **+0.0151** |
+Diagnostic stats with e0:
+```
+mean e0              : 0.1200
+reward_gain p10/50/90: +0.0026 / +0.0181 / +0.0944
+mean drift           : 0.5225
+area (hard) init→best: 0.054 → (slight increase)
+```
+#### Unseen split (mIoU, higher is better)
+| Method | mIoU | F | Δ mIoU |
+|--------|------|---|--------|
+| Baseline | 0.6761 | 0.7776 | — |
+| **q-LTPO S1 (e0)** | **0.6929** | **0.7765** | **+0.0168** |
+Diagnostic stats with e0:
+```
+acceptance rate      : 1.000
+mean e0              : 0.1506
+reward_gain p10/50/90: +0.0011 / +0.0055 / +0.0293
+mean drift           : 0.6666
+R_iou_pred init→best : 0.8029 → 0.8802
+area (hard) init→best: 0.0635 → 0.0650
+reward↑ & area+20%↑  : 0.125
+```
+### Analysis: e0 is a Pareto Improvement
+Three conditions for Pareto improvement all satisfied on quick validation:
+1. **Null safer**: degradation roughly halved (+4.4% → +2.4%). p50 reward_gain = 0.0000, meaning at least half of Null samples produce `best_q ≈ q_init`.
+2. **Seen maintained and slightly improved**: +0.0151 vs +0.0136 without e0.
+3. **Unseen not hurt — gains even larger**: +0.0168 > Seen +0.0151. The "harder positives suppressed" failure mode did not materialize.
+**e0 hierarchy confirms split-level discriminability:**
+```
+Null (0.037)  <<  Seen (0.120)  <  Unseen (0.150)
+```
+The ordering is sensible: Null frames have small/empty initial masks → low e0. Unseen e0 slightly exceeds Seen, possibly because the model produces slightly larger (less specific) masks on novel object-sentence combinations.
+**Residual Null degradation (+2.4%) assessment**: Acceptable for now. The absolute magnitude is +0.0015 in S metric, while Seen/Unseen absolute gains are 10–11× larger. The residual originates from a small tail of Null samples where e0 is still large enough to permit some mask expansion. Further suppression (e.g., e0², sqrt(e0+ε)) risks hurting harder positives and should only be explored after full-set confirmation.
+---
+## Summary and Comparison
+### Pre-e0 (original q-LTPO Stage 1, full Unseen)
+| Method | Unseen mIoU | Δ vs Baseline | Relative to ES-LTPO |
+|--------|-------------|---------------|----------------------|
+| Baseline | 0.6990 | — | — |
+| ES-LTPO | 0.7055 | +0.0065 | 1× |
+| **q-LTPO Stage 1** | **0.7285** | **+0.0295** | **4.5×** |
+### e0-Modulated Stage 1 (quick validation, 200 samples)
+| Split | Baseline | e0-Stage1 | Δ | e0 |
+|-------|----------|-----------|---|-----|
+| Null (S↓) | 0.0619 | 0.0634 | +2.4% (rel) | 0.037 |
+| Seen | 0.7483 | 0.7634 | +0.0151 | 0.120 |
+| Unseen | 0.6761 | 0.6929 | +0.0168 | 0.150 |
+q-LTPO-autograd with e0 modulation is the current primary method candidate. It achieves first-order gradient-based optimization with automatic Null-safety via the initial-area existence prior, without any test-set-derived thresholds.
+---
+## Hyperparameter Configurations
+### ES-LTPO (Method 1)
+```python
+LTPOConfig(
+    T=5, num_anchors=4,
+    sigma_schedule=[0.10, 0.08, 0.06, 0.04, 0.02],
+    eta_scale=0.5,
+    lambda1=0.3, lambda2=0.4, lambda3=1.0, lambda4=0.3,
+    beta=0.5, gate_delta=0.0, trust_delta=None,
+)
+```
+### q-LTPO Stage 1 with e0 (current primary candidate)
+```python
+QLTPOConfig(
+    stage=1, T=5, num_anchors=4,
+    lr=0.0,              # auto: 0.01 × RMS(q_init)
+    max_drift=0.0,       # auto: 0.5 × ||q_init||
+    lambda_iou=0.6, lambda_area=0.2,
+    lambda_reg=0.01, area_temp=5.0,
+    gate_delta=0.0,
+    e0_modulation="identity",   # e0 = R_area_soft(q_init), stopgrad
+    e0_eps=1e-4,
+    # oracle-only fields (disabled, not used in final method):
+    null_area_threshold=0.02,
+    null_gate_delta=0.0,
+)
+```
+---
+## Next Steps
+### Immediate (full-set confirmation)
+Run full evaluations with e0-modulated Stage 1 to confirm quick-validation trends at scale:
+```bash
+# Full Null (~30 min) — expect S ≈ 0.0120 + small increase, less than +5%
+TRANSFORMERS_OFFLINE=1 python -W ignore load_model.py --eval_split test_n
+# Full Seen (~35 min) — expect mIoU gain ≥ +0.013
+TRANSFORMERS_OFFLINE=1 python -W ignore load_model.py --eval_split test_s
+# Full Unseen (~35 min) — expect mIoU gain ≥ +0.025 (pre-e0 Stage 1 gained +0.0295 over baseline)
+TRANSFORMERS_OFFLINE=1 python -W ignore load_model.py --eval_split test_u
+```
+**Decision criteria to promote e0-Stage1 to final method:**
+- Null S degradation < 5% relative (full set)
+- Seen mIoU gain ≥ +0.012
+- Unseen mIoU gain ≥ +0.022
+### If full-set confirms (future work)
+1. **F-score improvement (Stage 3)**: Current gain is mainly in mIoU (overlap); F-score (boundary precision/recall) lags. Candidate: boundary-oriented reward using SAM's low-res logit gradient sharpness or contour consistency across anchor frames.
+2. **Stronger e0 suppression ablation**: Test `e0_modulation="sqrt"` (g(e0) = sqrt(e0+ε)) to further compress Null tail. Only justified if full-set Null degradation exceeds 5%.
+3. **Stage 2 revisit**: R_align_det hurt at scale due to noisy z_in/z_out from low-quality initial masks. Possible fix: gate align signal by `R_iou_pred > 0.85` to only use it when initial mask is reliable.

load_model.py ADDED Viewed

	@@ -0,0 +1,877 @@

+# Compatibility: transformers==4.30.2 calls hf_hub_download(use_auth_token=...),
+# removed in huggingface_hub>=0.20.  Patch before importing transformers so the
+# bound reference inside transformers.utils.hub picks up the fixed version.
+import huggingface_hub as _hfhub
+# Keep a handle on the unpatched function so the shim below can delegate to it.
+_hfhub_orig = _hfhub.hf_hub_download
+def _hfhub_compat(*args, use_auth_token=None, token=None, **kwargs):
+    """Call the original ``hf_hub_download``, folding the legacy ``use_auth_token`` keyword into ``token``.
+
+    A truthy ``token`` takes precedence; otherwise ``use_auth_token`` is
+    forwarded as ``token``. All other arguments pass through unchanged.
+    """
+    return _hfhub_orig(*args, token=token or use_auth_token, **kwargs)
+# Install the shim on the module attribute before ``transformers`` is imported.
+_hfhub.hf_hub_download = _hfhub_compat
+import transformers
+from torch.cuda.amp import autocast, GradScaler
+from datasets import REFAVS
+from configs import args
+from torch.utils.data import DataLoader
+from functools import partial
+from models.llava import conversation as conversation_lib
+# from  models.avs_model import VISAForCausalLM
+from  models.avs_model import Simtoken_ForCausalLM
+import torch
+from torch.cuda import amp
+from transformers import AutoConfig
+from peft import LoraConfig, get_peft_model
+from torch import optim
+from torch.optim import AdamW
+from transformers import get_cosine_schedule_with_warmup
+from tqdm import tqdm
+from utils import utility
+import random
+import numpy as np
+import re
+import time
+import os
+from PIL import Image
+import warnings
+from utils.metric.utility import mask_iou
+warnings.filterwarnings("ignore")
+from transformers import logging
+logging.set_verbosity_error()
+# Value written into label positions that collate_fn masks out
+# (presumably the ignore index of the training loss; confirm in the model code).
+IGNORE_INDEX = -100
+# Default placeholder id that tokenizer_image_audio_token inserts for an
+# "<image>" tag (and, repeated num_frames times, for a "<video>" tag).
+IMAGE_TOKEN_INDEX = -200
+# Literal tag strings; not referenced in the part of the file reviewed here.
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+# Default placeholder id that tokenizer_image_audio_token inserts for an "<audio>" tag.
+AUDIO_TOKEN_INDEX = -300
+DEFAULT_AUDIO_TOKEN = "<audio>"
+def set_seed(seed=42):
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+def dict_to_cuda(input_dict):
+    for k, v in input_dict.items():
+        if isinstance(input_dict[k], torch.Tensor):
+            input_dict[k] = v.cuda(non_blocking=True)
+        elif (
+                isinstance(input_dict[k], list)
+                and len(input_dict[k]) > 0
+                and isinstance(input_dict[k][0], torch.Tensor)
+        ):
+            input_dict[k] = [ele.cuda(non_blocking=True) for ele in v]
+    return input_dict
+def tokenizer_image_audio_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, audio_token_index=AUDIO_TOKEN_INDEX, num_frames=10, return_tensors=None):
+    """Tokenize ``prompt`` and insert placeholder ids for ``<image>``/``<audio>``/``<video>`` tags.
+
+    The prompt is split on the three tags; the text pieces are tokenized with
+    ``tokenizer`` and each tag becomes ``image_token_index`` (``<image>``),
+    ``audio_token_index`` (``<audio>``) or ``num_frames`` copies of
+    ``image_token_index`` (``<video>``).
+
+    Returns a list of ids, or a ``torch.long`` tensor when
+    ``return_tensors == "pt"``. Any other non-None ``return_tensors`` raises
+    ``ValueError``.
+    """
+    # The capture group keeps the tags themselves in the split result.
+    prompt_chunks = re.split(r'(<image>|<audio>|<video>)', prompt)
+    # Drop the empty strings re.split produces around adjacent/leading/trailing tags.
+    prompt_chunks = [chunk for chunk in prompt_chunks if chunk]
+    # Separate the chunks into plain text and tag kinds (relative order within each list is kept).
+    text_chunks = []  # text
+    token_types = []  # <image>/<audio>/<video>
+    for chunk in prompt_chunks:
+        if chunk == "<image>":
+            token_types.append("image")
+        elif chunk == "<audio>":
+            token_types.append("audio")
+        elif chunk == "<video>":
+            token_types.append("video")
+        else:
+            text_chunks.append(chunk)
+    # Tokenize the text
+    tokenized_chunks = [tokenizer(chunk).input_ids for chunk in text_chunks]
+    def insert_separators(text_chunks, tokenized_chunks, token_types, image_token_index, audio_token_index, num_frames):
+        """Interleave tokenized text chunk i with the placeholder for tag i."""
+        input_ids = []
+        offset = 0
+        # If the first chunk starts with BOS, keep one BOS up front and skip the
+        # first id of every chunk (assumes the tokenizer prepends BOS to each chunk - TODO confirm).
+        if (
+                len(tokenized_chunks) > 0
+                and len(tokenized_chunks[0]) > 0
+                and tokenized_chunks[0][0] == tokenizer.bos_token_id
+        ):
+            offset = 1
+            input_ids.append(tokenized_chunks[0][0])
+        min_length = min(len(text_chunks), len(token_types))
+        # NOTE(review): text chunk i is always emitted BEFORE placeholder i, whatever
+        # their order was in the prompt, and tags beyond the number of text chunks are
+        # dropped. Presumably this matches the training-time tokenization; verify before changing.
+        for i in range(min_length):
+            input_ids.extend(tokenized_chunks[i][offset:])
+            if token_types[i] == "image":
+                input_ids.append(image_token_index)
+            elif token_types[i] == "audio":
+                input_ids.append(audio_token_index)
+            elif token_types[i] == "video":
+                input_ids.extend([image_token_index] * num_frames)
+        # At most one trailing text chunk (the one right after the paired ones) is appended.
+        if len(text_chunks) > min_length:
+            input_ids.extend(tokenized_chunks[min_length][offset:])
+        return input_ids
+    input_ids = insert_separators(text_chunks, tokenized_chunks, token_types, image_token_index, audio_token_index, num_frames)
+    if return_tensors is not None:
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return input_ids
+def collate_fn(batch, tokenizer=None):
+    vids = []
+    images = []
+    image_clips = []
+    masks = []
+    conversations = []
+    audio_feats = []
+    image_feats = []
+    resizes = []
+    orgsizes = []
+    first_refs = []
+    refs = []
+    first_refs = []
+    refs_num = []
+    fids = []
+    for data in batch:
+        vids.append(data['vid'])
+        images.append(data['image'])
+        image_clips.append(data['img_clip'])
+        masks.append(data['mask'])
+        conversations.append(data['conversation'])
+        audio_feats.append(data['feat_aud'])
+        resizes.append(data['resize'])
+        orgsizes.append(data['orgsize'])
+        image_feats.append(data['feat_sam'])
+        refs_num.append(len(data['ref']))
+        fids.append(data['fids'])
+        refs.append(data['ref'])
+        first_refs.append(data['ref'][0])
+    input_ids = [tokenizer_image_audio_token(conv, tokenizer, return_tensors="pt") for conv in conversations]  # list
+    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_masks = input_ids.ne(tokenizer.pad_token_id)
+    ref_ids = [tokenizer_image_audio_token(ref, tokenizer, return_tensors="pt") for ref in first_refs]
+    conv = conversation_lib.default_conversation.copy()
+    labels = input_ids.clone()
+    sep = 'Sure, it is [SEG]'
+    for conversation, target in zip(conversations, labels):
+        parts = conversation.split(sep)
+        cur_len = 1
+        target[:cur_len] = IGNORE_INDEX
+        sep_len = len(tokenizer_image_audio_token(sep, tokenizer)) - 1
+        for i in range(len(parts)-1):
+            part_len = len(tokenizer_image_audio_token(parts[i], tokenizer)) - 2
+            target[cur_len: cur_len + part_len] = IGNORE_INDEX
+            cur_len += part_len + sep_len
+        target[cur_len:] = IGNORE_INDEX
+    return {"vids": vids,
+            "images": images,  # list[B]:[T, 3, 1024, 1024]
+            "images_clip": image_clips,  # list[B]:[T, 3, 224, 224]
+            "masks": masks,  # list[B]:[num_ref, T, H, W]
+            "convs": conversations,  # list[B]: str
+            "input_ids": input_ids,  # list[B]:[max_len]
+            "attention_masks": attention_masks,  # list[B]:[max_len]
+            "labels": labels,  # list[B]:[max_len]
+            "audio_feats": audio_feats,  # list[B]:[10, 128]
+            "resizes": resizes,  # list[B]
+            "orgsizes": orgsizes,  # list[B]
+            "image_feats": image_feats,
+            "ref_ids": ref_ids,  # list[B]: [ref_id_len]
+            "refs_num": refs_num,
+            "fids": fids,
+            "refs": refs,
+    }
+import torch.multiprocessing as mp
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    set_seed(42)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        args.mllm,
+        cache_dir=None,
+        model_max_length=2048,  # 2048
+        padding_side="right",
+        use_fast=False,
+    )
+    tokenizer.pad_token = tokenizer.unk_token
+    num_added_tokens = tokenizer.add_tokens("[SEG]")
+    seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]  # 32000
+    print("seg_token_idx: ", seg_token_idx)
+    _split = args.eval_split
+    _dataset = REFAVS(_split, args, tokenizer, input_type='refer')
+    _dataloader = DataLoader(_dataset, batch_size=1, shuffle=False, num_workers=0, collate_fn=partial(collate_fn, tokenizer=tokenizer))
+    model_args = {
+        "train_mask_decoder": True,
+        "out_dim": 256,  # 256
+        "ce_loss_weight": 1.0,
+        "dice_loss_weight": 0.5,
+        "bce_loss_weight": 2.0,
+        "seg_token_idx": seg_token_idx,
+        "vision_pretrained": args.vision_pretrained,  # sam_vit_h_xxx.pth
+        "vision_tower": args.vision_tower,
+        "use_im_start_end": False,
+        "compress": args.compress,
+        "start": args.start,
+    }
+    # model = Simtoken_ForCausalLM.from_pretrained(args.mllm, torch_dtype=torch.float32, low_cpu_mem_usage=True, **model_args)
+    model = Simtoken_ForCausalLM.from_pretrained(args.mllm, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
+                                                 **model_args)
+    print("\nmodel loaded")
+    model.config.eos_token_id = tokenizer.eos_token_id
+    model.config.bos_token_id = tokenizer.bos_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id
+    model.enable_input_require_grads()
+    model.gradient_checkpointing_enable()
+    model.get_model().initialize_vision_modules(model.get_model().config)
+    vision_tower = model.get_model().get_vision_tower()
+    vision_tower.to(dtype=torch.float32, device="cuda")
+    model_args_from_pt = AutoConfig.from_pretrained(args.mllm)
+    model_args_from_pt.use_cluster = True
+    model_args_from_pt.freeze = False
+    model_args_from_pt.mm_tune = True
+    model_args_from_pt.spatial_cluster_rate0 = 64
+    model_args_from_pt.spatial_cluster_rate1 = 32
+    model_args_from_pt.spatial_cluster_rate2 = 16
+    model_args_from_pt.temporal_cluster_rate = 0.0625
+    model_args_from_pt.use_cluster = True
+    model_args_from_pt.vision_tune = False
+    model.get_model().initialize_cluster_modules(model_args_from_pt)
+    model.get_model().initialize_lisa_modules(model.get_model().config)
+    for p in vision_tower.parameters():
+        p.requires_grad = False
+    for p in model.get_model().mm_projector.parameters():
+        p.requires_grad = False
+    lora_r = 8
+    target_modules = "q_proj,v_proj"
+    if lora_r > 0:
+        def find_linear_layers(model, lora_target_modules):
+            cls = torch.nn.Linear
+            lora_module_names = set()
+            for name, module in model.named_modules():
+                if (
+                        isinstance(module, cls)
+                        and all(
+                    [
+                        x not in name
+                        for x in [
+                        "visual_model",
+                        "vision_tower",
+                        "mm_projector",
+                        "text_hidden_fcs",
+                        "audio_feature_layer",
+                    ]
+                    ]
+                )
+                        and any([x in name for x in lora_target_modules])
+                ):
+                    lora_module_names.add(name)
+            return sorted(list(lora_module_names))
+        lora_alpha = 16
+        lora_dropout = 0.05
+        lora_target_modules = find_linear_layers(
+            model, target_modules.split(",")
+        )
+        lora_config = LoraConfig(
+            r=lora_r,
+            lora_alpha=lora_alpha,
+            target_modules=lora_target_modules,
+            lora_dropout=lora_dropout,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+        model = get_peft_model(model, lora_config)
+        print("\nLora deployed")
+        model.print_trainable_parameters()
+    model = model.to("cuda")
+    model.resize_token_embeddings(len(tokenizer))
+    model.load_state_dict(torch.load(args.saved_model), strict=False)
+    print("saved model loaded")
+    save_root = args.visualization_root
+    def visualization(model, dataloader, save_root, name):
+        save_root = os.path.join(save_root, name)
+        os.makedirs(save_root, exist_ok=True)
+        print(f"save_root: {save_root}")
+        model.eval()
+        for batch in tqdm(dataloader, desc=f"Visualization on {name} "):
+            input_dict = dict_to_cuda(batch)
+            with torch.no_grad():
+                output_dict = model.forward(images=input_dict["images"],
+                                            images_clip=input_dict["images_clip"],
+                                            audio_features=input_dict["audio_feats"],
+                                            image_features=input_dict["image_feats"],
+                                            input_ids=input_dict["input_ids"],
+                                            labels=input_dict["labels"],
+                                            attention_masks=input_dict["attention_masks"],
+                                            masks_list=input_dict["masks"],
+                                            resize_list=input_dict["resizes"],
+                                            orgsize_list=input_dict["orgsizes"],
+                                            conversation_list=input_dict["convs"],
+                                            refs_num=input_dict["refs_num"],
+                                            fids=input_dict["fids"],
+                                            vids=input_dict["vids"],
+                                            contrast=args.ct_weight,
+                                            ref_ids=input_dict["ref_ids"],
+                                            inference=True)
+            pred_masks = output_dict["pred_masks"]  # list[B]:[num_seg, T, H, W]
+            gt_masks = output_dict["gt_masks"]  # list[B]:[num_seg, T, H, W]
+            for b in range(len(pred_masks)):
+                sample = torch.sigmoid(pred_masks[b])  # [num_seg, T, H, W]
+                vid = input_dict["vids"][b]
+                vid_root = os.path.join(save_root, vid)
+                os.makedirs(vid_root, exist_ok=True)
+                # print("vid_root:", vid_root)
+                binary_sample = (sample > 0.4).to(torch.uint8)
+                num_seg, T, H, W = sample.shape
+                for seg_idx in range(num_seg):
+                    ref = input_dict["refs"][b][seg_idx]
+                    ref_root = os.path.join(vid_root, ref)
+                    os.makedirs(ref_root, exist_ok=True)
+                    # print("ref_root:", ref_root)
+                    for t in range(T):
+                        mask_np = binary_sample[seg_idx, t].cpu().numpy() * 255
+                        mask_img = Image.fromarray(mask_np.astype(np.uint8))
+                        save_path = os.path.join(ref_root, f"frame{t}.png")
+                        mask_img.save(save_path)
+                        # print(f"image saved as {save_path}")
+        print("visualization finished")
+    def valuate(model, dataloader, name, max_rows=-1):
+        model.eval()
+        total_iou = 0
+        total_fscore = 0
+        count = 0
+        _total = min(max_rows, len(dataloader)) if max_rows > 0 else len(dataloader)
+        for i, batch in enumerate(tqdm(dataloader, desc=f"Evaluating on {name}", total=_total)):
+            if 0 < max_rows <= i:
+                break
+            input_dict = dict_to_cuda(batch)
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
+                with torch.no_grad():
+                    output_dict = model.forward(images=input_dict["images"],
+                                                images_clip=input_dict["images_clip"],
+                                                audio_features=input_dict["audio_feats"],
+                                                image_features=input_dict["image_feats"],
+                                                input_ids=input_dict["input_ids"],
+                                                labels=input_dict["labels"],
+                                                attention_masks=input_dict["attention_masks"],
+                                                masks_list=input_dict["masks"],
+                                                resize_list=input_dict["resizes"],
+                                                orgsize_list=input_dict["orgsizes"],
+                                                conversation_list=input_dict["convs"],
+                                                refs_num=input_dict["refs_num"],
+                                                fids=input_dict["fids"],
+                                                vids=input_dict["vids"],
+                                                contrast=args.ct_weight,
+                                                ref_ids=input_dict["ref_ids"],
+                                                inference=True)
+            pred_masks = output_dict["pred_masks"]  # list[B]:[num_seg, T, H, W]
+            gt_masks = output_dict["gt_masks"]  # list[B]:[num_seg, T, H, W]
+            for i in range(len(pred_masks)):
+                num_seg = pred_masks[i].shape[0]
+                T = pred_masks[i].shape[1]
+                iou = utility.mask_iou(pred_masks[i], gt_masks[i])
+                fscore = utility.Eval_Fmeasure(pred_masks[i], gt_masks[i], None)
+                total_iou += iou * num_seg * T
+                total_fscore += fscore * num_seg * T
+                count += num_seg * T
+        print(f"\n  valuate on {name}:  miou: {total_iou/count}  fscore: {total_fscore/count}")
+    def valuate_Null(model, dataloader, max_rows=-1):
+        model.eval()
+        total_metric = 0
+        count = 0
+        _total = min(max_rows, len(dataloader)) if max_rows > 0 else len(dataloader)
+        for i, batch in enumerate(tqdm(dataloader, desc=f"Evaluating on Null", total=_total)):
+            if 0 < max_rows <= i:
+                break
+            input_dict = dict_to_cuda(batch)
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
+                with torch.no_grad():
+                    output_dict = model.forward(images=input_dict["images"],
+                                                images_clip=input_dict["images_clip"],
+                                                audio_features=input_dict["audio_feats"],
+                                                image_features=input_dict["image_feats"],
+                                                input_ids=input_dict["input_ids"],
+                                                labels=input_dict["labels"],
+                                                attention_masks=input_dict["attention_masks"],
+                                                masks_list=input_dict["masks"],
+                                                resize_list=input_dict["resizes"],
+                                                orgsize_list=input_dict["orgsizes"],
+                                                conversation_list=input_dict["convs"],
+                                                refs_num=input_dict["refs_num"],
+                                                fids=input_dict["fids"],
+                                                vids=input_dict["vids"],
+                                                contrast=args.ct_weight,
+                                                ref_ids=input_dict["ref_ids"],
+                                                inference=True)
+            pred_masks = output_dict["pred_masks"]  # list[B]:[num_seg, T, H, W]
+            gt_masks = output_dict["gt_masks"]  # list[B]:[num_seg, T, H, W]
+            for i in range(len(pred_masks)):
+                num_seg = pred_masks[i].shape[0]
+                T = pred_masks[i].shape[1]
+                null_metric = utility.metric_s_for_null(pred_masks[i])
+                total_metric += null_metric * num_seg * T
+                count += num_seg * T
+        print(f"\n  valuate on test_n_refer, metric: {total_metric / count}")
+    from seg_ltpo import (
+        LTPOConfig, ltpo_optimize, best_of_2_optimize, decode_full_video,
+        get_sam_model, get_anchor_indices,
+        QLTPOConfig, q_ltpo_autograd, check_grad_connectivity,
+        reset_q_ltpo_stats, get_q_ltpo_stats,
+    )
+    def print_q_ltpo_stats(name: str) -> None:
+        stats = get_q_ltpo_stats()
+        if not stats:
+            return
+        n = len(stats)
+        acc_rate       = sum(s["accepted"]         for s in stats) / n
+        mean_gain      = sum(s["reward_gain"]       for s in stats) / n
+        mean_drift     = sum(s["drift"]             for s in stats) / n
+        clip_rate      = sum(s["hit_clip"]          for s in stats) / n
+        mean_iou_init  = sum(s["R_iou_pred_init"]   for s in stats) / n
+        mean_iou_best  = sum(s["R_iou_pred_best"]   for s in stats) / n
+        mean_area_init = sum(s["area_hard_init"]    for s in stats) / n
+        mean_area_best = sum(s["area_hard_best"]    for s in stats) / n
+        # Null safety: reward improved but predicted area grew >20 %
+        null_risk = sum(
+            1 for s in stats
+            if s["reward_gain"] > 0 and s["area_hard_best"] > s["area_hard_init"] * 1.2
+        ) / n
+        gains = sorted(s["reward_gain"] for s in stats)
+        def _pct(v, p): return v[max(0, int(len(v) * p / 100) - 1)]
+        mean_e0 = sum(s["e0"] for s in stats) / n
+        print(f"\n  [q-LTPO stats | {name} | n={n}]")
+        print(f"    acceptance rate      : {acc_rate:.3f}")
+        print(f"    mean e0 (exist prior): {mean_e0:.4f}  ← should differ Null vs Seen")
+        print(f"    mean reward gain     : {mean_gain:+.4f}")
+        print(f"    reward_gain p10/50/90: {_pct(gains,10):+.4f} / {_pct(gains,50):+.4f} / {_pct(gains,90):+.4f}")
+        print(f"    mean drift ‖q−q₀‖   : {mean_drift:.4f}")
+        print(f"    hit-clip ratio       : {clip_rate:.3f}")
+        print(f"    R_iou_pred init→best : {mean_iou_init:.4f} → {mean_iou_best:.4f}")
+        print(f"    area (hard) init→best: {mean_area_init:.4f} → {mean_area_best:.4f}")
+        print(f"    reward↑ & area+20%↑  : {null_risk:.3f}  ← Null safety indicator")
+    def valuate_ltpo(model, dataloader, name, ltpo_cfg, optimize_fn=None, max_rows=-1):
+        if optimize_fn is None:
+            optimize_fn = ltpo_optimize
+        """
+        Evaluate with SEG-LTPO-simple test-time optimisation.
+        For each sample:
+          1. Run the standard SimToken forward pass once to get initial Fseg.
+          2. Optimise Fseg on 4 anchor frames using antithetic ES (5 steps).
+          3. Decode the full video with the best Fseg found.
+          4. Fall back to the original Fseg when reward gating rejects the update.
+        """
+        model.eval()
+        sam_model    = get_sam_model(model)
+        model_dtype  = torch.bfloat16
+        num_frames   = 10
+        anchor_indices = get_anchor_indices(num_frames, ltpo_cfg.num_anchors)
+        total_iou    = 0
+        total_fscore = 0
+        count        = 0
+        _total = min(max_rows, len(dataloader)) if max_rows > 0 else len(dataloader)
+        for i, batch in enumerate(tqdm(dataloader, desc=f"LTPO Evaluating on {name}", total=_total)):
+            if 0 < max_rows <= i:
+                break
+            input_dict = dict_to_cuda(batch)
+            # ── Step 1: standard forward pass (LLM + SAM decode) ──────────
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
+                with torch.no_grad():
+                    output_dict = model.forward(
+                        images=input_dict["images"],
+                        images_clip=input_dict["images_clip"],
+                        audio_features=input_dict["audio_feats"],
+                        image_features=input_dict["image_feats"],
+                        input_ids=input_dict["input_ids"],
+                        labels=input_dict["labels"],
+                        attention_masks=input_dict["attention_masks"],
+                        masks_list=input_dict["masks"],
+                        resize_list=input_dict["resizes"],
+                        orgsize_list=input_dict["orgsizes"],
+                        conversation_list=input_dict["convs"],
+                        refs_num=input_dict["refs_num"],
+                        fids=input_dict["fids"],
+                        vids=input_dict["vids"],
+                        contrast=args.ct_weight,
+                        ref_ids=input_dict["ref_ids"],
+                        inference=True,
+                    )
+            gt_masks          = output_dict["gt_masks"]         # list[B]:[num_seg, T, H, W]
+            seg_emb_list      = output_dict["seg_embeddings"]   # list[B]:[num_seg, 256]
+            for b in range(len(input_dict["images"])):
+                image_embeds_b = input_dict["image_feats"][b]   # [T, 256, 64, 64]
+                resize_b       = input_dict["resizes"][b]
+                orgsize_b      = input_dict["orgsizes"][b]
+                # Convert initial Fseg to float32 for stable optimisation.
+                # seg_emb_list[b]: [num_seg, 256] in bfloat16
+                F_init_b = seg_emb_list[b].detach().float()     # [num_seg, 256]
+                pred_masks_ltpo = []
+                for seg_idx in range(F_init_b.shape[0]):
+                    fseg_init = F_init_b[seg_idx : seg_idx + 1]  # [1, 256]
+                    # ── Step 2: optimisation (float32, outside autocast) ──────
+                    best_fseg = optimize_fn(
+                        fseg_init, image_embeds_b, anchor_indices,
+                        sam_model, model_dtype, ltpo_cfg,
+                    )  # [1, 256] float32
+                    # ── Step 3: decode full video with best Fseg ──────────────
+                    pred_mask = decode_full_video(
+                        best_fseg, image_embeds_b, sam_model,
+                        resize_b, orgsize_b, model_dtype,
+                    )  # [T, H, W]
+                    pred_masks_ltpo.append(pred_mask)
+                pred_masks_b = torch.stack(pred_masks_ltpo, dim=0)  # [num_seg, T, H, W]
+                num_seg = pred_masks_b.shape[0]
+                T_      = pred_masks_b.shape[1]
+                iou     = utility.mask_iou(pred_masks_b, gt_masks[b])
+                fscore  = utility.Eval_Fmeasure(pred_masks_b, gt_masks[b], None)
+                total_iou    += iou    * num_seg * T_
+                total_fscore += fscore * num_seg * T_
+                count        += num_seg * T_
+        print(f"\n  LTPO valuate on {name}:  miou: {total_iou/count:.4f}  fscore: {total_fscore/count:.4f}")
+    def valuate_ltpo_null(model, dataloader, ltpo_cfg, optimize_fn=None, max_rows=-1):
+        if optimize_fn is None:
+            optimize_fn = ltpo_optimize
+        """LTPO evaluation for Null split: measures S metric (lower = fewer false-positive masks)."""
+        model.eval()
+        sam_model      = get_sam_model(model)
+        model_dtype    = torch.bfloat16
+        num_frames     = 10
+        anchor_indices = get_anchor_indices(num_frames, ltpo_cfg.num_anchors)
+        total_metric = 0
+        count        = 0
+        _total = min(max_rows, len(dataloader)) if max_rows > 0 else len(dataloader)
+        for i, batch in enumerate(tqdm(dataloader, desc="LTPO Evaluating on Null", total=_total)):
+            if 0 < max_rows <= i:
+                break
+            input_dict = dict_to_cuda(batch)
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
+                with torch.no_grad():
+                    output_dict = model.forward(
+                        images=input_dict["images"],
+                        images_clip=input_dict["images_clip"],
+                        audio_features=input_dict["audio_feats"],
+                        image_features=input_dict["image_feats"],
+                        input_ids=input_dict["input_ids"],
+                        labels=input_dict["labels"],
+                        attention_masks=input_dict["attention_masks"],
+                        masks_list=input_dict["masks"],
+                        resize_list=input_dict["resizes"],
+                        orgsize_list=input_dict["orgsizes"],
+                        conversation_list=input_dict["convs"],
+                        refs_num=input_dict["refs_num"],
+                        fids=input_dict["fids"],
+                        vids=input_dict["vids"],
+                        contrast=args.ct_weight,
+                        ref_ids=input_dict["ref_ids"],
+                        inference=True,
+                    )
+            seg_emb_list = output_dict["seg_embeddings"]  # list[B]:[num_seg, 256]
+            for b in range(len(input_dict["images"])):
+                image_embeds_b = input_dict["image_feats"][b]
+                resize_b       = input_dict["resizes"][b]
+                orgsize_b      = input_dict["orgsizes"][b]
+                F_init_b       = seg_emb_list[b].detach().float()
+                pred_masks_ltpo = []
+                for seg_idx in range(F_init_b.shape[0]):
+                    fseg_init = F_init_b[seg_idx : seg_idx + 1]
+                    best_fseg = optimize_fn(
+                        fseg_init, image_embeds_b, anchor_indices,
+                        sam_model, model_dtype, ltpo_cfg,
+                    )
+                    pred_mask = decode_full_video(
+                        best_fseg, image_embeds_b, sam_model,
+                        resize_b, orgsize_b, model_dtype,
+                    )
+                    pred_masks_ltpo.append(pred_mask)
+                pred_masks_b = torch.stack(pred_masks_ltpo, dim=0)  # [num_seg, T, H, W]
+                num_seg      = pred_masks_b.shape[0]
+                T_           = pred_masks_b.shape[1]
+                null_metric  = utility.metric_s_for_null(pred_masks_b)
+                total_metric += null_metric * num_seg * T_
+                count        += num_seg * T_
+        print(f"\n  LTPO valuate on Null:  S metric: {total_metric/count:.4f}")
+    # ── Stage 0: gradient connectivity check ─────────────────────────────
+    # Loads one image_embed directly from disk — no dataloader, no gt_mask,
+    # no media frames required.  F_init is a unit-scale random vector that
+    # mimics the distribution of Fseg (SAM prompt embeddings are in ℝ^256
+    # with per-dim std ≈ 0.05–0.3; we use std=0.1 as a neutral initialisation).
+    def run_stage0_check():
+        import glob
+        sam_model   = get_sam_model(model)
+        model_dtype = torch.bfloat16
+        embed_files = sorted(glob.glob(os.path.join(args.data_dir, "image_embed", "*.pt")))
+        if not embed_files:
+            print("[Stage 0] ERROR: no .pt files found in data/image_embed/")
+            return False
+        img_embs = torch.load(embed_files[0], map_location="cuda")  # [T, 256, 64, 64]
+        if img_embs.dim() == 3:                                      # [256,64,64] → [1,256,64,64]
+            img_embs = img_embs.unsqueeze(0)
+        torch.manual_seed(42)
+        F_init = torch.randn(1, 256, device="cuda") * 0.1           # [1, 256] float32
+        anchors = get_anchor_indices(img_embs.shape[0], 4)
+        diag = check_grad_connectivity(F_init, img_embs, anchors, sam_model, model_dtype)
+        print("\n[Stage 0] Gradient connectivity check:")
+        print(f"  file used          : {os.path.basename(embed_files[0])}")
+        print(f"  gradient_connected : {diag['gradient_connected']}")
+        print(f"  grad_norm (step 0) : {diag['grad_norm_step0']:.6f}")
+        print(f"  reward trajectory  : {[f'{r:.4f}' for r in diag['reward_trajectory']]}")
+        return diag["gradient_connected"]
+    # ── Bypass equivalence test ───────────────────────────────────────────
+    # Three controlled tests to verify that fseg.unsqueeze(1) (bypass) is
+    # numerically equivalent to prompt_encoder(text_embeds=fseg.unsqueeze(1)):
+    #   Test 1 — dense_emb dtype: dense_A.to(bfloat16) vs dense_emb_bf16 (exact 0?)
+    #   Test 2 — matched-prec anchor decode: same decoder, same inputs, both bfloat16
+    #   Test 3 — full-video (all T frames) matched-prec decode
+    # If all pass, delta_bypass_init = 0 and the +4.22% is purely from optimization.
+    def run_bypass_test():
+        """Compare the prompt-encoder path with the direct ``fseg.unsqueeze(1)`` bypass.
+        Takes the first segmentation embedding of the first batch of
+        ``_dataloader``, decodes it through the SAM mask decoder via both
+        paths, and prints PASS/FAIL for each comparison followed by a verdict.
+        Nothing is returned.
+        """
+        from seg_ltpo import _precompute_dense_emb
+        sam_model   = get_sam_model(model)
+        pe          = sam_model.prompt_encoder
+        mask_dec    = sam_model.mask_decoder
+        model_dtype = torch.bfloat16
+        # Get one real Fseg via a standard forward pass on the first batch
+        batch = next(iter(_dataloader))
+        input_dict = dict_to_cuda(batch)
+        with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
+            with torch.no_grad():
+                output_dict = model.forward(
+                    images=input_dict["images"],
+                    images_clip=input_dict["images_clip"],
+                    audio_features=input_dict["audio_feats"],
+                    image_features=input_dict["image_feats"],
+                    input_ids=input_dict["input_ids"],
+                    labels=input_dict["labels"],
+                    attention_masks=input_dict["attention_masks"],
+                    masks_list=input_dict["masks"],
+                    resize_list=input_dict["resizes"],
+                    orgsize_list=input_dict["orgsizes"],
+                    conversation_list=input_dict["convs"],
+                    refs_num=input_dict["refs_num"],
+                    fids=input_dict["fids"],
+                    vids=input_dict["vids"],
+                    contrast=args.ct_weight,
+                    ref_ids=input_dict["ref_ids"],
+                    inference=True,
+                )
+        fseg         = output_dict["seg_embeddings"][0][0:1].detach()  # [1,256] bfloat16
+        image_embeds = input_dict["image_feats"][0]                    # [T,256,64,64] float32
+        device       = fseg.device
+        anchor_indices = get_anchor_indices(image_embeds.shape[0], 4)
+        img_anc        = image_embeds[anchor_indices]                          # [A,256,64,64] float32
+        dense_emb_bf16 = _precompute_dense_emb(sam_model, model_dtype, device)  # [1,256,64,64] bfloat16
+        dense_pe       = pe.get_dense_pe().to(device)                          # float32
+        def _decode(img, sparse_emb, dense_emb):
+            return mask_dec(
+                image_embeddings=img,
+                image_pe=dense_pe,
+                sparse_prompt_embeddings=sparse_emb,
+                dense_prompt_embeddings=dense_emb,
+                multimask_output=False,
+            )
+        def _check(label, tensor_a, tensor_b, exact=False):
+            err = (tensor_a.float() - tensor_b.float()).abs().max().item()
+            tol = 0.0 if exact else 1e-4
+            status = "PASS" if err <= tol else "FAIL"
+            print(f"  [{status}]  {label:50s}  max|A-B| = {err:.2e}")
+            return err <= tol
+        print(f"\n[Bypass Test]  fseg dtype={fseg.dtype}  norm={fseg.float().norm().item():.4f}")
+        with torch.no_grad():
+            # Get prompt_encoder outputs (called outside autocast → float32)
+            sparse_A, dense_A = pe(points=None, boxes=None, masks=None,
+                                   text_embeds=fseg.unsqueeze(1))
+            sparse_B = fseg.unsqueeze(1)   # bypass sparse: identical tensor
+            # ── Test 1: dense_emb dtype artifact ────────────────────────────────
+            # Hypothesis: dense_A (float32) and dense_emb_bf16 differ only because
+            # no_mask_embed.weight is float32; casting to bfloat16 should give exact 0.
+            print("\n  [Test 1] dense_emb dtype artifact (expected: exact 0)")
+            t1 = _check("dense_A.to(bfloat16) vs dense_emb_bf16",
+                        dense_A.to(torch.bfloat16), dense_emb_bf16, exact=True)
+            # ── Test 2: matched-precision decode on anchors ──────────────────────
+            # Both paths use bfloat16 sparse + bfloat16 dense.
+            # If sparse_emb is identical and dense_emb is identical (per Test 1),
+            # masks and iou_preds must be identical (same decoder, same inputs).
+            print("\n  [Test 2] matched-precision anchor decode (expected: exact 0)")
+            dense_A_bf16 = dense_A.to(model_dtype)
+            masks_A, iou_A = _decode(img_anc, sparse_A,  dense_A_bf16)
+            masks_B, iou_B = _decode(img_anc, sparse_B,  dense_emb_bf16)
+            _check("sparse_emb",                    sparse_A, sparse_B,  exact=True)  # printed only; result not part of the verdict
+            t2m = _check("masks   (anchors, matched prec)", masks_A,  masks_B,  exact=True)
+            t2i = _check("iou_preds (anchors, matched prec)", iou_A,  iou_B,    exact=True)
+            t2  = t2m and t2i
+            # ── Test 3: full-video bypass-init baseline (all T frames) ──────────
+            # Extend Test 2 to all T frames; quantifies delta_bypass_init over
+            # the complete video rather than just the 4 anchor frames.
+            print(f"\n  [Test 3] full-video matched-precision decode (T={image_embeds.shape[0]} frames)")
+            masks_full_A, _ = _decode(image_embeds, sparse_A,  dense_A_bf16)
+            masks_full_B, _ = _decode(image_embeds, sparse_B,  dense_emb_bf16)
+            t3 = _check("masks (all frames, matched prec)", masks_full_A, masks_full_B, exact=True)
+        print("\n  ── Verdict ──────────────────────────────────────────────────────")
+        if t1 and t2 and t3:
+            print("  ALL PASS — bypass is algebraically and numerically equivalent to")
+            print("  prompt_encoder path under matched precision.  delta_bypass_init = 0.")
+            print("  The +4.22% mIoU improvement is purely from q-LTPO optimization.")
+        else:
+            failures = []
+            if not t1: failures.append("Test 1 (dense dtype)")
+            if not t2: failures.append("Test 2 (anchor decode)")
+            if not t3: failures.append("Test 3 (full-video decode)")
+            print(f"  FAIL in: {', '.join(failures)}")
+            print("  delta_bypass_init ≠ 0; need per-sample mIoU comparison to quantify.")
+    # ── Run evaluation ────────────────────────────────────────────────────
+    # Dispatch on args.max_eval_rows and the split name:
+    #   max_rows == 0       → diagnostics only (Stage 0 + bypass test)
+    #   _split == 'test_n'  → Null-split baseline, then q-LTPO Stage 1
+    #   otherwise           → regular baseline, then q-LTPO Stage 1
+    # NOTE(review): ltpo_cfg and q_ltpo_cfg_s2 are not referenced by any branch
+    # below — confirm whether they are still needed.
+    ltpo_cfg   = LTPOConfig()
+    q_ltpo_cfg_s1 = QLTPOConfig(stage=1)
+    q_ltpo_cfg_s2 = QLTPOConfig(stage=2)
+    max_rows   = args.max_eval_rows     # -1 = all rows
+    # --max_eval_rows 0  → Stage 0 + bypass equivalence check, then exit
+    if max_rows == 0:
+        run_stage0_check()
+        run_bypass_test()
+    elif _split == 'test_n':
+        # Safety check: Baseline vs q-LTPO Stage 1 only.
+        # ES-LTPO / Stage 2 are omitted — ES is no longer the primary method,
+        # and Stage 2 consistently underperforms Stage 1.  If Stage 1 shows
+        # notable deterioration here, add a small Best-of-2 ES subset run to
+        # distinguish "reward unsafe on Null" from "autograd more aggressive".
+        valuate_Null(model, _dataloader, max_rows=max_rows)
+        # Clear the per-sample diagnostics so the printed stats cover this run only.
+        reset_q_ltpo_stats()
+        valuate_ltpo_null(model, _dataloader, q_ltpo_cfg_s1,
+                          optimize_fn=q_ltpo_autograd,     max_rows=max_rows)
+        print_q_ltpo_stats("null_q_ltpo_s1")
+    else:
+        # Baseline + q-LTPO Stage 1 only.  ES series omitted — q-autograd is
+        # the primary method; Stage 2 consistently underperforms Stage 1.
+        valuate(model, _dataloader, _split, max_rows=max_rows)
+        # Clear the per-sample diagnostics so the printed stats cover this run only.
+        reset_q_ltpo_stats()
+        valuate_ltpo(model, _dataloader, f'{_split}_q_ltpo_s1',  q_ltpo_cfg_s1,
+                     optimize_fn=q_ltpo_autograd,     max_rows=max_rows)
+        print_q_ltpo_stats(f'{_split}_q_ltpo_s1')

requirements.txt ADDED Viewed

	@@ -0,0 +1,76 @@

+accelerate==1.12.0
+anyio==4.12.1
+certifi==2026.1.4
+charset-normalizer==3.4.4
+click==8.3.1
+contourpy @ file:///croot/contourpy_1732540045555/work
+cuda-bindings==12.9.4
+cuda-pathfinder==1.2.2
+cycler @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_cycler_1764466758/work
+einops==0.8.2
+exceptiongroup==1.3.1
+filelock==3.20.0
+fonttools @ file:///home/conda/feedstock_root/build_artifacts/fonttools_1765632549069/work
+fsspec==2025.12.0
+h11==0.16.0
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.36.0
+idna==3.11
+Jinja2==3.1.6
+kiwisolver @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_kiwisolver_1762488741/work
+MarkupSafe==2.1.5
+matplotlib==3.10.8
+mpmath==1.3.0
+munkres==1.1.4
+networkx==3.4.2
+numpy @ file:///home/conda/feedstock_root/build_artifacts/numpy_1747544640217/work/dist/numpy-2.2.6-cp310-cp310-linux_x86_64.whl#sha256=d6d964caeef85d00073d27cd62b46883d275b3d8162f723f0fcabbd0b3cc3f9d
+nvidia-cublas-cu12==12.6.4.1
+nvidia-cuda-cupti-cu12==12.6.80
+nvidia-cuda-nvrtc-cu12==12.6.77
+nvidia-cuda-runtime-cu12==12.6.77
+nvidia-cudnn-cu12==9.10.2.21
+nvidia-cufft-cu12==11.3.0.4
+nvidia-cufile-cu12==1.11.1.6
+nvidia-curand-cu12==10.3.7.77
+nvidia-cusolver-cu12==11.7.1.2
+nvidia-cusparse-cu12==12.5.4.2
+nvidia-cusparselt-cu12==0.7.1
+nvidia-nccl-cu12==2.27.5
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvshmem-cu12==3.4.5
+nvidia-nvtx-cu12==12.6.77
+opencv-python==4.12.0
+opencv-python-headless==4.12.0
+packaging @ file:///home/task_176104877067765/conda-bld/packaging_1761049113113/work
+pandas @ file:///home/conda/feedstock_root/build_artifacts/pandas_1764614464206/work
+peft==0.2.0
+pillow @ file:///home/task_176252756305210/conda-bld/pillow_1762528238325/work
+psutil==7.2.2
+pyparsing @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_pyparsing_1769003998/work
+PySide6==6.9.2
+python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_python-dateutil_1751104122/work
+pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1742920838005/work
+PyYAML==6.0.3
+regex==2026.1.15
+requests==2.32.5
+safetensors==0.7.0
+sentencepiece==0.2.1
+shellingham==1.5.4
+shiboken6==6.9.2
+six @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_six_1753199211/work
+sympy==1.14.0
+timm==1.0.24
+tokenizers==0.13.3
+torch==2.10.0+cu126
+torchvision==0.25.0+cu126
+tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1666788589303/work
+tqdm==4.67.2
+transformers==4.30.2
+triton==3.6.0
+typer-slim==0.21.1
+typing_extensions==4.15.0
+tzdata @ file:///home/conda/feedstock_root/build_artifacts/python-tzdata_1765719872007/work
+unicodedata2 @ file:///home/conda/feedstock_root/build_artifacts/unicodedata2_1763054696114/work
+urllib3==2.6.3

save_audio_feats.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os.path
+import pandas as pd
+from towhee import pipe, ops
+import torch
+from configs import args
+import torchaudio
+import tempfile
+def preprocess_audio_to_mono(input_path, target_sr=16000, keep_original_format=True):
+    waveform, sample_rate = torchaudio.load(input_path)
+    original_encoding = torchaudio.info(input_path).encoding
+    if waveform.shape[0] > 1:
+        waveform = waveform[:1, :]
+    temp_fd, temp_path = tempfile.mkstemp(suffix='.wav')
+    os.close(temp_fd)
+    if keep_original_format and original_encoding == "PCM_S":
+        waveform = (waveform * 32767).to(torch.short)  # float -> int16
+        torchaudio.save(temp_path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
+    else:
+        torchaudio.save(temp_path, waveform, sample_rate)
+    return temp_path
+audio_vggish_pipeline = (  # pipeline building
+     pipe.input('path')
+     .map('path', 'frame', ops.audio_decode.ffmpeg())
+     .map('frame', 'vecs', ops.audio_embedding.vggish())
+     .output('vecs')
+)
+data_dir = args.data_dir
+# test_id = 'zxis5LLvULw_12000_22000'
+# test_path = f'{data_dir}/media/{test_id}/audio.wav'
+# temp_path = preprocess_audio_to_mono(test_path)
+# print(f"original audio info: {torchaudio.info(test_path)}")
+# print(f"mono audio info: :{torchaudio.info(temp_path)}")
+# test_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
+# print(test_embed.shape)
+# os.unlink(temp_path)
+#
+#
+# test_id = 'null_c-45AfEdAU050_99000_109000'
+# test_path = f'{data_dir}/media/{test_id}/audio.wav'
+# temp_path = preprocess_audio_to_mono(test_path)
+# print(f"original audio info: {torchaudio.info(test_path)}")
+# print(f"mono audio info: :{torchaudio.info(temp_path)}")
+# test_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
+# print(test_embed.shape)
+# os.unlink(temp_path)
+metapath = os.path.join(data_dir, 'metadata.csv')
+metadata = pd.read_csv(metapath, header=0)
+metadata = metadata[metadata['split'].isin(['train', 'val', 'test_s', 'test_u', 'test_n'])]
+# metadata = metadata[metadata['split'].isin(['test_s'])]
+vids = metadata['uid'].apply(lambda x: x.rsplit('_', 2)[0]).unique()
+save_dir = os.path.join(data_dir, 'audio_embed')
+os.makedirs(save_dir, exist_ok=True)
+for vid in vids:
+    audio_path = f'{data_dir}/media/{vid}/audio.wav'
+    temp_path = preprocess_audio_to_mono(audio_path)
+    audio_embed = torch.tensor(audio_vggish_pipeline(temp_path).get()[0])
+    os.unlink(temp_path)
+    # print(f"{vid}: {audio_embed.shape}")
+    torch.save(audio_embed, f'{save_dir}/{vid}.pt')
+    print(f'{vid} embedding saved {audio_embed.shape}')

save_sam_feats.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os.path
+from models.segment_anything import build_sam_vit_h
+from models.segment_anything.utils.transforms import ResizeLongestSide
+import cv2
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import pandas as pd
+from configs import args
+from save_audio_feats import data_dir
+def preprocess(x: torch.Tensor, device='cuda') -> torch.Tensor:
+    """Normalize pixel values and pad to a square input."""
+    # 确保输入张量在正确的设备上
+    x = x.to(device)
+    # Normalize colors
+    pixel_mean = torch.Tensor([113.263, 99.370, 92.492]).view(-1, 1, 1).to(device)
+    pixel_std = torch.Tensor([64.274, 61.068, 58.626]).view(-1, 1, 1).to(device)
+    img_size = 1024
+    x = (x - pixel_mean) / pixel_std
+    # Pad
+    h, w = x.shape[-2:]
+    padh = img_size - h
+    padw = img_size - w
+    x = F.pad(x, (0, padw, 0, padh))
+    return x
+# Script body: encode the frames of every video with the SAM image encoder and
+# save the stacked embeddings as <data_dir>/image_embed/<vid>.pt.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Using device: {device}")
+# NOTE(review): data_dir is also imported from save_audio_feats at the top of
+# this file; this assignment overrides that value, so the import looks redundant.
+data_dir = args.data_dir
+metapath = os.path.join(data_dir, 'metadata.csv')
+metadata = pd.read_csv(metapath, header=0)
+metadata = metadata[metadata['split'].isin(['train', 'val', 'test_s', 'test_u', 'test_n'])]
+# metadata = metadata[metadata['split'].isin(['test_s'])]
+# Video id = uid with its last two '_'-separated fields removed.
+vids = metadata['uid'].apply(lambda x: x.rsplit('_', 2)[0]).unique()
+sam_model = build_sam_vit_h(args.vision_pretrained)
+sam_model.to(device)
+# The encoder is used for feature extraction only; freeze all parameters.
+for param in sam_model.parameters():
+    param.requires_grad = False
+save_dir = os.path.join(data_dir, 'image_embed')
+os.makedirs(save_dir, exist_ok=True)
+torch.cuda.empty_cache()
+for vid in vids:
+    image_embeds = []
+    # Frames are read as 0.jpg … 9.jpg; unreadable frames are skipped, so the
+    # saved tensor can hold fewer than 10 entries.
+    for _idx in range(10):
+        path_frame = f'{data_dir}/media/{vid}/frames/{_idx}.jpg'
+        frame = cv2.imread(path_frame)
+        if frame is None:
+            print(f"Warning: Could not read image {path_frame}")
+            continue
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frame = ResizeLongestSide(1024).apply_image(frame)
+        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).contiguous()  # [3, H, W]
+        frame_processed = preprocess(frame_tensor, device)  # [3, 1024, 1024]
+        single_image = frame_processed.unsqueeze(0)  # [1, 3, 1024, 1024]
+        with torch.no_grad():
+            image_embed = sam_model.image_encoder(single_image)  # [1, 256, 64, 64]
+            # Move each embedding to the CPU before it is stacked and saved.
+            image_embed = image_embed.squeeze(0).cpu()
+        image_embeds.append(image_embed)
+        torch.cuda.empty_cache()
+    if not image_embeds:
+        print(f"Error: No images loaded for video {vid}")
+        continue
+    image_embeds_stacked = torch.stack(image_embeds, dim=0)  # [T, 256, 64, 64]
+    torch.save(image_embeds_stacked, f'{save_dir}/{vid}.pt')
+    print(f"Processed video {vid}, features shape: {image_embeds_stacked.shape}")
+print("Processing completed!")

seg_ltpo.py ADDED Viewed

	@@ -0,0 +1,786 @@

+"""
+SEG-LTPO: test-time optimization of SimToken's Fseg / q prompt token.
+Two optimizers are provided:
+ltpo_optimize  – original antithetic-ES zeroth-order optimizer (Fseg space).
+q_ltpo_autograd – autograd optimizer that directly optimizes q (= sparse
+                  prompt embedding passed to the mask decoder) via Adam
+                  maximize, with a differentiable reward.  This is the
+                  recommended path when the reward can be made differentiable.
+Staged autograd reward build-up:
+  Stage 0  check_grad_connectivity  — verify ∂R_iou/∂q ≠ 0
+  Stage 1  QLTPOConfig(stage=1)     — R = 0.6·R_iou − 0.2·R_area_soft − λ_reg·‖q−q₀‖²
+  Stage 2  QLTPOConfig(stage=2)     — Stage 1 + 1.0·R_align_det  (z_in/z_out stopgrad)
+  Stage 3  QLTPOConfig(stage=3)     — Stage 2 + 0.2·R_temp_feat  (full reward)
+Reward gating: use best_q only when R_task(best_q) > R_task(q_init) + gate_delta.
+--- ES baseline (original) ---
+Reward:
+    R = λ1·R_temp_feat + λ2·R_iou_pred + λ3·R_align_contrast − λ4·R_area
+Update (antithetic ES, step t):
+    F_curr = F_curr + η_t · (R+ − R−)/(2σ_t²) · eps_t
+    best_F = argmax_F R(F)
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+# ---------------------------------------------------------------------------
+# Per-sample diagnostics accumulator for q_ltpo_autograd
+# ---------------------------------------------------------------------------
+_q_ltpo_stats: List[Dict[str, Any]] = []
+def reset_q_ltpo_stats() -> None:
+    global _q_ltpo_stats
+    _q_ltpo_stats = []
+def get_q_ltpo_stats() -> List[Dict[str, Any]]:
+    return list(_q_ltpo_stats)
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+@dataclass
+class LTPOConfig:
+    T: int = 5
+    num_anchors: int = 4
+    sigma_schedule: List[float] = field(
+        default_factory=lambda: [0.10, 0.08, 0.06, 0.04, 0.02]
+    )
+    eta_scale: float = 0.5      # η_t = eta_scale · σ_t
+    # Reward weights
+    lambda1: float = 0.3        # R_temp_feat
+    lambda2: float = 0.4        # R_iou_pred
+    lambda3: float = 1.0        # R_align_contrast
+    lambda4: float = 0.3        # R_area penalty
+    beta: float = 0.5           # background penalty coefficient in R_align_contrast
+    # Reward gating: fall back to F_init when improvement < gate_delta
+    gate_delta: float = 0.0
+    # L2 trust-region radius on Fseg; None = disabled
+    trust_delta: Optional[float] = None
+# ---------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------
+def get_sam_model(model):
+    """Return SAM visual_model, unwrapping a PeftModel wrapper if present."""
+    base = model.base_model.model if hasattr(model, "base_model") else model
+    return base.model.visual_model
+def get_anchor_indices(num_frames: int, num_anchors: int) -> List[int]:
+    """Uniformly sample anchor frame indices from [0, num_frames-1]."""
+    return [round(v) for v in torch.linspace(0, num_frames - 1, num_anchors).tolist()]
+def _precompute_dense_emb(
+    sam_model, model_dtype: torch.dtype, device: torch.device
+) -> torch.Tensor:
+    """
+    Constant 'no-mask' dense embedding from SAM's prompt encoder.
+    Independent of Fseg; precompute once per sample to avoid redundant calls.
+    Shape: [1, 256, 64, 64].
+    """
+    pe = sam_model.prompt_encoder
+    H, W = pe.image_embedding_size
+    return (
+        pe.no_mask_embed.weight           # [1, 256]
+        .reshape(1, -1, 1, 1)
+        .expand(1, -1, H, W)
+        .contiguous()
+        .to(model_dtype)
+        .to(device)
+    )
+# ---------------------------------------------------------------------------
+# Lightweight SAM decode (skips prompt_encoder overhead)
+# ---------------------------------------------------------------------------
+def _decode_on_anchors(
+    fseg: torch.Tensor,                 # [1, 256] float32
+    image_embeds_anchor: torch.Tensor,  # [A, 256, 64, 64] model dtype
+    dense_emb: torch.Tensor,            # [1, 256, 64, 64] model dtype (constant)
+    mask_decoder,
+    dense_pe: torch.Tensor,             # [1, 256, 64, 64]
+    model_dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Decode anchor frames for a given Fseg.
+    Since no points/boxes are used, prompt_encoder simply concatenates
+    text_embeds onto an empty sparse tensor, so sparse_emb == Fseg.unsqueeze(1).
+    We exploit this to skip the full prompt_encoder call each iteration.
+    Returns:
+        low_res_masks: [A, 1, 256, 256]
+        iou_preds:     [A, 1]
+    """
+    sparse_emb = fseg.to(model_dtype).unsqueeze(1)  # [1, 1, 256]
+    with torch.no_grad():
+        low_res_masks, iou_preds = mask_decoder(
+            image_embeddings=image_embeds_anchor,
+            image_pe=dense_pe,
+            sparse_prompt_embeddings=sparse_emb,
+            dense_prompt_embeddings=dense_emb,
+            multimask_output=False,
+        )
+    return low_res_masks, iou_preds  # [A,1,256,256], [A,1]
+# ---------------------------------------------------------------------------
+# Reward computation
+# ---------------------------------------------------------------------------
+def _compute_reward(
+    fseg: torch.Tensor,                 # [1, 256] float32
+    low_res_masks: torch.Tensor,        # [A, 1, 256, 256]
+    iou_preds: torch.Tensor,            # [A, 1]
+    image_embeds_anchor: torch.Tensor,  # [A, 256, 64, 64]
+    cfg: LTPOConfig,
+) -> float:
+    num_anchor = low_res_masks.shape[0]
+    device = fseg.device
+    # Work entirely in float32 for numerical stability
+    masks_soft = torch.sigmoid(low_res_masks.float().squeeze(1))  # [A, 256, 256]
+    img_embs   = image_embeds_anchor.float()                       # [A, 256, 64, 64]
+    # q lives in SAM's 256-d prompt space (same as Fseg after text_hidden_fcs)
+    q = F.normalize(fseg[0].float(), dim=0)  # [256]
+    # Downsample soft masks 256×256 → 64×64 to match image_embed spatial dims.
+    # Keep as soft weights (no hard threshold) so the reward surface is smooth.
+    masks_64 = F.interpolate(
+        masks_soft.unsqueeze(1), size=(64, 64),
+        mode="bilinear", align_corners=False,
+    ).squeeze(1)  # [A, 64, 64]
+    # ── Per-frame masked pooling ──────────────────────────────────────────
+    z_ins:  List[torch.Tensor] = []
+    z_outs: List[torch.Tensor] = []
+    for t in range(num_anchor):
+        m   = masks_64[t]   # [64, 64]
+        img = img_embs[t]   # [256, 64, 64]
+        # Soft weighted average pooling over foreground / background
+        z_in  = (img * m.unsqueeze(0)).sum(dim=[1, 2]) / (m.sum() + 1e-6)
+        z_out = (img * (1.0 - m).unsqueeze(0)).sum(dim=[1, 2]) / ((1.0 - m).sum() + 1e-6)
+        z_ins.append(F.normalize(z_in,  dim=0))   # [256]
+        z_outs.append(F.normalize(z_out, dim=0))  # [256]
+    # ── R_align_contrast ──────────────────────────────────────────────────
+    # Maximise Fseg↔inside alignment while penalising Fseg↔outside alignment.
+    # Contrast term prevents reward-hacking via large masks:
+    # a large mask pulls inside and outside features together, shrinking the gap.
+    r_align = sum(
+        (q @ z_ins[t]) - cfg.beta * (q @ z_outs[t])
+        for t in range(num_anchor)
+    ) / num_anchor
+    # ── R_iou_pred ────────────────────────────────────────────────────────
+    # SAM's internal mask-quality head, calibrated during SAM training.
+    r_iou = iou_preds.float().mean()
+    # ── R_temp_feat ───────────────────────────────────────────────────────
+    # Feature-space consistency between adjacent anchor frames.
+    # Harder to game than mask-IoU: large masks pool diverse background
+    # features across frames, degrading cosine similarity.
+    r_temp = torch.tensor(0.0, device=device)
+    if num_anchor > 1:
+        r_temp = sum(
+            z_ins[t] @ z_ins[t + 1] for t in range(num_anchor - 1)
+        ) / (num_anchor - 1)
+    # ── R_area ────────────────────────────────────────────────────────────
+    r_area = masks_64.mean()
+    R = (cfg.lambda1 * r_temp
+         + cfg.lambda2 * r_iou
+         + cfg.lambda3 * r_align
+         - cfg.lambda4 * r_area)
+    return R.item()
+# ---------------------------------------------------------------------------
+# Ablation baseline: Best-of-2 Random (no iterative update)
+# ---------------------------------------------------------------------------
+def best_of_2_optimize(
+    F_init: torch.Tensor,
+    image_embeds: torch.Tensor,
+    anchor_indices: List[int],
+    sam_model,
+    model_dtype: torch.dtype,
+    cfg: LTPOConfig,
+) -> torch.Tensor:
+    """
+    Best-of-2 Random baseline.
+    Sample one antithetic pair (F+, F-) using the first sigma value,
+    evaluate both, return whichever has the higher reward.
+    No iterative update — serves as the ablation for the update rule.
+    Same reward gating as ltpo_optimize for a fair comparison.
+    """
+    device = F_init.device
+    image_embeds_anchor = image_embeds[anchor_indices]
+    dense_emb = _precompute_dense_emb(sam_model, model_dtype, device)
+    dense_pe  = sam_model.prompt_encoder.get_dense_pe().to(device)
+    mask_dec  = sam_model.mask_decoder
+    lrm0, iou0 = _decode_on_anchors(
+        F_init, image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+    )
+    R_init = _compute_reward(F_init, lrm0, iou0, image_embeds_anchor, cfg)
+    sigma = cfg.sigma_schedule[0]
+    eps   = torch.randn_like(F_init) * sigma
+    F_plus  = F_init + eps
+    F_minus = F_init - eps
+    lrm_p, iou_p = _decode_on_anchors(
+        F_plus,  image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+    )
+    lrm_m, iou_m = _decode_on_anchors(
+        F_minus, image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+    )
+    R_plus  = _compute_reward(F_plus,  lrm_p, iou_p, image_embeds_anchor, cfg)
+    R_minus = _compute_reward(F_minus, lrm_m, iou_m, image_embeds_anchor, cfg)
+    best_R, best_F = R_init, F_init.clone()
+    if R_plus  > best_R: best_R, best_F = R_plus,  F_plus.clone()
+    if R_minus > best_R: best_R, best_F = R_minus, F_minus.clone()
+    if best_R <= R_init + cfg.gate_delta:
+        return F_init
+    return best_F
+# ---------------------------------------------------------------------------
+# Full-video decode with a given Fseg
+# ---------------------------------------------------------------------------
+def decode_full_video(
+    fseg: torch.Tensor,          # [1, 256] float32
+    image_embeds: torch.Tensor,  # [T, 256, 64, 64] model dtype on CUDA
+    sam_model,
+    resize: tuple,               # (H_resized, W_resized) – after ResizeLongestSide
+    orgsize: tuple,              # (H_orig, W_orig)
+    model_dtype: torch.dtype,
+) -> torch.Tensor:
+    """
+    Decode all T frames with the given Fseg.
+    Returns raw logit mask [T, H_orig, W_orig] (not yet sigmoid).
+    """
+    device = image_embeds.device
+    dense_emb = _precompute_dense_emb(sam_model, model_dtype, device)
+    dense_pe  = sam_model.prompt_encoder.get_dense_pe().to(device)
+    sparse_emb = fseg.to(model_dtype).unsqueeze(1)  # [1, 1, 256]
+    with torch.no_grad():
+        low_res_masks, _ = sam_model.mask_decoder(
+            image_embeddings=image_embeds,      # [T, 256, 64, 64]
+            image_pe=dense_pe,
+            sparse_prompt_embeddings=sparse_emb,  # [1, 1, 256]
+            dense_prompt_embeddings=dense_emb,    # [1, 256, 64, 64]
+            multimask_output=False,
+        )  # [T, 1, 256, 256]
+    pred_mask = sam_model.postprocess_masks(
+        low_res_masks, input_size=resize, original_size=orgsize
+    )  # [T, 1, H, W]
+    return pred_mask.squeeze(1)  # [T, H, W]
+# ---------------------------------------------------------------------------
+# Main optimisation loop
+# ---------------------------------------------------------------------------
+def ltpo_optimize(
+    F_init: torch.Tensor,          # [1, 256] float32 on CUDA
+    image_embeds: torch.Tensor,    # [T, 256, 64, 64] model dtype on CUDA
+    anchor_indices: List[int],
+    sam_model,
+    model_dtype: torch.dtype,
+    cfg: LTPOConfig,
+) -> torch.Tensor:
+    """
+    Optimise Fseg at test time via antithetic ES.
+    Runs cfg.T steps.  Each step draws one Gaussian perturbation scaled by
+    cfg.sigma_schedule[t], scores F_curr + eps and F_curr - eps on the anchor
+    frames, and moves F_curr along eps in proportion to the reward difference.
+    cfg.sigma_schedule is indexed by the step number, so it must hold at
+    least cfg.T entries.
+    Returns best Fseg found [1, 256] float32: the highest-reward candidate
+    among F_init and every evaluated F_plus / F_minus (the iterate F_curr
+    itself is never scored).
+    Falls back to F_init when reward gating rejects all updates.
+    """
+    device = F_init.device
+    image_embeds_anchor = image_embeds[anchor_indices]  # [A, 256, 64, 64]
+    # Precompute constants shared across every optimisation step
+    dense_emb = _precompute_dense_emb(sam_model, model_dtype, device)
+    dense_pe  = sam_model.prompt_encoder.get_dense_pe().to(device)
+    mask_dec  = sam_model.mask_decoder
+    # ── Evaluate initial token ────────────────────────────────────────────
+    lrm0, iou0 = _decode_on_anchors(
+        F_init, image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+    )
+    R_init = _compute_reward(F_init, lrm0, iou0, image_embeds_anchor, cfg)
+    # best_F / best_R track the best scored candidate; F_curr is the iterate
+    # that the perturbations are centred on.
+    best_F, best_R = F_init.clone(), R_init
+    F_curr = F_init.clone()
+    # ── Optimisation loop ─────────────────────────────────────────────────
+    for t in range(cfg.T):
+        sigma_t = cfg.sigma_schedule[t]
+        # Step size is tied to the current noise scale.
+        eta_t   = cfg.eta_scale * sigma_t
+        # One shared noise sample gives the antithetic pair F_curr ± eps.
+        eps     = torch.randn_like(F_curr) * sigma_t
+        F_plus  = F_curr + eps
+        F_minus = F_curr - eps
+        lrm_p, iou_p = _decode_on_anchors(
+            F_plus,  image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+        )
+        lrm_m, iou_m = _decode_on_anchors(
+            F_minus, image_embeds_anchor, dense_emb, mask_dec, dense_pe, model_dtype
+        )
+        R_plus  = _compute_reward(F_plus,  lrm_p, iou_p, image_embeds_anchor, cfg)
+        R_minus = _compute_reward(F_minus, lrm_m, iou_m, image_embeds_anchor, cfg)
+        # Track the best token seen across all evaluated candidates
+        if R_plus > best_R:
+            best_R, best_F = R_plus,  F_plus.clone()
+        if R_minus > best_R:
+            best_R, best_F = R_minus, F_minus.clone()
+        # Antithetic policy-gradient update of the iterate
+        # Formula: F_{t+1} = F_t + η_t · (R+ - R−)/(2σ_t²) · eps_t
+        # NOTE(review): sigma_t == 0 would divide by zero here — confirm the
+        # schedule is strictly positive.
+        grad_est = (R_plus - R_minus) / (2.0 * sigma_t ** 2)
+        F_curr   = F_curr + eta_t * grad_est * eps
+        # Optional L2 trust-region: keep F_curr within radius trust_delta of F_init
+        if cfg.trust_delta is not None:
+            diff = F_curr - F_init
+            norm = diff.norm()
+            if norm > cfg.trust_delta:
+                # Project back onto the surface of the trust-region ball.
+                F_curr = F_init + diff * (cfg.trust_delta / norm)
+    # ── Reward gating ─────────────────────────────────────────────────────
+    # Reject the update when there is no meaningful improvement over the
+    # initial token (handles Null-like samples where no target exists).
+    if best_R <= R_init + cfg.gate_delta:
+        return F_init
+    return best_F
+# ===========================================================================
+# q-LTPO-autograd: differentiable test-time optimization of the prompt token
+# ===========================================================================
+@dataclass
+class QLTPOConfig:
+    """Configuration for q_ltpo_autograd (Stages 1–3).
+    stage controls which reward terms are active:
+      1  R_iou + R_area_soft + reg            (gradient connectivity + stability)
+      2  Stage 1 + R_align_det (z stopgrad)   (semantic alignment)
+      3  Stage 2 + R_temp_feat                (full reward)
+    Any stage value other than 1 or 2 is dispatched to the Stage 3 reward.
+    """
+    # Reward stage selector (see class docstring).
+    stage: int = 1
+    # Number of Adam steps taken by q_ltpo_autograd.
+    T: int = 5
+    # Not read by the reward/optimizer code in this section; presumably the
+    # caller uses it to choose anchor_indices — TODO confirm.
+    num_anchors: int = 4
+    # ── Optimizer ──────────────────────────────────────────────────────────
+    # lr=0  → auto-set to 0.01 × RMS(q_init); any positive value is used directly
+    lr: float = 0.0
+    # max_drift=0 → auto-set to 0.5 × ‖q_init‖; any positive value is a hard radius
+    max_drift: float = 0.0
+    # ── Stage 1 reward weights ─────────────────────────────────────────────
+    # Weight of the mean predicted IoU (scaled by e0 inside the reward).
+    lambda_iou: float = 0.6
+    # Weight of the soft-area penalty (subtracted from the reward).
+    lambda_area: float = 0.2
+    # Weight of the squared L2 distance ‖q − q_init‖² (subtracted; used only
+    # in the reward that is differentiated, not in best-q tracking or gating).
+    lambda_reg: float = 0.01
+    area_temp: float = 5.0      # sigmoid temperature for R_area_soft
+    # ── Stage 2 additional weights ─────────────────────────────────────────
+    lambda_align: float = 1.0
+    beta_align: float = 0.5     # background penalty coefficient in R_align
+    # ── Stage 3 additional weights ─────────────────────────────────────────
+    lambda_temp: float = 0.2
+    # ── Gating ─────────────────────────────────────────────────────────────
+    # The optimised token is accepted only when its task reward exceeds the
+    # initial task reward by more than this margin.
+    gate_delta: float = 0.0
+    # ── e0-modulated R_iou (principled Null-safety) ────────────────────────
+    # e0 = stopgrad(R_area_soft(q_init)): the initial soft-area fraction acts
+    # as an existence prior on the R_iou term.
+    #   "none"     → original behavior (e0 = 1, no modulation)
+    #   "identity" → e0 = R_area_soft(q_init)          [first version]
+    #   "sqrt"     → e0 = sqrt(R_area_soft(q_init) + e0_eps)
+    # Any unrecognised string behaves like "none".
+    e0_modulation: str = "identity"
+    e0_eps: float = 1e-4   # epsilon for "sqrt" variant
+    # ── Oracle Null-safety gate (analysis only; NOT for final method) ──────
+    # Derived from test-set distribution (Null area_hard ≈ 0.01, Seen ≈ 0.05)
+    # so must not be used in reported results.  Set null_gate_delta=0 to disable.
+    null_area_threshold: float = 0.02   # hard area fraction below which guard activates
+    null_gate_delta: float = 0.0        # 0 = disabled; 0.05 = oracle experiment
+# ---------------------------------------------------------------------------
+# e0 helper
+# ---------------------------------------------------------------------------
+def _compute_e0(r_area_soft_init: float, cfg: "QLTPOConfig") -> float:
+    """Compute the existence-prior weight from the initial soft area."""
+    if cfg.e0_modulation == "identity":
+        return r_area_soft_init
+    if cfg.e0_modulation == "sqrt":
+        return (r_area_soft_init + cfg.e0_eps) ** 0.5
+    return 1.0  # "none"
+# ---------------------------------------------------------------------------
+# Differentiable anchor decode (float32 throughout; no torch.no_grad)
+# ---------------------------------------------------------------------------
+def _decode_on_anchors_diff(
+    q: torch.Tensor,                        # [1, 256] float32
+    image_embeds_anchor_fp32: torch.Tensor, # [A, 256, 64, 64] float32
+    dense_emb_fp32: torch.Tensor,           # [1, 256, 64, 64] float32
+    mask_decoder,
+    dense_pe_fp32: torch.Tensor,            # [1, 256, 64, 64] float32
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Differentiable mask-decoder forward.
+    All inputs are float32 to avoid fp16 gradient truncation.
+    q may be a Parameter (requires_grad=True) or a plain detached tensor.
+    Returns low_res_masks [A,1,256,256] and iou_preds [A,1], both float32.
+    """
+    sparse_emb = q.unsqueeze(1)  # [1, 1, 256]
+    low_res_masks, iou_preds = mask_decoder(
+        image_embeddings=image_embeds_anchor_fp32,
+        image_pe=dense_pe_fp32,
+        sparse_prompt_embeddings=sparse_emb,
+        dense_prompt_embeddings=dense_emb_fp32,
+        multimask_output=False,
+    )
+    return low_res_masks, iou_preds  # [A,1,256,256], [A,1]
+# ---------------------------------------------------------------------------
+# Differentiable reward components
+# ---------------------------------------------------------------------------
+def _task_reward_stage1(
+    lrm: torch.Tensor,   # [A,1,256,256] float32
+    iou: torch.Tensor,   # [A,1] float32
+    cfg: QLTPOConfig,
+    e0: float = 1.0,
+) -> torch.Tensor:
+    """Task reward (no regularization): used for best_q tracking and gating.
+    e0 is the stopgrad existence prior: R_area_soft(q_init) scaled via
+    cfg.e0_modulation.  When e0 << 1 the iou term is suppressed, so the
+    optimizer sees only the area-penalty gradient and naturally tends toward
+    smaller (more conservative) masks — the correct behavior when the initial
+    prediction is near-empty (Null frames).
+    """
+    r_iou  = iou.mean()
+    r_area = torch.sigmoid(lrm / cfg.area_temp).mean()
+    return cfg.lambda_iou * e0 * r_iou - cfg.lambda_area * r_area
+def _task_reward_stage2(
+    q: torch.Tensor,      # [1, 256] float32
+    lrm: torch.Tensor,    # [A,1,256,256] float32
+    iou: torch.Tensor,    # [A,1] float32
+    image_embeds_anchor_fp32: torch.Tensor,  # [A, 256, 64, 64] float32
+    cfg: QLTPOConfig,
+    e0: float = 1.0,
+) -> torch.Tensor:
+    """Stage 2 task reward: Stage 1 + R_align_det (z_in/z_out are stopgrad)."""
+    r_s1 = _task_reward_stage1(lrm, iou, cfg, e0)
+    A = lrm.shape[0]
+    masks_64 = F.interpolate(
+        torch.sigmoid(lrm.squeeze(1) / cfg.area_temp).unsqueeze(1),
+        size=(64, 64), mode="bilinear", align_corners=False,
+    ).squeeze(1)  # [A, 64, 64]
+    q_norm = F.normalize(q[0], dim=0)  # [256]
+    r_align = torch.tensor(0.0, device=q.device)
+    for t in range(A):
+        m   = masks_64[t].detach()          # stopgrad on z_in/z_out
+        img = image_embeds_anchor_fp32[t]   # [256, 64, 64]
+        z_in  = F.normalize((img * m.unsqueeze(0)).sum(dim=[1, 2]) / (m.sum() + 1e-6), dim=0)
+        z_out = F.normalize((img * (1 - m).unsqueeze(0)).sum(dim=[1, 2]) / ((1 - m).sum() + 1e-6), dim=0)
+        r_align = r_align + q_norm @ z_in - cfg.beta_align * (q_norm @ z_out)
+    r_align = r_align / A
+    return r_s1 + cfg.lambda_align * r_align
+def _task_reward_stage3(
+    q: torch.Tensor,
+    lrm: torch.Tensor,
+    iou: torch.Tensor,
+    image_embeds_anchor_fp32: torch.Tensor,
+    cfg: QLTPOConfig,
+    e0: float = 1.0,
+) -> torch.Tensor:
+    """Stage 3 task reward: Stage 2 + R_temp_feat."""
+    r_s2 = _task_reward_stage2(q, lrm, iou, image_embeds_anchor_fp32, cfg, e0)
+    A = lrm.shape[0]
+    if A < 2:
+        return r_s2
+    masks_64 = F.interpolate(
+        torch.sigmoid(lrm.squeeze(1) / cfg.area_temp).unsqueeze(1),
+        size=(64, 64), mode="bilinear", align_corners=False,
+    ).squeeze(1)  # [A, 64, 64]
+    z_ins = []
+    for t in range(A):
+        m   = masks_64[t].detach()
+        img = image_embeds_anchor_fp32[t]
+        z_in = F.normalize((img * m.unsqueeze(0)).sum(dim=[1, 2]) / (m.sum() + 1e-6), dim=0)
+        z_ins.append(z_in)
+    r_temp = sum(z_ins[t] @ z_ins[t + 1] for t in range(A - 1)) / (A - 1)
+    return r_s2 + cfg.lambda_temp * r_temp
+def _compute_task_reward(
+    q: torch.Tensor,
+    lrm: torch.Tensor,
+    iou: torch.Tensor,
+    image_embeds_anchor_fp32: torch.Tensor,
+    cfg: QLTPOConfig,
+    e0: float = 1.0,
+) -> torch.Tensor:
+    """Dispatch to the correct stage's task reward."""
+    if cfg.stage == 1:
+        return _task_reward_stage1(lrm, iou, cfg, e0)
+    if cfg.stage == 2:
+        return _task_reward_stage2(q, lrm, iou, image_embeds_anchor_fp32, cfg, e0)
+    return _task_reward_stage3(q, lrm, iou, image_embeds_anchor_fp32, cfg, e0)
+def _compute_full_reward(
+    q: torch.Tensor,
+    lrm: torch.Tensor,
+    iou: torch.Tensor,
+    image_embeds_anchor_fp32: torch.Tensor,
+    q_init: torch.Tensor,
+    cfg: QLTPOConfig,
+    e0: float = 1.0,
+) -> torch.Tensor:
+    """Full reward = task reward + L2 regularization (used for backward)."""
+    r_task = _compute_task_reward(q, lrm, iou, image_embeds_anchor_fp32, cfg, e0)
+    r_reg  = (q - q_init).pow(2).sum()
+    return r_task - cfg.lambda_reg * r_reg
+# ---------------------------------------------------------------------------
+# Stage 0: gradient connectivity check
+# ---------------------------------------------------------------------------
+def check_grad_connectivity(
+    F_init: torch.Tensor,         # [1, 256] any dtype
+    image_embeds: torch.Tensor,   # [T, 256, 64, 64] any dtype
+    anchor_indices: List[int],
+    sam_model,
+    model_dtype: torch.dtype,
+    num_steps: int = 5,
+    lr: float = 0.0,
+) -> dict:
+    """Stage 0: verify ∂R_iou_pred/∂q ≠ 0 and reward rises with Adam maximize.
+    Runs num_steps of Adam on R = R_iou_pred only (the simplest differentiable
+    reward, no custom ops required).  Returns a diagnostic dict.
+    Usage:
+        diag = check_grad_connectivity(F_init, image_embeds, anchors, sam, dtype)
+        print(diag['grad_norm_step0'], diag['reward_trajectory'])
+        # expect grad_norm > 0 and rewards non-decreasing
+    """
+    device = F_init.device
+    image_embeds_anchor = image_embeds[anchor_indices].float().detach()
+    dense_emb = _precompute_dense_emb(sam_model, model_dtype, device).float().detach()
+    dense_pe  = sam_model.prompt_encoder.get_dense_pe().to(device).float().detach()
+    mask_dec  = sam_model.mask_decoder
+    q_init_fp32 = F_init.float().detach()
+    if lr <= 0:
+        lr = 0.01 * (q_init_fp32.norm() / (q_init_fp32.numel() ** 0.5)).item()
+    q = torch.nn.Parameter(q_init_fp32.clone())
+    optimizer = torch.optim.Adam([q], lr=lr, maximize=True)
+    grad_norms, rewards = [], []
+    for step in range(num_steps):
+        optimizer.zero_grad()
+        lrm, iou = _decode_on_anchors_diff(q, image_embeds_anchor, dense_emb, mask_dec, dense_pe)
+        R = iou.mean()
+        R.backward()
+        grad_norm = q.grad.norm().item() if q.grad is not None else 0.0
+        grad_norms.append(grad_norm)
+        rewards.append(R.item())
+        optimizer.step()
+    return {
+        "grad_norm_step0": grad_norms[0],
+        "grad_norms": grad_norms,
+        "reward_trajectory": rewards,
+        "gradient_connected": grad_norms[0] > 1e-8,
+    }
+# ---------------------------------------------------------------------------
+# Stage 1–3: q-LTPO-autograd main optimizer
+# ---------------------------------------------------------------------------
+def q_ltpo_autograd(
+    F_init: torch.Tensor,         # [1, 256] any dtype on CUDA
+    image_embeds: torch.Tensor,   # [T, 256, 64, 64] any dtype on CUDA
+    anchor_indices: List[int],
+    sam_model,
+    model_dtype: torch.dtype,
+    cfg: QLTPOConfig,
+) -> torch.Tensor:
+    """Optimise the SAM prompt token q at test time via Adam maximize.
+    q is initialised to F_init (= Fseg after text_hidden_fcs projection).
+    The prompt encoder is bypassed: sparse_emb = q.unsqueeze(1), identical
+    to what prompt_encoder produces when text_embeds is the only prompt.
+    All computation is done in float32 to avoid fp16 gradient truncation.
+    Returns best_q as float32 [1, 256].  Falls back to F_init when gating
+    rejects all updates.
+    """
+    device = F_init.device
+    # ── Precompute constants (float32, detached) ──────────────────────────
+    q_init_fp32 = F_init.float().detach()
+    image_embeds_anchor = image_embeds[anchor_indices].float().detach()
+    dense_emb = _precompute_dense_emb(sam_model, model_dtype, device).float().detach()
+    dense_pe  = sam_model.prompt_encoder.get_dense_pe().to(device).float().detach()
+    mask_dec  = sam_model.mask_decoder
+    # ── Auto-scale lr and max_drift from q_init magnitude ─────────────────
+    rms = q_init_fp32.norm() / (q_init_fp32.numel() ** 0.5)
+    lr        = cfg.lr        if cfg.lr        > 0 else 0.01 * rms.item()
+    max_drift = cfg.max_drift if cfg.max_drift > 0 else 0.5  * q_init_fp32.norm().item()
+    # ── Baseline forward + e0 existence prior ────────────────────────────
+    with torch.no_grad():
+        lrm0, iou0 = _decode_on_anchors_diff(
+            q_init_fp32, image_embeds_anchor, dense_emb, mask_dec, dense_pe
+        )
+        # e0 = stopgrad(R_area_soft(q_init)): fixes the scalar before the loop.
+        # Suppresses R_iou when the initial mask is near-empty (existence prior).
+        r_area_soft_init = torch.sigmoid(lrm0 / cfg.area_temp).mean().item()
+        e0 = _compute_e0(r_area_soft_init, cfg)
+        R_init_task = _compute_task_reward(
+            q_init_fp32, lrm0, iou0, image_embeds_anchor, cfg, e0=e0
+        ).item()
+    # ── Optimisation setup ────────────────────────────────────────────────
+    q = torch.nn.Parameter(q_init_fp32.clone())
+    optimizer = torch.optim.Adam([q], lr=lr, maximize=True)
+    best_q      = q.detach().clone()
+    best_reward = R_init_task
+    hit_clip    = False
+    # ── Optimisation loop ─────────────────────────────────────────────────
+    for step in range(cfg.T):
+        optimizer.zero_grad()
+        lrm, iou = _decode_on_anchors_diff(
+            q, image_embeds_anchor, dense_emb, mask_dec, dense_pe
+        )
+        R_full = _compute_full_reward(q, lrm, iou, image_embeds_anchor, q_init_fp32, cfg, e0=e0)
+        R_full.backward()
+        optimizer.step()
+        # Hard L2 norm clip: keep q within max_drift ball around q_init
+        with torch.no_grad():
+            diff = q - q_init_fp32
+            d    = diff.norm()
+            if d > max_drift:
+                q.copy_(q_init_fp32 + diff * (max_drift / d))
+                hit_clip = True
+        # Fresh no_grad forward on the post-step q_{N+1} for correct tracking.
+        # (Pre-step lrm/iou would mismatch the updated q, causing wrong best_q.)
+        with torch.no_grad():
+            lrm_eval, iou_eval = _decode_on_anchors_diff(
+                q.detach(), image_embeds_anchor, dense_emb, mask_dec, dense_pe
+            )
+            r_task = _compute_task_reward(
+                q.detach(), lrm_eval, iou_eval, image_embeds_anchor, cfg, e0=e0
+            ).item()
+            if r_task > best_reward:
+                best_reward = r_task
+                best_q = q.detach().clone()
+    # ── Reward gating: clean re-eval of best_q vs q_init ─────────────────
+    with torch.no_grad():
+        lrm_b, iou_b = _decode_on_anchors_diff(
+            best_q, image_embeds_anchor, dense_emb, mask_dec, dense_pe
+        )
+        R_best_task = _compute_task_reward(
+            best_q, lrm_b, iou_b, image_embeds_anchor, cfg, e0=e0
+        ).item()
+    area_init = (lrm0 > 0).float().mean().item()
+    effective_gate = (
+        cfg.null_gate_delta
+        if (cfg.null_gate_delta > 0 and area_init < cfg.null_area_threshold)
+        else cfg.gate_delta
+    )
+    accepted = R_best_task > R_init_task + effective_gate
+    # ── Per-sample diagnostics ────────────────────────────────────────────
+    _q_ltpo_stats.append({
+        "accepted":         accepted,
+        "reward_gain":      R_best_task - R_init_task,
+        "drift":            (best_q - q_init_fp32).norm().item(),
+        "hit_clip":         hit_clip,
+        "e0":               e0,
+        "R_iou_pred_init":  iou0.mean().item(),
+        "R_iou_pred_best":  iou_b.mean().item(),
+        "area_hard_init":   area_init,
+        "area_hard_best":   (lrm_b > 0).float().mean().item(),
+    })
+    if not accepted:
+        return F_init.float()
+    return best_q

train.py ADDED Viewed

	@@ -0,0 +1,534 @@

+import transformers
+from datasets import REFAVS
+from configs import args
+from torch.utils.data import DataLoader
+from functools import partial
+from models.llava import conversation as conversation_lib
+# from  models.avs_model import VISAForCausalLM
+from  models.avs_model import Simtoken_ForCausalLM
+import torch
+from transformers import AutoConfig
+from peft import LoraConfig, get_peft_model
+from torch import optim
+from torch.optim import AdamW
+from transformers import get_cosine_schedule_with_warmup
+from tqdm import tqdm
+from utils import utility
+import random
+import numpy as np
+import re
+import time
+import os
+import warnings
+warnings.filterwarnings("ignore")
+from transformers import logging
+logging.set_verbosity_error()
+# Label value written by collate_fn at positions that must not be supervised.
+# -100 is the default ignore_index of PyTorch's cross-entropy loss; presumably
+# the model's loss relies on that — TODO confirm.
+IGNORE_INDEX = -100
+# Placeholder id inserted into input_ids for "<image>" (and repeated
+# num_frames times for "<video>") by tokenizer_image_audio_token.
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+DEFAULT_VIDEO_TOKEN = "<video>"
+# Placeholder id inserted into input_ids for "<audio>".
+AUDIO_TOKEN_INDEX = -300
+DEFAULT_AUDIO_TOKEN = "<audio>"
+def set_seed(seed: int = 42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+def seed_worker(worker_id):
+    worker_seed = torch.initial_seed() % 2**32
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+def dict_to_cuda(input_dict):
+    for k, v in input_dict.items():
+        if isinstance(input_dict[k], torch.Tensor):
+            input_dict[k] = v.cuda(non_blocking=True)
+        elif (
+                isinstance(input_dict[k], list)
+                and len(input_dict[k]) > 0
+                and isinstance(input_dict[k][0], torch.Tensor)
+        ):
+            input_dict[k] = [ele.cuda(non_blocking=True) for ele in v]
+    return input_dict
+def tokenizer_image_audio_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, audio_token_index=AUDIO_TOKEN_INDEX, num_frames=10, return_tensors=None):
+    prompt_chunks = re.split(r'(<image>|<audio>|<video>)', prompt)
+    prompt_chunks = [chunk for chunk in prompt_chunks if chunk]
+    text_chunks = []
+    token_types = []
+    for chunk in prompt_chunks:
+        if chunk == "<image>":
+            token_types.append("image")
+        elif chunk == "<audio>":
+            token_types.append("audio")
+        elif chunk == "<video>":
+            token_types.append("video")
+        else:
+            text_chunks.append(chunk)
+    tokenized_chunks = [tokenizer(chunk).input_ids for chunk in text_chunks]
+    def insert_separators(text_chunks, tokenized_chunks, token_types, image_token_index, audio_token_index, num_frames):
+        input_ids = []
+        offset = 0
+        if (
+                len(tokenized_chunks) > 0
+                and len(tokenized_chunks[0]) > 0
+                and tokenized_chunks[0][0] == tokenizer.bos_token_id
+        ):
+            offset = 1
+            input_ids.append(tokenized_chunks[0][0])
+        min_length = min(len(text_chunks), len(token_types))
+        for i in range(min_length):
+            input_ids.extend(tokenized_chunks[i][offset:])
+            if token_types[i] == "image":
+                input_ids.append(image_token_index)
+            elif token_types[i] == "audio":
+                input_ids.append(audio_token_index)
+            elif token_types[i] == "video":
+                input_ids.extend([image_token_index] * num_frames)
+        if len(text_chunks) > min_length:
+            input_ids.extend(tokenized_chunks[min_length][offset:])
+        return input_ids
+    input_ids = insert_separators(text_chunks, tokenized_chunks, token_types, image_token_index, audio_token_index, num_frames)
+    if return_tensors is not None:
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return input_ids
+def collate_fn(batch, tokenizer=None):
+    vids = []
+    images = []
+    image_clips = []
+    masks = []
+    conversations = []
+    audio_feats = []
+    image_feats = []
+    resizes = []
+    orgsizes = []
+    refs = []
+    refs_num = []
+    fids = []
+    for data in batch:
+        vids.append(data['vid'])
+        images.append(data['image'])
+        image_clips.append(data['img_clip'])
+        masks.append(data['mask'])
+        conversations.append(data['conversation'])
+        audio_feats.append(data['feat_aud'])
+        resizes.append(data['resize'])
+        orgsizes.append(data['orgsize'])
+        image_feats.append(data['feat_sam'])
+        refs_num.append(len(data['ref']))
+        fids.append(data['fids'])
+        refs.append(data['ref'][0])
+    # input_ids = [tokenizer_image_token(conv, tokenizer, return_tensors="pt") for conv in conversations]
+    input_ids = [tokenizer_image_audio_token(conv, tokenizer, return_tensors="pt") for conv in conversations]  # list
+    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_masks = input_ids.ne(tokenizer.pad_token_id)
+    ref_ids = [tokenizer_image_audio_token(ref, tokenizer, return_tensors="pt") for ref in refs]
+    conv = conversation_lib.default_conversation.copy()
+    labels = input_ids.clone()
+    # sep = conv.sep + conv.roles[1] + ": " # “###Assistant：”
+    sep = 'Sure, it is [SEG]'
+    for conversation, target in zip(conversations, labels):
+        parts = conversation.split(sep)
+        # print(parts)
+        cur_len = 1
+        target[:cur_len] = IGNORE_INDEX
+        sep_len = len(tokenizer_image_audio_token(sep, tokenizer)) - 1
+        for i in range(len(parts)-1):
+            part_len = len(tokenizer_image_audio_token(parts[i], tokenizer)) - 2
+            target[cur_len: cur_len + part_len] = IGNORE_INDEX
+            cur_len += part_len + sep_len
+        target[cur_len:] = IGNORE_INDEX
+    return {"vids": vids,
+            "images": images,  # list[B]:[T, 3, 1024, 1024]
+            "images_clip": image_clips,  # list[B]:[T, 3, 224, 224]
+            "masks": masks,  # list[B]:[num_ref, T, H, W]
+            "convs": conversations,  # list[B]: str
+            "input_ids": input_ids,  # list[B]:[max_len]
+            "attention_masks": attention_masks,  # list[B]:[max_len]
+            "labels": labels,  # list[B]:[max_len]
+            "audio_feats": audio_feats,  # list[B]:[10, 128]
+            "resizes": resizes,  # list[B]
+            "orgsizes": orgsizes,  # list[B]
+            "image_feats": image_feats,
+            "ref_ids": ref_ids,  # list[B]: [ref_id_len]
+            "refs_num": refs_num,
+            "fids": fids
+    }
+import torch.multiprocessing as mp
+if __name__ == "__main__":
+    mp.set_start_method("spawn")
+    set_seed(42)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        args.mllm,
+        cache_dir=None,
+        model_max_length=2048,  # 2048
+        padding_side="right",
+        use_fast=False,
+    )
+    tokenizer.pad_token = tokenizer.unk_token
+    num_added_tokens = tokenizer.add_tokens("[SEG]")
+    seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]  # 32000
+    print("seg_token_idx: ", seg_token_idx)
+    train_dataset = REFAVS('train', args, tokenizer, input_type='refer')
+    val_dataset_s_refer = REFAVS('test_s', args, tokenizer, input_type='refer')
+    val_dataset_u_refer = REFAVS('test_u', args, tokenizer, input_type='refer')
+    val_dataset_n_refer = REFAVS('test_n', args, tokenizer, input_type='refer')
+    g = torch.Generator()
+    g.manual_seed(42)
+    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=8, worker_init_fn=seed_worker,collate_fn=partial(collate_fn, tokenizer=tokenizer), generator=g)
+    val_dataloader_s_refer = DataLoader(val_dataset_s_refer, batch_size=4, shuffle=False, num_workers=0, collate_fn=partial(collate_fn, tokenizer=tokenizer))
+    val_dataloader_u_refer = DataLoader(val_dataset_u_refer, batch_size=4, shuffle=False, num_workers=0, collate_fn=partial(collate_fn, tokenizer=tokenizer))
+    val_dataloader_n_refer = DataLoader(val_dataset_n_refer, batch_size=4, shuffle=False, num_workers=0, collate_fn=partial(collate_fn, tokenizer=tokenizer))
+    model_args = {
+        "train_mask_decoder": True,
+        "out_dim": 256,  # 256
+        "ce_loss_weight": 1.0,
+        "dice_loss_weight": 0.5,
+        "bce_loss_weight": 2.0,
+        "seg_token_idx": seg_token_idx,
+        "vision_pretrained": args.vision_pretrained,  # sam_vit_h_xxx.pth
+        "vision_tower": args.vision_tower,
+        "use_im_start_end": False,
+        "compress": args.compress,
+        "start": args.start,
+    }
+    model = Simtoken_ForCausalLM.from_pretrained(args.mllm, torch_dtype=torch.float32, low_cpu_mem_usage=True, **model_args)
+    print("\nmodel loaded")
+    model.config.eos_token_id = tokenizer.eos_token_id
+    model.config.bos_token_id = tokenizer.bos_token_id
+    model.config.pad_token_id = tokenizer.pad_token_id
+    model.enable_input_require_grads()
+    model.gradient_checkpointing_enable()
+    model.get_model().initialize_vision_modules(model.get_model().config)
+    vision_tower = model.get_model().get_vision_tower()
+    vision_tower.to(dtype=torch.float32, device="cuda")
+    model_args_from_pt = AutoConfig.from_pretrained(args.mllm)
+    model_args_from_pt.use_cluster = True
+    model_args_from_pt.freeze = False
+    model_args_from_pt.mm_tune = True
+    model_args_from_pt.spatial_cluster_rate0 = 64
+    model_args_from_pt.spatial_cluster_rate1 = 32
+    model_args_from_pt.spatial_cluster_rate2 = 16
+    model_args_from_pt.temporal_cluster_rate = 0.0625
+    model_args_from_pt.use_cluster = True
+    model_args_from_pt.vision_tune = False
+    model.get_model().initialize_cluster_modules(model_args_from_pt)
+    model.get_model().initialize_lisa_modules(model.get_model().config)
+    for p in vision_tower.parameters():
+        p.requires_grad = False
+    for p in model.get_model().mm_projector.parameters():
+        p.requires_grad = False
+    lora_r = 8
+    target_modules = "q_proj,v_proj"
+    if lora_r > 0:
+        def find_linear_layers(model, lora_target_modules):
+            cls = torch.nn.Linear
+            lora_module_names = set()
+            for name, module in model.named_modules():
+                if (
+                        isinstance(module, cls)
+                        and all(
+                    [
+                        x not in name
+                        for x in [
+                        "visual_model",
+                        "vision_tower",
+                        "mm_projector",
+                        "text_hidden_fcs",
+                        "audio_feature_layer",
+                    ]
+                    ]
+                )
+                        and any([x in name for x in lora_target_modules])
+                ):
+                    lora_module_names.add(name)
+            return sorted(list(lora_module_names))
+        # Remaining LoRA hyperparameters (the rank lora_r is set above).
+        lora_alpha = 16
+        lora_dropout = 0.05
+        # Resolve the comma-separated target fragments into the concrete
+        # names of the linear layers that will receive adapters.
+        lora_target_modules = find_linear_layers(
+            model, target_modules.split(",")
+        )
+        lora_config = LoraConfig(
+            r=lora_r,
+            lora_alpha=lora_alpha,
+            target_modules=lora_target_modules,
+            lora_dropout=lora_dropout,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+        # From here on `model` is the wrapper returned by get_peft_model.
+        model = get_peft_model(model, lora_config)
+        print("\nLora deployed")
+        model.print_trainable_parameters()
+    # Move the model (LoRA-wrapped when lora_r > 0) to the GPU and resize its
+    # token embeddings to the tokenizer's current vocabulary size.
+    model = model.to("cuda")
+    model.resize_token_embeddings(len(tokenizer))
+    # Make every parameter of the audio feature layer trainable.
+    for name, param in model.audio_feature_layer.named_parameters():
+        param.requires_grad = True
+        # print(name, param.requires_grad)
+    # for name, param in model.token_compressor.named_parameters():
+    #     param.requires_grad = True
+    # Also make trainable any parameter whose name contains one of the
+    # fragments below.
+    for n, p in model.named_parameters():
+        if any(
+                [
+                    x in n
+                    for x in ["lm_head", "embed_tokens", "mask_decoder", "text_hidden_fcs"]
+                ]
+        ):
+            p.requires_grad = True
+    print("will save train model")
+    def valuate(model, dataloader, args, name):
+        model.eval()
+        total_iou = 0
+        total_fscore = 0
+        count = 0
+        for batch in tqdm(dataloader, desc=f"Evaluating on {name}"):
+            input_dict = dict_to_cuda(batch)
+            with torch.no_grad():
+                output_dict = model.forward(images=input_dict["images"],
+                                            images_clip=input_dict["images_clip"],
+                                            audio_features=input_dict["audio_feats"],
+                                            image_features=input_dict["image_feats"],
+                                            input_ids=input_dict["input_ids"],
+                                            labels=input_dict["labels"],
+                                            attention_masks=input_dict["attention_masks"],
+                                            masks_list=input_dict["masks"],
+                                            resize_list=input_dict["resizes"],
+                                            orgsize_list=input_dict["orgsizes"],
+                                            conversation_list=input_dict["convs"],
+                                            refs_num=input_dict["refs_num"],
+                                            fids=input_dict["fids"],
+                                            vids=input_dict["vids"],
+                                            contrast=args.ct_weight,
+                                            ref_ids=input_dict["ref_ids"],
+                                            inference=True)
+            pred_masks = output_dict["pred_masks"]  # list[B]:[num_seg, T, H, W]
+            gt_masks = output_dict["gt_masks"]  # list[B]:[num_seg, T, H, W]
+            for i in range(len(pred_masks)):
+                num_seg = pred_masks[i].shape[0]
+                T = pred_masks[i].shape[1]
+                iou = utility.mask_iou(pred_masks[i], gt_masks[i])
+                fscore = utility.Eval_Fmeasure(pred_masks[i], gt_masks[i], None)
+                total_iou += iou * num_seg * T
+                total_fscore += fscore * num_seg * T
+                count += num_seg * T
+        print(f"\n  valuate on {name}:  miou: {total_iou/count}  fscore: {total_fscore/count}")
+        with open(os.path.join(args.log_root, f'{args.name}.txt'), "a") as f:
+            f.write(f"valuate on {name}:  miou {total_iou/count}  true fscore {total_fscore/count} \n")
+    # ---------------train------------------------------------------
+    model.train()
+    epochs = args.epochs
+    print("init lr:", args.lr)
+    optimizer = AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.95), weight_decay=0.01)
+    gradient_accumulation_steps = int(16 // args.batch_size)
+    step_per_epoch = len(train_dataloader) // gradient_accumulation_steps
+    total_steps = epochs * step_per_epoch
+    warmup_steps = int(total_steps * 0.1)
+    scheduler = get_cosine_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=warmup_steps,
+        num_training_steps=total_steps,
+    )
+    for epoch in range(epochs):
+        model.train()
+        optimizer.zero_grad()
+        running_loss = 0.0
+        loop = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{epochs}")
+        for step, batch in enumerate(loop):
+            input_dict = dict_to_cuda(batch)
+            output_dict = model.forward(images=input_dict["images"],
+                                        images_clip=input_dict["images_clip"],
+                                        audio_features=input_dict["audio_feats"],
+                                        image_features=input_dict["image_feats"],
+                                        input_ids=input_dict["input_ids"],
+                                        labels=input_dict["labels"],
+                                        attention_masks=input_dict["attention_masks"],
+                                        masks_list=input_dict["masks"],
+                                        resize_list=input_dict["resizes"],
+                                        orgsize_list=input_dict["orgsizes"],
+                                        conversation_list=input_dict["convs"],
+                                        refs_num=input_dict["refs_num"],
+                                        fids=input_dict["fids"],
+                                        vids=input_dict["vids"],
+                                        contrast=args.ct_weight,
+                                        ref_ids=input_dict["ref_ids"],
+                                        epoch=epoch,
+                                        inference=False)
+            loss = output_dict["loss"]
+            loss = loss / gradient_accumulation_steps
+            loss.backward()
+            running_loss += loss.item()
+            if (step + 1) % gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()
+                optimizer.zero_grad()
+                current_lr = scheduler.get_lr()[0]
+                loop.set_postfix(lr=current_lr, loss=running_loss / ((step + 1) / gradient_accumulation_steps))
+        print(f"  Epoch {epoch + 1}, Loss:{running_loss / ((step + 1) / gradient_accumulation_steps) :.4f}, Learning Rate:{scheduler.get_last_lr()[0]:.6f}")
+        with open(os.path.join(args.log_root, f'{args.name}.txt'), "a") as f:
+            f.write(f"Epoch {epoch}: running_loss {running_loss / len(train_dataloader) * gradient_accumulation_steps}  Learning Rate:{scheduler.get_last_lr()[0]:.6f}\n")
+    torch.save(model.state_dict(), os.path.join(args.checkpoint_root, f"{args.name}.pth"))
+    print(f"trained model saved as {args.name}.pth")
+    # ---------------test on seen & unseen ------------------------------------------
+    model.eval()
+    valuate(model, val_dataloader_s_refer, args, 'test_s_refer')
+    valuate(model, val_dataloader_u_refer, args, 'test_u_refer')
+    # ---------------test on Null ------------------------------------------
+    model.eval()
+    total_metric = 0
+    count = 0
+    for batch in tqdm(val_dataloader_n_refer, desc=f"Evaluating on test_n_refer"):
+        input_dict = dict_to_cuda(batch)
+        with torch.no_grad():
+            output_dict = model.forward(images=input_dict["images"],
+                                        images_clip=input_dict["images_clip"],
+                                        audio_features=input_dict["audio_feats"],
+                                        image_features=input_dict["image_feats"],
+                                        input_ids=input_dict["input_ids"],
+                                        labels=input_dict["labels"],
+                                        attention_masks=input_dict["attention_masks"],
+                                        masks_list=input_dict["masks"],
+                                        resize_list=input_dict["resizes"],
+                                        orgsize_list=input_dict["orgsizes"],
+                                        conversation_list=input_dict["convs"],
+                                        refs_num=input_dict["refs_num"],
+                                        fids=input_dict["fids"],
+                                        vids=input_dict["vids"],
+                                        contrast=args.ct_weight,
+                                        ref_ids=input_dict["ref_ids"],
+                                        inference=True)
+        pred_masks = output_dict["pred_masks"]  # list[B]:[num_seg, T, H, W]
+        gt_masks = output_dict["gt_masks"]  # list[B]:[num_seg, T, H, W]
+        for i in range(len(pred_masks)):
+            num_seg = pred_masks[i].shape[0]
+            T = pred_masks[i].shape[1]
+            null_metric = utility.metric_s_for_null(pred_masks[i])
+            total_metric += null_metric * num_seg * T
+            count += num_seg * T
+    print(f"\n  valuate on test_n_refer, metric: {total_metric/count}")
+    with open(os.path.join(args.log_root, f'{args.name}.txt'), "a") as f:
+        f.write(f"\n valuate on  test_n_refer:   metric {total_metric/count} \n")