# Copyright (c) Meta Platforms, Inc. and affiliates. """ Define conversation format for each training phases and language models. Modified from LLaVA codebase: https://github.com/haotian-liu/LLaVA/blob/main/llava/conversation.py NOTE: - an example of required json format is: data = { "image": IMAGE_PATH, or "images": LIST of IMAGE_PATH, "conversations": [ {"from": "human", "value": "hello"}, {"from": "assistant", "value": "Hi, how can I help you today?"}, {"from": "human", "value": "Who are you?"}, {"from": "assistant", "value": "I am a multimodal large language model created by FAIR. I can assist you with questions related to images and videos."}, ] } """ import copy from dataclasses import dataclass from typing import Callable, Dict, List, Union @dataclass class Conversation: system: str conversations: list bos_token: str sep_system: str sep_question: str sep_answer: str place_image_token: Callable image_token: str = "<|image|>" pre_system: str = "" pre_question: str = "" pre_answer: str = "" eos_token: str = "" # TODO (Maaz): Is there a better name for 'num_patches'. It represents number of vision tokens per image/frame. def get_conversation_dict_list( self, num_images: int = 1, num_patches: int = 144, media_type: str = "image" ) -> List[Dict]: """ Each turn of conversation is a dict with source and target keys. """ conv_dict_list = [] sys_text = self.pre_system + self.system + self.sep_system is_first = True if media_type == "multi_image": # For multiple interleave images, we keep the tags at the same place as in the original text. # However replace in annotations with the self.image_token for conversation in self.conversations: if conversation["from"] == "human": conversation["value"] = conversation["value"].replace( "", self.image_token * num_patches ) else: # Some annotations already have image tags. remove and add ourself. self.conversations[0]["value"] = ( self.conversations[0]["value"] .replace("\n", "") .replace("\n", "") .replace("", "") .replace("