"""Map functions that turn region annotations into chat-style conversations.

Two entry points are exposed for dataset mapping:

* ``osprey_region_caption_map_fn`` -- builds question/answer turns from the
  ``'description'`` field of an example.
* ``osprey_region_conversation_map_fn`` -- rewrites the existing
  ``'conversations'`` field of an example, prefixing the first turn with a
  list of region labels.
"""
import random
import re

import numpy as np
from xtuner.utils import DEFAULT_IMAGE_TOKEN

# Question templates; one is picked at random for every described region.
REGION_QUESTIONS = [
    'Can you provide me with a detailed description of the region in the picture marked by ?',
    "I'm curious about the region represented by in the picture. Could you describe it in detail?",
    'What can you tell me about the region indicated by in the image?',
    "I'd like to know more about the area in the photo labeled . Can you give me a detailed description?",
    'Could you describe the region shown as in the picture in great detail?',
    'What details can you give me about the region outlined by in the photo?',
    'Please provide me with a comprehensive description of the region marked with in the image.',
    'Can you give me a detailed account of the region labeled as in the picture?',
    "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail?",
    'What is the region outlined by in the picture like? Could you give me a detailed description?',
    'Can you provide me with a detailed description of the region in the picture marked by , please?',
    "I'm curious about the region represented by in the picture. Could you describe it in detail, please?",
    'What can you tell me about the region indicated by in the image, exactly?',
    "I'd like to know more about the area in the photo labeled , please. Can you give me a detailed description?",
    'Could you describe the region shown as in the picture in great detail, please?',
    'What details can you give me about the region outlined by in the photo, please?',
    'Please provide me with a comprehensive description of the region marked with in the image, please.',
    'Can you give me a detailed account of the region labeled as in the picture, please?',
    "I'm interested in learning more about the region represented by in the photo. Can you describe it in detail, please?",
    'What is the region outlined by in the picture like, please? Could you give me a detailed description?',
]


def region_caption_conversation(descriptions):
    """Build alternating human/gpt turns, one pair per region description.

    Args:
        descriptions: Sequence of description strings, one per region.

    Returns:
        A tuple ``(conversations, sampled_mark_token_ids)``.
        ``conversations`` is a list of ``{'from': ..., 'value': ...}`` dicts
        alternating ``'human'`` (a random entry of ``REGION_QUESTIONS``) and
        ``'gpt'`` (the description). The first human turn is prefixed with
        ``DEFAULT_IMAGE_TOKEN`` and a newline. ``sampled_mark_token_ids`` is
        a list of distinct integers drawn from ``range(100)``, one per
        description.

    Raises:
        ValueError: Raised by ``random.sample`` when more than 100
            descriptions are supplied.
    """
    # The ids are drawn before any question is chosen, so the module-level
    # random stream is consumed in a fixed order: sample first, then one
    # ``random.choice`` per description.
    sampled_mark_token_ids = random.sample(range(100), len(descriptions))

    # NOTE(review): the per-id token template and the placeholder passed to
    # ``replace`` below are both empty strings in this file, so neither
    # ``replace`` call changes the text. Presumably markup literals were
    # intended here -- confirm against the upstream source. An earlier,
    # commented-out variant labelled regions sequentially (``region{i + 1}``)
    # instead of using the sampled ids.
    mark_tokens = ['' for _ in sampled_mark_token_ids]

    conversations = []
    for position, (description, mark_token) in enumerate(
            zip(descriptions, mark_tokens)):
        prompt = random.choice(REGION_QUESTIONS).strip().replace('', mark_token)
        if position == 0:
            # Only the opening turn carries the image token.
            prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
        conversations.append({'from': 'human', 'value': prompt})
        conversations.append(
            {'from': 'gpt', 'value': description.replace('', mark_token)})
    return conversations, sampled_mark_token_ids


def region_caption_preprocess(example):
    """Sample up to three descriptions and attach caption conversations.

    Args:
        example: Mapping with a ``'description'`` entry holding a sequence of
            description strings. It is updated in place.

    Returns:
        The same ``example`` object with three added/overwritten keys:
        ``'conversations'``, ``'sampled_inds'`` and
        ``'sampled_mark_token_ids'``. ``'sampled_inds'`` is a NumPy array
        when three indices are drawn at random and a plain list of all
        indices when fewer than three descriptions exist.
    """
    descriptions = example['description']
    total = len(descriptions)

    if total < 3:
        # Too few regions to subsample: keep every one, in order.
        sampled_inds = list(range(total))
    else:
        # Three distinct indices chosen uniformly at random.
        sampled_inds = np.random.choice(total, size=3, replace=False)

    # Remove every ``<...>`` span from the chosen descriptions.
    selected_descriptions = [
        re.sub(r'<[^>]*>', '', descriptions[idx]) for idx in sampled_inds
    ]

    conversations, sampled_mark_token_ids = region_caption_conversation(
        selected_descriptions)
    example['conversations'] = conversations
    example['sampled_inds'] = sampled_inds
    example['sampled_mark_token_ids'] = sampled_mark_token_ids
    return example


def osprey_region_caption_map_fn(example):
    """Dataset map function for region captions.

    Delegates to ``region_caption_preprocess``. The original inline comment
    lists the expected example keys as ``{'image', 'description'}``.
    """
    return region_caption_preprocess(example)


def _normalize_turn_text(text):
    """Drop ``<`` and ``>`` characters and rewrite ``regin`` as ``region``."""
    for bracket in ('<', '>'):
        text = text.replace(bracket, '')
    return text.replace("regin", "region")


def region_conversations_preprocess(example):
    """Clean conversation turns and prepend a region list to the first one.

    Every turn's ``'value'`` has ``<``/``>`` removed and ``regin`` replaced
    by ``region``. The first turn must come from ``'human'``; its value is
    additionally prefixed with ``DEFAULT_IMAGE_TOKEN``, a newline, the text
    ``There are some regions:`` and a comma-separated list of
    ``region1 .. regionN`` labels (``N = example['num_regions']``) terminated
    by ``.`` and a newline. When ``N`` yields no labels, no list and no
    terminator are added.

    Args:
        example: Mapping with ``'conversations'`` (list of
            ``{'from', 'value'}`` dicts) and ``'num_regions'`` (int). The
            turn dicts and ``example`` are modified in place.

    Returns:
        The same ``example`` object.

    Raises:
        AssertionError: If the first turn is not from ``'human'``.
    """
    conversations = example['conversations']
    num_regions = example['num_regions']

    for turn_idx, turn in enumerate(conversations):
        if turn_idx > 0:
            turn['value'] = _normalize_turn_text(turn['value'])
            continue

        assert turn['from'] == 'human'
        header = DEFAULT_IMAGE_TOKEN + '\n' + 'There are some regions:'
        labels = [' region{} '.format(k + 1) for k in range(num_regions)]
        if labels:
            # Labels are comma-separated; the last one is closed with '.\n'.
            header = header + ','.join(labels) + '.\n'
        turn['value'] = header + _normalize_turn_text(turn['value'])

    example['conversations'] = conversations
    return example


def osprey_region_conversation_map_fn(example):
    """Dataset map function for region conversations.

    Delegates to ``region_conversations_preprocess``. The original inline
    comment lists the expected example keys as ``{'image', 'conversations'}``.
    """
    return region_conversations_preprocess(example)