File size: 22,739 Bytes
be13417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
"""
 Copyright (c) 2023, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import re

from lavis.common.registry import registry
from lavis.processors.base_processor import BaseProcessor
from lavis.processors.randaugment import RandomAugment
from omegaconf import OmegaConf
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import random
import nltk
import pickle

# Prompt templates, indexed first by modality ("image", "pc", "audio", "video")
# and then by task name; the per-task lists are filled in by the assignments below.
TEMPLATES = {"image":{}, "pc":{}, "audio":{}, "video":{}}


# Video question-answering prompts. The "{}" placeholder is filled with the
# question text via str.format in BlipInstructionProcessor.__call__.
TEMPLATES["video"]["qa"] = [
"Given the video, {}",
"Q: {} A:",
"Answer the following question based on the video: {}",
"Question: {} Answer:",
"How would you answer {} after watching the video?",
"What is the answer to the question {} after viewing the video?",
"Answer the question based on the video. Question: {} Answer: ",
"Instruction: Answer the following question by reference to the input video. Question: {} Answer:",
"Given the video, what is the answer to the question {}?",
"What's your response to the query {} after watching the video?",
"Please provide an answer to {} after watching the video",
"Respond to the query {} based on the video",
"Based on the given video, respond to {}",
"Question: {} What's your response after watching the video?",
"Consider the following query: {}",
"Could you help answer the question {}?",
"Referencing the provided video, can you answer the question {}?",
"With respect to the video shown, please answer {}",
"What's your answer to {} in the context of the provided video?",
"Question (refer to the video for context): {} Answer:",
"In response to the question {}, what would your answer be after viewing the video?"
]

# Video classification prompts. They carry no placeholder; one is sampled
# verbatim when BlipInstructionProcessor is called with an empty string for the
# 'classification' task.
TEMPLATES["video"]["classification"] = [
"Identify the objects in this video:",
"What objects can you spot in the clip?",
"Classify the objects in the video:",
"Name the objects visible in the video:",
"Identify and classify the objects in this video:",
"What type of objects are in the video?",
"Classify the main object in the clip:",
"Identify the primary object in this video:",
"Name the main object visible in the clip:",
"What is the central object in the video?",
"Can you classify the central object in this video?",
"Determine the type of the main object in this video:",
"What objects do you identify in the video?",
"Please classify the objects in the video:",
"Please identify the main object in the video:",
"What kind of object is displayed in the video?",
"What object is predominantly featured in the video?",
"Classify the object that is the focus of this video:",
"What is the main object in the video?",
"Identify the primary object in the video:"
]

# Video captioning prompts. They carry no placeholder; one is sampled verbatim
# when BlipInstructionProcessor is called with an empty string for the
# 'caption' task.
TEMPLATES["video"]["caption"] = [
"A short caption for the video:",
"A short description of the video:",
"A video of",
"A video that shows",
"Describe the video briefly.",
"Write a description for the video.",
"Provide a description of what is presented in the video.",
"Briefly describe the content of the video.",
"Can you briefly explain what you see in the video?",
"Could you use a few words to describe what you perceive in the video?",
"Please provide a short description of the video.",
"Using language, provide a short account of the video.",
"Use a few words to illustrate what is happening in the video."
]

# Image question-answering prompts. The "{}" placeholder is filled with the
# question text via str.format in BlipInstructionProcessor.__call__; the first
# entry passes the question through unchanged.
TEMPLATES["image"]["qa"] = [
    "{}",
    "Q: {} A:",
    "Answer the following question: {}",
    "Question: {} Answer:",
    "How would you answer {}?",
    "What is the answer to the question {}?",
    "Answer the question based on the image. Question: {} Answer: ",
    "Instruction: Answer the following question by reference to the input image. Question: {} Answer:",
    "Given the photo, what is the answer to the question {}?",
    "What's your response to the query {}?",
    "Please provide an answer to {}",
    "Respond to the query {}",
    "Based on the given image, respond to {}",
    "Question: {} What's your response?",
    "Consider the following query: {}",
    "Could you help answer the question {}?",
    "Referencing the provided image, can you answer the question {}?",
    "With respect to the image shown, please answer {}",
    "What's your answer to {} in the context of the provided image?",
    "Question (refer to the image for context): {} Answer:",
    "In response to the question {}, what would your answer be?",
    "Considering the details of the image, {}",
    "Contemplating the image, {}",
    "Upon examining the image, how would you address {}?",
    "Using the image as a guideline, {}",
    "From the perspective of the image, {}",
    "Please provide a detailed answer to {} based on your insights from the image.",
    "If you were to use the image as a sole reference, how would you address {}?",
    "Factoring in the image's details, what's your take on {}?",
    "In light of the provided image, {}",
    "Relying solely on the image, how would you tackle {}?",
    "Referencing the intricate details of the image, {}",
    "Giving heed to the image, how would you reply to {}?",
    "In the context of the image's attributes, {}",
    "Having the image in mind, provide an answer to {}.",
    "Reflecting upon the image, {}",
    "Question: {}. Keeping the image in mind, please provide an answer.",
    "By referencing the key aspects of the image, {}",
    "While considering the intricacies of the image, how would you approach answering {}?",
    "Having thoroughly reviewed the image, please respond to {}.",
    "Let's focus on the image. In its context, how would you interpret {}?",
    "With the nuances of the image at the forefront, {}",
    "Taking a cue from the image, {}",
    "Bearing in mind the characteristics of the image, how would you elucidate {}?",
    "Gauging by the image, {}",
    "In accordance with the image's specifications, {}",
    "Drawing insights from the image, {}"
]

# Image classification prompts. They carry no placeholder; one is sampled
# verbatim when BlipInstructionProcessor is called with an empty string for the
# 'classification' task.
TEMPLATES["image"]["classification"] = [
    "Identify the objects in this image:",
    "What objects can you spot in the picture?",
    "Classify the objects in the photo:",
    "Name the objects visible in the image:",
    "Identify and classify the objects in this image:",
    "What type of objects are in the photo?",
    "Classify the main object in the picture:",
    "Identify the primary object in this image:",
    "Name the main object visible in the photo:",
    "What is the central object in the image?",
    "Can you classify the central object in this photo?",
    "Determine the type of the main object in this picture:",
    "What objects do you identify in the photo?",
    "Please classify the objects in the image:",
    "Please identify the main object in the picture:",
    "What kind of object is displayed in the image?",
    "What object is predominantly featured in the photo?",
    "Classify the object that is the focus of this image:",
    "What is the main object in the picture?",
    "Identify the primary object in the photo:"
]

# Image captioning prompts. They carry no placeholder; one is sampled verbatim
# when BlipInstructionProcessor is called with an empty string for the
# 'caption' task.
# NOTE(review): three "picture" prompts ("Please provide a short description of
# the picture.", "Using language, ...", "Use a few words ...") appear twice in
# this list, which doubles their sampling weight -- confirm whether intended.
TEMPLATES["image"]["caption"] = [
    "A short caption:",
    "A short description:",
    "A short image caption:",
    "A short image description:",
    "A photo of",
    "A photo that shows",
    "A picture of",
    "A picture that shows",
    "An image of",
    "An image that shows",
    "Write a short description.",
    "Write a description for the image.",
    "Provide a description of what is presented in the image.",
    "Briefly describe the content of the image.",
    "Can you briefly explain what you see in the image?",
    "Could you use a few words to describe what you perceive in the image?",
    "Please provide a short description of the image.",
    "Using language, provide a short account of the image.",
    # A comma was missing after the next entry, so Python silently glued it to
    # the following string and produced one malformed prompt.
    "Use a few words to illustrate what is happening in the photo.",
    "Write a description for the photo.",
    "Provide a description of what is presented in the photo.",
    "Briefly describe the content of the photo.",
    "Can you briefly explain what you see in the photo?",
    "Could you use a few words to describe what you perceive in the photo?",
    "Please provide a short description of the picture.",
    "Using language, provide a short account of the picture.",
    "Use a few words to illustrate what is happening in the picture.",
    "Write a description for the picture.",
    "Provide a description of what is presented in the picture.",
    "Briefly describe the content of the picture.",
    "Can you briefly explain what you see in the picture?",
    "Could you use a few words to describe what you perceive in the picture?",
    "Please provide a short description of the picture.",
    "Using language, provide a short account of the picture.",
    "Use a few words to illustrate what is happening in the picture.",
    "What can you tell me about this image?",
    "Please detail what's in this image.",
    "Narrate the scene captured in this image.",
    "Describe the main elements in this photo.",
    "Tell me what stands out in this picture.",
    "What is the focal point of this image?",
    "Please identify the primary subjects in this photo.",
    "Detail the atmosphere or mood of this image.",
    "Paint a verbal picture based on this image.",
    "Elaborate on the contents of this photo.",
    "How would you introduce this image to someone?",
    "Share a summary of this image.",
    "What are the notable features of this image?",
    "Highlight the main themes present in this photo.",
    "Sketch the scene using words.",
    "Enumerate the elements you see in this image.",
    "Characterize the essence of this picture.",
    "Relate the story depicted in this image.",
    "Describe this image as if speaking to someone who can't see it."
    ]



# Audio captioning prompts. They carry no placeholder; one is sampled verbatim
# when BlipInstructionProcessor is called with an empty string for the
# 'caption' task.
TEMPLATES["audio"]["caption"] = [
    "A short caption:",
    "A short description:",
    "An audio of",
    "An audio that shows",
    "Write a short description.",
    "Write a description for the audio.",
    "Provide a description of what is presented in the audio.",
    "Briefly describe the content of the audio.",
    "Can you briefly explain what you hear in the audio?",
    "Could you use a few words to describe what you perceive in the audio?",
    "Please provide a short description of the audio.",
    "Using language, provide a short account of the audio.",
    "Use a few words to illustrate what is happening in the audio.",
    "Describe briefly the contents of the audio.",
    "Please provide a brief summary of the audio.",
    "What does the audio contain?",
    "What can you hear in the audio?",
    "What sounds are present in the audio?",
    "Summarize the audio in a few words.",
    "Write a brief summary of the audio content.",
    "Could you provide a concise explanation of the audio's contents?",
    "Describe what the audio represents.",
    "What is the audio depicting?",
    "In a few words, describe what you hear in the audio."
]


# Audio classification prompts. They carry no placeholder; one is sampled
# verbatim when BlipInstructionProcessor is called with an empty string for the
# 'classification' task.
TEMPLATES["audio"]["classification"] = [
    "Classify the following audio:",
    "What is the category of this audio clip?",
    "Identify the content of the following audio:",
    "Provide a classification for the audio.",
    "Analyze and categorize the following audio.",
    "Describe the category of the given audio.",
    "Determine the type of this audio clip.",
    "Can you classify what you hear in the audio?",
    "What type of audio is this?",
    "How would you classify this audio clip?",
    "Please identify the category of the following audio:",
    "What category does the following audio fall into?",
    "Classify the sounds in this audio clip."
]

# Audio question-answering prompts. The "{}" placeholder is filled with the
# question text via str.format in BlipInstructionProcessor.__call__; the first
# entry passes the question through unchanged.
TEMPLATES["audio"]["qa"] = [
    "{}",
    "Question: {} Answer:",
    "Q: {} A:",
    "Based on the audio, {}",
    "Answer the following question based on the audio: {}",
    "Question: {} Provide an answer based on the audio.",
    "How would you answer {} based on the audio?",
    "What is the answer to the question {} using the audio as a reference?",
    "Answer the question using the audio. Question: {} Answer: ",
    "Instruction: Answer the following question by referencing the audio. Question: {} Answer:",
    "Given the audio, what is the answer to the question {}?",
    "What's your response to the query {} considering the audio?",
    "Please provide an answer to {} using the audio as context.",
    "Respond to the query {} based on the audio content.",
    "Based on the provided audio, respond to {}",
    "Question: {} What's your response using the audio for context?",
    "Consider the following query and the audio: {}",
    "Could you help answer the question {} using the audio as reference?",
    "Referencing the provided audio, can you answer the question {}?",
    "With respect to the audio provided, please answer {}",
    "What's your answer to {} in the context of the provided audio?",
    "Question (refer to the audio for context): {} Answer:",
    "In response to the question {}, what would your answer be based on the audio?",
    "Given the audio, how would you respond to {}?",
    "Taking the audio into consideration, what is your response to {}?",
    "Based on the audio, how would you answer {}?"
]

# Point-cloud ("pc") captioning prompts, phrased in terms of a "3D model".
# They carry no placeholder; one is sampled verbatim when
# BlipInstructionProcessor is called with an empty string for the 'caption' task.
TEMPLATES["pc"]["caption"] = [
    "A short caption:",
    "A short description:",
    "A short 3D caption:",
    "A short 3D description:",
    "A 3D model of",
    "A 3D model that shows",
    "Write a short description.",
    "Write a description for the 3D model.",
    "Provide a description of what is presented in the 3D model.",
    "Briefly describe the content of the 3D model.",
    "Can you briefly explain what you see in the 3D model?",
    "Could you use a few words to describe what you perceive in the 3D model?",
    "Please provide a short description of the 3D model.",
    "Using language, provide a short account of the 3D model.",
    "Use a few words to illustrate what is happening in the 3D model.",
    "Describe briefly the contents of the 3D model.",
    "Please provide a brief summary of the 3D model.",
    "What does the 3D model contain?",
    "What can you identify in the 3D model?",
    "What structures are present in the 3D model?",
    "Summarize the 3D model in a few words.",
    "Write a brief summary of the 3D model content.",
    "Could you provide a concise explanation of the 3D model's contents?",
    "Describe what the 3D model represents.",
    "What is the 3D model depicting?",
    "In a few words, describe what you see in the 3D model.",
    # Previously said "this image" (copy-paste from the image prompts), which
    # named the wrong modality for a point-cloud input.
    "What can you tell me about this 3D model?",
    "Please detail what's in this 3D model.",
    "Narrate the scene captured in this 3D model.",
    "Describe the main elements in this 3D model.",
    "Tell me what stands out in this 3D model.",
    "What is the focal point of this 3D model?",
    "Please identify the primary subjects in this 3D model.",
    "Detail the atmosphere or mood of this 3D model.",
    "Paint a verbal picture based on this 3D model.",
    "Elaborate on the contents of this 3D model.",
    "How would you introduce this 3D model to someone?",
    "Share a summary of this 3D model.",
    "What are the notable features of this 3D model?",
    "Highlight the main themes present in this 3D model.",
    "Sketch the scene using words.",
    "Enumerate the elements you see in this 3D model.",
    "Characterize the essence of this 3D model.",
    "Relate the story depicted in this 3D model.",
    "Describe this 3D model as if speaking to someone who can't see it."
]

# Point-cloud ("pc") classification prompts. They carry no placeholder; one is
# sampled verbatim when BlipInstructionProcessor is called with an empty string
# for the 'classification' task.
TEMPLATES["pc"]["classification"] = [
    "Classify the following 3D model:",
    "What is the category of this 3D model?",
    "Identify the content of the following 3D model:",
    "Provide a classification for the 3D model.",
    "Analyze and categorize the following 3D model.",
    "Describe the category of the given 3D model.",
    "Determine the type of this 3D model.",
    "Can you classify what you see in the 3D model?",
    "What type of 3D model is this?",
    "How would you classify this 3D model?",
    "Please identify the category of the following 3D model:",
    "What category does the following 3D model fall into?",
    "Classify the structures in this 3D model."
]

# Point-cloud ("pc") question-answering prompts. The "{}" placeholder is filled
# with the question text via str.format in BlipInstructionProcessor.__call__;
# the first entry passes the question through unchanged.
TEMPLATES["pc"]["qa"] = [
    "{}",
    "Question: {} Answer:",
    "Q: {} A:",
    "Based on the 3D model, {}",
    "Answer the following question based on the 3D model: {}",
    "Question: {} Provide an answer based on the 3D model.",
    "How would you answer {} based on the 3D model?",
    "What is the answer to the question {} using the 3D model as a reference?",
    "Answer the question using the 3D model. Question: {} Answer: ",
    "Instruction: Answer the following question by referencing the 3D model. Question: {} Answer:",
    "Given the 3D model, what is the answer to the question {}?",
    "What's your response to the query {} considering the 3D model?",
    "Please provide an answer to {} using the 3D model as context.",
    "Respond to the query {} based on the 3D model content.",
    "Based on the provided 3D model, respond to {}",
    "Question: {} What's your response using the 3D model for context?",
    "Consider the following query and the 3D model: {}",
    "Could you help answer the question {} using the 3D model as reference?",
    "Referencing the provided 3D model, can you answer the question {}?",
    "With respect to the 3D model provided, please answer {}",
    "What's your answer to {} in the context of the provided 3D model?",
    "Question (refer to the 3D model for context): {} Answer:",
    "In response to the question {}, what would your answer be based on the 3D model?",
    "Given the 3D model, how would you respond to {}?",
    "Taking the 3D model into consideration, what is your response to {}?",
    "Based on the 3D model, how would you answer {}?",
    "Considering the details of the 3D model, {}",
    "Contemplating the 3D model, {}",
    "Upon examining the 3D model, how would you address {}?",
    "Using the 3D model as a guideline, {}",
    "From the perspective of the 3D model, {}",
    "Please provide a detailed answer to {} based on your insights from the 3D model.",
    "If you were to use the 3D model as a sole reference, how would you address {}?",
    "Factoring in the 3D model's details, what's your take on {}?",
    "In light of the provided 3D model, {}",
    "Relying solely on the 3D model, how would you tackle {}?",
    "Referencing the intricate details of the 3D model, {}",
    "Giving heed to the 3D model, how would you reply to {}?",
    "In the context of the 3D model's attributes, {}",
    "Having the 3D model in mind, provide an answer to {}.",
    "Reflecting upon the 3D model, {}",
    "Question: {}. Keeping the 3D model in mind, please provide an answer.",
    "By referencing the key aspects of the 3D model, {}",
    "While considering the intricacies of the 3D model, how would you approach answering {}?",
    "Having thoroughly reviewed the 3D model, please respond to {}.",
    "Let's focus on the 3D model. In its context, how would you interpret {}?",
    "With the nuances of the 3D model at the forefront, {}",
    "Taking a cue from the 3D model, {}",
    "Bearing in mind the characteristics of the 3D model, how would you elucidate {}?",
    "Gauging by the 3D model, {}",
    "In accordance with the 3D model's specifications, {}",
    "Drawing insights from the 3D model, {}"
]


@registry.register_processor("blip_instruction")
class BlipInstructionProcessor(BaseProcessor):
    """Turn a caption, question or class label into instruction-style text.

    Depending on ``task`` the processor either samples a prompt from
    ``TEMPLATES[modality][task]`` (optionally filling in the question) or
    normalizes the given text with :meth:`pre_caption`.
    """

    def __init__(self, prompt, max_words, modality, task, cmu_dict_path):
        """Store the configuration and, for classification, load pronunciations.

        Args:
            prompt: Text prepended to sampled templates; in 'eval' mode it is
                returned as-is when the input is empty.
            max_words: Maximum number of words kept by :meth:`pre_caption`.
            modality: First-level key into ``TEMPLATES``.
            task: 'caption', 'qa', 'classification' or 'eval'; any other value
                makes ``__call__`` fall back to :meth:`pre_caption`.
            cmu_dict_path: Optional path to a pickled pronunciation dictionary.
                Only read when ``task == 'classification'``; when falsy the
                NLTK ``cmudict`` corpus is used instead.
        """
        self.prompt = prompt
        self.max_words = max_words
        self.modality = modality
        self.task = task
        # Keep the attribute defined for every task so that
        # starts_with_vowel_sound never hits an AttributeError.
        self.pronounciations = {}
        if task == 'classification':
            if cmu_dict_path:
                # NOTE: unpickling can execute arbitrary code; cmu_dict_path
                # must point at a trusted file.
                with open(cmu_dict_path, 'rb') as dict_file:
                    self.pronounciations = pickle.load(dict_file)
            else:
                # Needs the NLTK "cmudict" corpus to be available locally
                # (e.g. fetched beforehand with nltk.download('cmudict')).
                from nltk.corpus import cmudict
                self.pronounciations = cmudict.dict()

    def classification_output(self, label):
        """Return ``label`` with an indefinite article and a random lead-in.

        The article follows the pronunciation when the label is found in the
        pronunciation dictionary (so "unicorn" gets "a" and "hour" gets "an");
        otherwise it falls back to the label's first letter. The result is
        lowercased.
        """
        vowel_sound = self.starts_with_vowel_sound(label)
        if vowel_sound is None:
            # Unknown word: decide by spelling. Tuple membership keeps an empty
            # label from matching (``"" in "aeiou"`` would be True).
            vowel_sound = label[:1].lower() in ('a', 'e', 'i', 'o', 'u')
        label = ('an ' if vowel_sound else 'a ') + label
        if self.modality == "audio":
            prompts = ["This is ", "It is ", "I hear ", "The audio is ", "This is the sound of ", ""]
        else:
            prompts = ["", "This is ", "It is "]
        caption = random.choice(prompts) + label
        return caption.lower()

    def starts_with_vowel_sound(self, word):
        """Tell whether ``word`` begins with a vowel sound.

        Looks up ``word`` as given, then lowercased, then its first
        whitespace-separated token, and inspects only the first listed
        pronunciation.

        Returns:
            True/False when a pronunciation is found, ``None`` when the word
            is not in the dictionary.
        """
        lowered = word.lower()
        tokens = lowered.split()
        candidates = [word, lowered] + tokens[:1]
        for key in candidates:
            pronunciations = self.pronounciations.get(key)
            if pronunciations:
                # In the CMU dictionary only vowel phonemes end in a stress
                # digit (e.g. "AW1"), so a trailing digit on the first phoneme
                # means a vowel sound.
                return pronunciations[0][0][-1].isdigit()
        return None

    def __call__(self, caption):
        """Build the text for ``caption`` according to ``self.task``.

        * 'eval': the prompt for empty input, otherwise the normalized caption
          (returned without the final lower/strip).
        * 'caption' with empty input: prompt + a random caption template.
        * 'qa': prompt + a random QA template filled with ``caption``.
        * 'classification': prompt + a random template for empty input,
          otherwise the label phrased by :meth:`classification_output`.
        * anything else: the normalized caption.
        """
        if self.task == 'eval':
            if caption == "":
                return self.prompt
            return self.pre_caption(caption)

        if caption == "" and self.task == 'caption':
            caption = self.prompt + random.choice(TEMPLATES[self.modality][self.task])
        elif self.task == 'qa':
            template = random.choice(TEMPLATES[self.modality][self.task])
            # Templates ending in "{}?" would double the question mark of a
            # question that already ends with one.
            caption = self.prompt + template.format(caption).replace('??', '?')
        elif self.task == 'classification':
            if caption == "":
                caption = self.prompt + random.choice(TEMPLATES[self.modality][self.task])
            else:
                caption = self.classification_output(caption)
        else:
            caption = self.pre_caption(caption)

        return caption.lower().strip()

    @classmethod
    def from_config(cls, cfg=None):
        """Create a processor from a config, using defaults for missing keys.

        Defaults: prompt "", max_words 50, modality 'image', task 'caption',
        cmu_dict_path None.
        """
        if cfg is None:
            cfg = OmegaConf.create()

        prompt = cfg.get("prompt", "")
        max_words = cfg.get("max_words", 50)
        modality = cfg.get("modality", 'image')
        task = cfg.get("task", 'caption')
        cmu_dict_path = cfg.get("cmu_dict_path", None)
        return cls(
            prompt=prompt,
            max_words=max_words,
            modality=modality,
            task=task,
            cmu_dict_path=cmu_dict_path,
        )

    def pre_caption(self, caption):
        """Normalize a caption: lowercase, drop punctuation, squeeze, truncate.

        If ``caption`` is a list, one element is picked at random first. The
        characters ``. ! " ( ) * # : ; ~`` are replaced by spaces, runs of two
        or more whitespace characters collapse to one space, and the result is
        cut to at most ``self.max_words`` space-separated words.
        """
        if isinstance(caption, list):
            caption = random.choice(caption)
        caption = re.sub(
            r"([.!\"()*#:;~])",
            " ",
            caption.lower(),
        )
        caption = re.sub(
            r"\s{2,}",
            " ",
            caption,
        )
        caption = caption.rstrip("\n")
        caption = caption.strip(" ")

        # truncate caption
        caption_words = caption.split(" ")
        if len(caption_words) > self.max_words:
            caption = " ".join(caption_words[: self.max_words])

        return caption