In [None]:
"""load model"""
import torch
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import os
import json
import codecs 
from peft import PeftModel
import argparse
import random 
import re


# Per-frame pixel caps, each written as a multiple of 28*28
# (presumably the vision encoder's unit size -- TODO confirm against the model card).
max_pixels_temp = 160 * 28 * 28  # cap applied to every frame of the round-1 (temporal grounding) request
max_pixels_narr = 760 * 28 * 28  # base cap for the round-2 (narration) request
min_pixels_narr = 240 * 28 * 28  # defined for completeness; not referenced by the cells shown here

# Single source of truth for the checkpoint so model and processor cannot drift apart.
checkpoint = 'FRank62Wu/ShowUI-Narrator'

model = Qwen2VLForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="cuda",
)

processor = AutoProcessor.from_pretrained(checkpoint)
# Reuse the end-of-sequence token as the padding token.
processor.tokenizer.pad_token = processor.tokenizer.eos_token

In [None]:

# Round-1 preamble. `{N}` is a str.format placeholder for the index of the last frame.
# NOTE(review): the wording (including typos) is passed to the model verbatim; presumably it
# mirrors the prompts the checkpoint was tuned on -- confirm before "fixing" any of it.
_SYSTEM_PROMPT = 'For the given video frames of a GUI action, The frames are decribed in the format of <0> to <{N}>.'

# Round-2 preamble: lists the fields the narration answer is expected to contain.
_SYSTEM_PROMPT_NARR = '''You are an ai assistant to narrate the action of the user for the video frames in the following detail.
'Action': The type of action
'Element': The target of the action
'Source': The starting position (Applicable for action type: Drag)
'Destination': The ending position (Applicable for action type: Drag)
'Purpose': The intended result of the action
The Action include left click, right click, double click, drag, or Keyboard type.
'''

# Interchangeable phrasings of the round-1 question (one is picked at random per run).
Action_no_reference_grounding = [
    'Describe the start frame and the end frame of the action in this video?',
    'When Did the action happened in this video? Tell me the start frame and the end frame.',
    'Locate the start and the end frame of the action in this video',
    "Observe the cursor in this GUI video, marking start and end frame of the action in video frames.",
]

# Interchangeable phrasings of the round-2 question (one is picked at random per run).
Dense_narration_query = [
    'Narrate the action in the given video.',
    'Describe the action of the user in the given frames',
    'Describe the action in this video.',
    'Narrate the action detail of the user in the video.',
]



# Directory that contains the `storage/` folder with the benchmark frames.
# NOTE: while this is left empty, the frame paths below start with '/storage/...'.
path_to_data =''

# Number of frames in the clip. Both the frame list and the `<0> to <N>` range announced in
# the prompt are derived from this value, so they can no longer get out of sync.
NUM_FRAMES = 10

query = _SYSTEM_PROMPT.format(N=NUM_FRAMES - 1) + ' ' + random.choice(Action_no_reference_grounding)

# One image entry per frame file `<idx>_crop.png`, each capped at the round-1 pixel budget.
frame_entries = [
    {
        'type': "image",
        "image": f"{path_to_data}/storage/test_benchmark_Act2Cap/303/{frame_idx}_crop.png",
        "max_pixels": max_pixels_temp,
    }
    for frame_idx in range(NUM_FRAMES)
]

# Single user turn: all frames first, then the textual question.
messages = [
    {
        'role': 'user',
        'content': frame_entries + [{'type': "text", 'text': query}],
    }
]



## round_1 for temporal grounding
# Render the chat messages into a single prompt string, with the generation prompt appended.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Collect the image/video payloads referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)

# Build the model inputs and move them to the GPU in one step.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=128)

# Keep only what follows the prompt: slice each output row by its input row's length.
generated_ids_trimmed = [
    full_seq[len(prompt_seq):]
    for prompt_seq, full_seq in zip(inputs.input_ids, generated_ids)
]

# Only one prompt was submitted, so the first decoded string is the answer.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

print(output_text)

<6> and <8>.


In [3]:
# round_2 for dense narration caption

def _parse_frame_span(answer, num_frames):
    """Return the (start, end) frame indices named in a round-1 answer.

    The answer is expected to look like ``'<6> and <8>.'``. The first two numeric
    ``<n>`` tags are used.

    Args:
        answer: Text produced by the temporal-grounding round.
        num_frames: Number of frames that were shown to the model.

    Returns:
        A ``(start, end)`` tuple with ``0 <= start <= end <= num_frames - 1``.
        If the answer does not contain two numeric tags, the whole clip
        ``(0, num_frames - 1)`` is returned.
    """
    last_index = max(num_frames - 1, 0)
    found = re.search(r"<(\d+)>.*?<(\d+)>", answer)
    if found is None:
        # Unparseable answer: fall back to narrating the full clip.
        return 0, last_index
    # Tolerate a reversed answer ('<8> and <6>') and indices beyond the clip.
    low, high = sorted((int(found.group(1)), int(found.group(2))))
    return min(low, last_index), min(high, last_index)


# Only the image entries are frames; the trailing text entry must never be selected.
frame_pool = [item for item in messages[0]['content'] if item.get('type') == 'image']

s1, e1 = _parse_frame_span(output_text, len(frame_pool))

query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)

# Up to 4 frames each get the full narration budget; longer spans share 3x that budget.
span_len = e1 - s1 + 1
if span_len <= 4:
    pixels_narr = max_pixels_narr
else:
    pixels_narr = max_pixels_narr * 3 // span_len

# Copy the chosen frame entries so the round-1 messages keep their own pixel cap.
selected_images = [
    dict(frame, max_pixels=int(pixels_narr)) for frame in frame_pool[s1:e1 + 1]
]

# Built under a new name: `messages` still holds the round-1 request afterwards, so this
# cell can be re-run without indexing into an already-narrowed frame list.
narration_messages = [
    {
        'role': 'user',
        'content': selected_images + [{'type': "text", 'text': query}],
    }
]

text = processor.apply_chat_template(
    narration_messages, tokenize=False, add_generation_prompt=True,
)

image_inputs, video_inputs = process_vision_info(narration_messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Keep only what follows the prompt: slice each output row by its input row's length.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text_narration = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(output_text_narration)
 

{"Action": "double click", "Element": "sc2 trans shape button", "Source": null, "Destination": null, "Purpose": " Select the SC2 Trans Shape."}
