{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"load model\"\"\"\n", "import torch\n", "from PIL import Image, ImageDraw\n", "from qwen_vl_utils import process_vision_info\n", "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n", "import os\n", "import json\n", "import codecs \n", "from peft import PeftModel\n", "import argparse\n", "import random \n", "import re\n", "\n", "\n", "max_pixels_temp = 160*28*28\n", "max_pixels_narr = 760*28*28\n", "min_pixels_narr = 240*28*28\n", "\n", "\n", "\n", "model = Qwen2VLForConditionalGeneration.from_pretrained(\n", " 'FRank62Wu/ShowUI-Narrator', torch_dtype=\"auto\", device_map=\"cuda\"\n", ")\n", "\n", "\n", "processor = AutoProcessor.from_pretrained('FRank62Wu/ShowUI-Narrator') \n", "processor.tokenizer.pad_token = processor.tokenizer.eos_token" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<6> and <8>.\n" ] } ], "source": [ "\n", "_SYSTEM_PROMPT='For the given video frames of a GUI action, The frames are decribed in the format of <0> to <{N}>.'\n", "\n", "\n", "\n", "_SYSTEM_PROMPT_NARR='''You are an ai assistant to narrate the action of the user for the video frames in the following detail.\n", "'Action': The type of action\n", "'Element': The target of the action\n", "'Source': The starting position (Applicable for action type: Drag)\n", "'Destination': The ending position (Applicable for action type: Drag)\n", "'Purpose': The intended result of the action\n", "The Action include left click, right click, double click, drag, or Keyboard type.\n", "'''\n", "\n", "\n", "Action_no_reference_grounding = [\n", " 'Describe the start frame and the end frame of the action in this video?',\n", " 'When Did the action happened in this video? Tell me the start frame and the end frame.',\n", " 'Locate the start and the end frame of the action in this video',\n", " \"Observe the cursor in this GUI video, marking start and end frame of the action in video frames.\"\n", "]\n", "\n", "\n", "Dense_narration_query = ['Narrate the action in the given video.',\n", " 'Describe the action of the user in the given frames',\n", " 'Describe the action in this video.',\n", " 'Narrate the action detail of the user in the video.']\n", "\n", "\n", "\n", "path_to_data =''\n", "\n", "query = _SYSTEM_PROMPT.format(N=9) + ' ' + random.choice(Action_no_reference_grounding)\n", "messages = [\n", " {\n", " 'role': 'user', \n", " 'content': [\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/0_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/1_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/2_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/3_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/4_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/5_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/6_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/7_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/8_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"image\", \"image\": f\"{path_to_data}/storage/test_benchmark_Act2Cap/303/9_crop.png\",\"max_pixels\": max_pixels_temp},\n", " {'type':\"text\",'text': query},\n", " ]\n", " } \n", " ]\n", "\n", "\n", "\n", "## round_1 for temporal grounding\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True,\n", " )\n", " \n", "image_inputs, video_inputs = process_vision_info(messages)\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", "inputs = inputs.to(\"cuda\")\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")[0]\n", "\n", "print(output_text)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\"Action\": \"double click\", \"Element\": \"sc2 trans shape button\", \"Source\": null, \"Destination\": null, \"Purpose\": \" Select the SC2 Trans Shape.\"}\n" ] } ], "source": [ "# round_2 for dense narration caption\n", "try:\n", " matches = re.search(r\"<(\\w+)>.*?<(\\w+)>\", output_text)\n", " s1, e1 = int(matches.group(1)), int(matches.group(2))\n", "except:\n", " s1, e1 =0, 9\n", " \n", "\n", "query = _SYSTEM_PROMPT_NARR + ' ' + random.choice(Dense_narration_query)\n", "\n", "selected_images = []\n", "\n", "if e1-s1<=3:\n", " pixels_narr = max_pixels_narr\n", "else:\n", " pixels_narr = max_pixels_narr *3 /(e1-s1+1)\n", " \n", " \n", "for idx, each in enumerate(messages[0]['content']):\n", " if idx >= s1 and idx <= e1:\n", " new_image = each.copy()\n", " new_image['max_pixels'] = int(pixels_narr)\n", " selected_images.append(new_image)\n", " \n", " \n", "messages = [\n", " {\n", " 'role': 'user', \n", " 'content':selected_images+ [{'type':\"text\",'text': query},\n", " ] \n", " } \n", " ]\n", "\n", "text = processor.apply_chat_template(\n", " messages, tokenize=False, add_generation_prompt=True,\n", " )\n", " \n", "image_inputs, video_inputs = process_vision_info(messages)\n", "inputs = processor(\n", " text=[text],\n", " images=image_inputs,\n", " videos=video_inputs,\n", " padding=True,\n", " return_tensors=\"pt\",\n", " )\n", "inputs = inputs.to(\"cuda\")\n", "generated_ids = model.generate(**inputs, max_new_tokens=128)\n", "generated_ids_trimmed = [\n", " out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n", "]\n", "output_text_narration = processor.batch_decode(\n", " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", ")[0]\n", "\n", "print(output_text_narration)\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "wqc_qwen2vl", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }