Update README.md
README.md
```python
import requests
import json
from tqdm import tqdm

#############################################################################################
ACTION_SPACE = """
CLICK:(x,y): Click on the element at the coordinate point (x,y) on the screen, e.g., CLICK:(1980,224).
TYPE:typed_text: An action of typing a piece of text, e.g., TYPE:"Macbook-Pro 16G Black".
COMPLETE: The goal has been completed in the current screen state.
SCROLL:UP/DOWN/LEFT/RIGHT: Scroll in a specific direction, e.g., SCROLL:UP.
LONG_PRESS:(x,y): Long press at a specific point (x,y) on the screen, e.g., LONG_PRESS:(345,2218).
BACK: Go back to the previous screen, e.g., BACK.
HOME: Go to the home screen, e.g., HOME.
=========================================
OTHER_CUSTOM_ACTIONS: ...
"""

INFERENCE_INSTRUCTION = f"""
You are a skilled assistant, interacting with the screen to accomplish the user's goals.
Here is the action space:
{ACTION_SPACE}
Your overall goal is: <goal>(goal)</goal>
Actions completed at previous steps: <history>(history)</history>

The output format should be as follows:
<think>Analyze step by step based on guidance and screen state to choose the action.</think>
<answer>The action you finally choose from "action space".</answer>"""

ACT2SUM_INSTRUCTION = f"""
Step-by-step GUI navigation task. Briefly summarize the current action.
Action space:
{ACTION_SPACE}
Goal: <goal>(goal)</goal>
Current action: <action>(action)</action>

Output Format: <summary>One-sentence summary of the action based on the screen image.</summary>"""
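
# An illustrative example of the expected inference output (values invented for illustration,
# not taken from the original repo):
# <think>The search bar sits at the top of the screen; tapping it is the first step toward the goal.</think>
# <answer>CLICK:(540,180)</answer>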
#############################################################################################
def execute(meta_data):
    goal, hist, img_url = meta_data
    inference_temp = INFERENCE_INSTRUCTION.replace("(goal)", goal).replace("(history)", hist)
    pred = chat_HAR_GUI_3B(img_url, inference_temp)
    return pred

def act2sum_fn(meta_data):
    goal, cur_action, img_url = meta_data
    act2sum_temp = ACT2SUM_INSTRUCTION.replace("(goal)", goal).replace("(action)", cur_action)
    pred = chat_HAR_GUI_3B(img_url, act2sum_temp)
    # pred = chat_72B(img_url, act2sum_temp)
    return pred
#############################################################################################
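
# Illustrative usage of the two helpers above (the goal, history/action, and image URL are placeholders):
# pred = execute(("Find a 16G MacBook Pro", "This is the task's initial state.", "http://localhost:6666/step_0.png"))
# summary = act2sum_fn(("Find a 16G MacBook Pro", "CLICK:(540,180)", "http://localhost:6666/step_0.png"))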

url = "http://localhost:8000/v1/chat/completions"  # OpenAI-compatible chat endpoint serving the model
headers = {
    "Content-Type": "application/json"
}

def chat_HAR_GUI_3B(img_url, query):
    content = []
    content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-3B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        "temperature": 0}

    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    response = response['choices'][0]['message']['content']

    return response

def chat_72B(img_url, query):
    content = []
    content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        "temperature": 0}

    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    response = response['choices'][0]['message']['content']

    return response

## You can also load the model directly, for example as follows (or use the Swift inference framework for faster inference):
##################################################################################
# import torch
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
# MAX_IMAGE_PIXELS = 2048*28*28
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "./models/HAR-GUI-3B",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto"
# )
# processor = AutoProcessor.from_pretrained("./models/HAR-GUI-3B", max_pixels=MAX_IMAGE_PIXELS, padding_side="left")
##################################################################################

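# A minimal generation sketch for the locally loaded model above (an assumption based on the
# standard Qwen2.5-VL transformers usage; `process_vision_info` comes from the qwen_vl_utils package):
# from qwen_vl_utils import process_vision_info
# messages = [{"role": "user", "content": [{"type": "image", "image": img_url},
#                                          {"type": "text", "text": query}]}]
# text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# image_inputs, video_inputs = process_vision_info(messages)
# inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
#                    padding=True, return_tensors="pt").to(model.device)
# output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
# trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
# pred = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
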
if __name__ == "__main__":

    folder = "./your_data_folder/"
    episodes = json.load(open(folder + "your_data_path.json", "r"))
    k = 4  # number of most recent action summaries kept as history
    inference_data = []
    for i, episode in tqdm(enumerate(episodes)):
        hist_horizon = []
        for t, step in tqdm(enumerate(episode)):
            cur_hist = ""
            # You can also build an ADB pipeline for online execution (a sketch follows this code block): https://developer.android.com/tools/adb
            goal, gt_action, img, ep_id = step["goal"], step["ground_truth"], step["image_path"], step["episode_id"]
            img_url = 'http://localhost:6666/' + img

            if len(hist_horizon) == 0:
                cur_hist = "This is the task's initial state."
            else:
                for j, act2sum_ in enumerate(hist_horizon[-k:]):  # j, not i, to avoid shadowing the episode index
                    cur_hist += 'Step' + str(j+1) + ': ' + act2sum_ + ".\n"

            pred = execute((goal, cur_hist, img_url))
            think, pred_action = pred.split("<think>")[-1].split("</think>")[0].strip(), pred.split("<answer>")[-1].split("</answer>")[0].strip()

            #############
            # act2sum = act2sum_fn((goal, gt_action, img_url))  # Can be used for static inference
            act2sum = act2sum_fn((goal, pred_action, img_url))  # Can be used for online inference
            hist_horizon.append(act2sum.split("<summary>")[-1].split("</summary>")[0])

            inference_data.append({
                "episode_id": ep_id,
                "image_path": img,
                "goal": goal,
                "pred": pred,
                "history": cur_hist,
                "ground_truth": gt_action
            })
    # evaluate(inference_data)
    with open("your_saving_path.json", "w") as f:
        f.write(json.dumps(inference_data, indent=4))
```
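
The script assumes two local services, both placeholders for your own setup: an OpenAI-compatible chat endpoint on port 8000 (e.g., a vLLM or ms-swift deployment of the model) and a plain HTTP file server on port 6666 that exposes the screenshot folder.

`evaluate(inference_data)` is referenced above but not defined in this snippet. A minimal sketch of what it could compute, assuming exact string match between the predicted `<answer>` and the ground-truth action (this body is an assumption, not the repo's official metric):

```python
def evaluate(inference_data):
    # Hypothetical exact-match metric; real GUI-agent evaluations are often more lenient,
    # e.g., counting a CLICK as correct if it lands anywhere inside the target element.
    correct = 0
    for item in inference_data:
        pred_action = item["pred"].split("<answer>")[-1].split("</answer>")[0].strip()
        if pred_action == item["ground_truth"].strip():
            correct += 1
    print(f"Exact-match accuracy: {correct}/{len(inference_data)}")
```

For the ADB pipeline mentioned inside the loop, a predicted action string could be dispatched to a device roughly as follows (a sketch only: `execute_on_device`, the screen size, and the scroll-direction convention are all assumptions, not part of the repo):

```python
import re
import subprocess

def execute_on_device(action):
    # Hypothetical dispatcher from ACTION_SPACE strings to adb input commands.
    if action.startswith(("CLICK:", "LONG_PRESS:")):
        x, y = map(int, re.search(r"\((\d+),(\d+)\)", action).groups())
        if action.startswith("CLICK:"):
            subprocess.run(["adb", "shell", "input", "tap", str(x), str(y)])
        else:
            # Emulate a long press as a stationary one-second swipe.
            subprocess.run(["adb", "shell", "input", "swipe", str(x), str(y), str(x), str(y), "1000"])
    elif action.startswith("TYPE:"):
        text = action[len("TYPE:"):].strip().strip('"')
        subprocess.run(["adb", "shell", "input", "text", text.replace(" ", "%s")])
    elif action.startswith("SCROLL:"):
        # Swipe from the screen centre; coordinates assume a 1080x2400 display,
        # and the direction convention may need flipping for your dataset.
        swipes = {"UP": (540, 1600, 540, 800), "DOWN": (540, 800, 540, 1600),
                  "LEFT": (800, 1200, 280, 1200), "RIGHT": (280, 1200, 800, 1200)}
        x1, y1, x2, y2 = swipes[action.split(":")[1]]
        subprocess.run(["adb", "shell", "input", "swipe", str(x1), str(y1), str(x2), str(y2), "300"])
    elif action == "BACK":
        subprocess.run(["adb", "shell", "input", "keyevent", "KEYCODE_BACK"])
    elif action == "HOME":
        subprocess.run(["adb", "shell", "input", "keyevent", "KEYCODE_HOME"])
```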