BigTaige committed on
Commit 56253ee · verified · 1 Parent(s): d381cec

Update README.md

Files changed (1):
  1. README.md +145 -8

README.md CHANGED
@@ -1,8 +1,145 @@
- ---
- license: mit
- base_model:
- - Qwen/Qwen2.5-VL-3B-Instruct
- pipeline_tag: image-text-to-text
- tags:
- - not-for-all-audiences
- ---
import requests
import json
from tqdm import tqdm

#############################################################################################
ACTION_SPACE = """
CLICK:(x,y): Click on the element at the coordinate point (x,y) on the screen, e.g., CLICK:(1980,224).
TYPE:typed_text: An action of typing a piece of text, e.g., TYPE:"Macbook-Pro 16G Black".
COMPLETE: The goal has been completed in the current screen state.
SCROLL:UP/DOWN/LEFT/RIGHT: Scroll in a specific direction, e.g., SCROLL:UP.
LONG_PRESS:(x,y): Long press at a specific point (x,y) on the screen, e.g., LONG_PRESS:(345,2218).
BACK: Go back to the previous screen, e.g., BACK.
HOME: Go to the home screen, e.g., HOME.
=========================================
OTHER_CUSTOM_ACTIONS: ...
"""

INFERENCE_INSTRUCTION = f"""
You are a skilled assistant, interacting with the screen to accomplish the user's goals.
Here is the action space:
{ACTION_SPACE}
Your overall goal is: <goal>(goal)</goal>
Actions completed at previous steps: <history>(history)</history>

The output format should be as follows:
<think>Analyze step by step based on guidance and screen state to choose the action.</think>
<answer>The action you finally choose from "action space".</answer>"""

ACT2SUM_INSTRUCTION = f"""
Step-by-step GUI navigation task. Briefly summarize the current action.
Action space:
{ACTION_SPACE}
Goal: <goal>(goal)</goal>
Current action: <action>(action)</action>

Output Format: <summary>One-sentence summary of the action based on the screen image.</summary>"""
#############################################################################################
def execute(meta_data):
    # Fill the inference template with the goal and history, then query the model.
    goal, hist, img_url = meta_data
    inference_temp = INFERENCE_INSTRUCTION.replace("(goal)", goal).replace("(history)", hist)
    pred = chat_HAR_GUI_3B(img_url, inference_temp)
    return pred

def act2sum_fn(meta_data):
    # Summarize the chosen action into one sentence for the rolling history.
    goal, cur_action, img_url = meta_data
    act2sum_temp = ACT2SUM_INSTRUCTION.replace("(goal)", goal).replace("(action)", cur_action)
    pred = chat_HAR_GUI_3B(img_url, act2sum_temp)
    # pred = chat_72B(img_url, act2sum_temp)
    return pred
#############################################################################################


url = "http://localhost:8000/v1/chat/completions"
headers = {"Content-Type": "application/json"}

def chat_HAR_GUI_3B(img_url, query):
    # Query HAR-GUI-3B through an OpenAI-compatible chat-completions endpoint.
    content = [
        {"type": "image_url", "image_url": {"url": img_url}},
        {"type": "text", "text": query},
    ]
    data = {
        "model": "Qwen2.5-VL-3B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content},
        ],
        "temperature": 0,
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    return response.json()['choices'][0]['message']['content']

def chat_72B(img_url, query):
    # Same request against a larger 72B model (an alternative summarizer for act2sum_fn).
    content = [
        {"type": "image_url", "image_url": {"url": img_url}},
        {"type": "text", "text": query},
    ]
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content},
        ],
        "temperature": 0,
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    return response.json()['choices'][0]['message']['content']

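## The two chat helpers above assume an OpenAI-compatible server on localhost:8000.
## As one hypothetical example (the serving stack is not prescribed here), a vLLM
## launch could look like:
##   vllm serve ./models/HAR-GUI-3B --served-model-name Qwen2.5-VL-3B-Instruct --port 8000
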
## Alternatively, you can load the model weights directly, e.g. with Transformers as
## follows (or use the Swift inference framework for faster speed):
##################################################################################
# import torch
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
# MAX_IMAGE_PIXELS = 2048*28*28
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "./models/HAR-GUI-3B",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto"
# )
# processor = AutoProcessor.from_pretrained("./models/HAR-GUI-3B", max_pixels=MAX_IMAGE_PIXELS, padding_side="left")
##################################################################################
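# A minimal generation sketch for the Transformers path above -- an illustration only,
# assuming the qwen_vl_utils helper from the Qwen2.5-VL examples:
# from qwen_vl_utils import process_vision_info
# messages = [{"role": "user", "content": [{"type": "image", "image": img_url},
#                                          {"type": "text", "text": query}]}]
# text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# image_inputs, video_inputs = process_vision_info(messages)
# inputs = processor(text=[text], images=image_inputs, videos=video_inputs,
#                    padding=True, return_tensors="pt").to(model.device)
# generated = model.generate(**inputs, max_new_tokens=512)
# trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
# pred = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
##################################################################################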

if __name__ == "__main__":

    folder = "./your_data_folder/"
    episodes = json.load(open(folder + "your_data_path.json", "r"))
    k = 4  # number of most recent action summaries kept in the history prompt
    inference_data = []
    for i, episode in tqdm(enumerate(episodes)):
        hist_horizon = []
        for t, step in tqdm(enumerate(episode)):
            cur_hist = ""
            # You can also build an ADB pipeline for online execution (a hypothetical
            # dispatcher is sketched after this script): https://developer.android.com/tools/adb
            goal, gt_action, img, ep_id = step["goal"], step["ground_truth"], step["image_path"], step["episode_id"]
            img_url = 'http://localhost:6666/' + img  # screenshots served locally, e.g. via `python -m http.server 6666`

            if len(hist_horizon) == 0:
                cur_hist = "This is the task's initial state."
            else:
                # Note: j, not i, so the episode index above is not shadowed.
                for j, act2sum_ in enumerate(hist_horizon[-k:]):
                    cur_hist += 'Step' + str(j+1) + ': ' + act2sum_ + ".\n"

            pred = execute((goal, cur_hist, img_url))
            think, pred_action = pred.split("<think>")[-1].split("</think>")[0].strip(), pred.split("<answer>")[-1].split("</answer>")[0].strip()

            #############
            # act2sum = act2sum_fn((goal, gt_action, img_url))  # Can be used for static inference
            act2sum = act2sum_fn((goal, pred_action, img_url))  # Can be used for online inference
            hist_horizon.append(act2sum.split("<summary>")[-1].split("</summary>")[0])

            inference_data.append({
                "episode_id": ep_id,
                "image_path": img,
                "goal": goal,
                "pred": pred,
                "history": cur_hist,
                "ground_truth": gt_action
            })
    # evaluate(inference_data)
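    # evaluate() is not defined in this repo; purely as an illustration, an
    # exact-match evaluator over the saved records could look like:
    # def evaluate(inference_data):
    #     hits = 0
    #     for item in inference_data:
    #         pred_action = item["pred"].split("<answer>")[-1].split("</answer>")[0].strip()
    #         hits += int(pred_action == item["ground_truth"].strip())
    #     print(f"Exact-match action accuracy: {hits / len(inference_data):.4f}")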
    with open("your_saving_path.json", "w") as f:
        f.write(json.dumps(inference_data, indent=4))
+ f.write(json.dumps(inference_data, indent=4))