hasannyr committed on
Commit
177875a
·
1 Parent(s): 9d0ef93

Added RL-based test dashboard

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
test/.DS_Store ADDED
Binary file (6.15 kB). View file
 
test/all_load_mpa_cpu_and_performance_without_average.csv ADDED
The diff for this file is too large to render. See raw diff
 
test/dashboard.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import solara
2
+ import pandas as pd
3
+ import os
4
+ import zipfile
5
+ from ray.tune.registry import register_env
6
+ from env import Teastore
7
+ from ray.rllib.algorithms import ppo, sac, dqn
8
+ from solara.components.file_drop import FileInfo
9
+ import time
10
+
11
# Shared reactive state for the dashboard.

# Per-step series drawn by the charts; every list has one entry per test step.
test_plot_data = solara.reactive(
    {
        'step': [],
        'replica': [],
        'cpu': [],
        "load": [],
        "num_request": [],
        "response_time": [],
    }
)
# The restored algorithm object, or None while no agent has been loaded.
uploaded_algo = solara.reactive(None)
# Message shown in the sidebar error box; None means "no error".
error_state = solara.reactive(None)
# Number of environment steps a test run performs (bound to the slider).
number_of_steps = solara.reactive(10)
# Checkpoint names offered in the "Choose checkpoint" select box.
available_checkpoint_names = solara.reactive([])
# Name currently picked in the select box, or None.
selected_checkpoint_name = solara.reactive(None)
# True once an agent has been loaded successfully.
uploaded_algo_status = solara.reactive(False)
19
+
20
+
21
+
22
def _line_chart_options(title, steps, series):
    """Build the ECharts option dict for one line chart.

    Args:
        title: Text shown centred above the chart.
        steps: Values for the category x-axis.
        series: Mapping of series name to its list of y-values. One line is
            drawn per entry, in insertion order, and the legend lists the
            same names.

    Returns:
        A dict suitable for ``solara.FigureEcharts(option=...)``.
    """
    return {
        "xAxis": {
            "type": "category",
            "data": steps,
        },
        "yAxis": {
            "type": "value",
        },
        "series": [
            {"name": name, "data": values, "type": 'line'}
            for name, values in series.items()
        ],
        "title": {
            "text": title,
            "left": "center",
        },
        "legend": {
            "orient": 'vertical',
            "right": 0,
            "data": list(series),
        },
    }


@solara.component
def status_plot(data):
    """Render the four test-result charts stacked in a single column.

    Args:
        data: Mapping with the keys ``step``, ``replica``, ``cpu``, ``load``,
            ``num_request`` and ``response_time``; each value is a list with
            one entry per test step.
    """
    steps = data["step"]
    # (title, {series name: values}) for each chart, in display order.
    charts = [
        ('Replica Number', {"Replica": data['replica']}),
        ('CPU Limit', {"CPU": data['cpu']}),
        (
            'Number of Request (Tps) and Load (Tps)',
            {"Processed req": data['num_request'], "Load": data['load']},
        ),
        ('Response time (ms)', {"Response time": data['response_time']}),
    ]

    with solara.GridFixed(columns=1):
        for title, series in charts:
            solara.FigureEcharts(option=_line_chart_options(title, steps, series))
152
+
153
+
154
+
155
+
156
@solara.component
def CheckpointDrop():
    """Render a drop zone that accepts a zipped checkpoint and unpacks it.

    The dropped archive is extracted to
    ``extracted_files/<archive name without extension>``. Afterwards
    ``available_checkpoint_names`` is refreshed with the sub-folders of
    ``extracted_files``, which is the same location ``load_agent`` restores
    from. Problems with the upload are reported through ``error_state``.
    """
    zip_content, set_zip_content = solara.use_state("")
    content, set_content = solara.use_state(b"")
    filename, set_filename = solara.use_state("")
    size, set_size = solara.use_state(0)
    extract_path, set_extract_path = solara.use_state("")

    def on_file(f: FileInfo):
        import io

        # The name is supplied by the browser: keep only its last path
        # component so it can never point outside ``extracted_files``.
        safe_name = os.path.basename(f["name"])
        folder_name = os.path.splitext(safe_name)[0]
        if not folder_name:
            error_state.set('Uploaded file has no usable name')
            return

        set_filename(safe_name)
        set_size(f["size"])

        extracted_folder = os.path.join("extracted_files", folder_name)
        # Unpack straight from memory. Writing the upload to the working
        # directory under its own name could overwrite (and then delete) an
        # existing file, and left the temp file behind when extraction failed.
        payload = io.BytesIO(f["file_obj"].read())
        try:
            with zipfile.ZipFile(payload) as zip_ref:
                zip_ref.extractall(extracted_folder)
        except zipfile.BadZipFile:
            error_state.set(f'{safe_name} is not a valid zip archive')
            return

        set_extract_path(extracted_folder)
        set_zip_content("\n".join(os.listdir(extracted_folder)))

        # Offer every extracted checkpoint folder in the select box.
        with os.scandir("extracted_files") as entries:
            names = sorted(entry.name for entry in entries if entry.is_dir())
        available_checkpoint_names.set(names)

    solara.FileDrop(
        label="Drag and drop a file here",
        on_file=on_file,
        lazy=True,  # hand over a file object; on_file reads the whole upload
    )
194
+
195
@solara.component
def ListAvailableCheckpoints():
    """Refresh ``available_checkpoint_names`` from the ``extracted_files`` folder.

    Creates no UI elements. Each render lists the immediate sub-folders of
    ``extracted_files`` (sorted, so the select box order is stable) and
    publishes them. A missing folder yields an empty list instead of
    raising ``FileNotFoundError``.
    """
    root = "extracted_files"
    if os.path.isdir(root):
        # The context manager closes the directory iterator promptly.
        with os.scandir(root) as entries:
            names = sorted(entry.name for entry in entries if entry.is_dir())
    else:
        names = []

    # Only publish real changes so a render does not schedule another render.
    if names != available_checkpoint_names.value:
        available_checkpoint_names.set(names)
199
+
200
+
201
def load_agent():
    """Build a DQN algorithm and restore the currently selected checkpoint.

    Returns:
        The restored algorithm, or ``None`` when no checkpoint is selected.
    """
    chosen = selected_checkpoint_name.value
    if chosen is None:
        return None

    # The config below refers to the environment by this registered name.
    register_env("teastore", lambda config: Teastore())

    settings = dqn.DQNConfig()
    settings = settings.environment(env="teastore")
    settings = settings.rollouts(
        num_rollout_workers=1,
        enable_connectors=False,
        num_envs_per_worker=1,
    )
    settings = settings.resources(num_gpus=0, num_cpus_per_worker=1)
    settings = settings.training(
        train_batch_size=256,
        model={"fcnet_hiddens": [32, 32]},
    )

    agent = settings.build()
    agent.restore(os.path.join("extracted_files", chosen))
    return agent
222
+
223
def start_test():
    """Run the loaded agent on a fresh ``Teastore`` episode and plot the trace.

    Performs up to ``number_of_steps.value`` steps with exploration turned
    off. Before each step the current replica count, CPU value, load,
    response time and processed-request count are recorded. After each step
    the accumulated trace is published to ``test_plot_data``. The run stops
    early if the environment reports the end of the episode.

    If no agent has been loaded, a message is stored in ``error_state`` and
    nothing is run.
    """
    agent = uploaded_algo.value
    if agent is None:
        error_state.set('Load an agent before running a test')
        return

    env = Teastore()
    obs, _ = env.reset()

    trace = {
        'step': [],
        'replica': [],
        'cpu': [],
        "load": [],
        "response_time": [],
        "num_request": [],
    }

    for i in range(number_of_steps.value):
        trace['step'].append(i)
        trace['replica'].append(obs[0])
        trace['cpu'].append(obs[1])
        trace['load'].append(env.load)
        trace['response_time'].append(env.response_time)
        trace['num_request'].append(env.num_request)

        action = agent.compute_single_action(obs, explore=False)
        obs, _, terminated, truncated, _ = env.step(action)

        # Publish copies: the reactive value must be a new object each time,
        # and the lists in ``trace`` keep growing.
        test_plot_data.set({key: values.copy() for key, values in trace.items()})

        if terminated or truncated:
            break
261
+
262
def load_test():
    """Load the selected checkpoint and update the dashboard state.

    On success the agent is stored in ``uploaded_algo``,
    ``uploaded_algo_status`` becomes ``True`` and any earlier error message
    is cleared. On failure the reason is stored in ``error_state`` and a
    previously loaded agent is left untouched.
    """
    try:
        algo = load_agent()
    except Exception as exc:
        # UI boundary: building or restoring the algorithm can fail in many
        # ways (missing files, incompatible checkpoint). Show the reason in
        # the sidebar instead of letting it escape the click handler.
        error_state.set(f'Couldnt load checkpoint: {exc}')
        return

    if algo is None:
        error_state.set('Couldnt load checkpoint')
        return

    uploaded_algo_status.set(True)
    uploaded_algo.set(algo)
    # Drop a stale message left over from an earlier failed attempt.
    error_state.set(None)
269
+
270
+
271
@solara.component
def Page():
    """Top-level page: controls in the sidebar, result charts in the main area."""
    with solara.Sidebar():
        failure = error_state.value
        if failure is not None:
            solara.Error(label=f'{failure}')

        # CheckpointDrop()
        ListAvailableCheckpoints()
        solara.Select(
            label="Choose checkpoint",
            values=available_checkpoint_names.value,
            value=selected_checkpoint_name.value,
            on_value=selected_checkpoint_name.set,
        )

        # A test needs a loaded agent; loading needs a selected checkpoint.
        solara.Button(
            label="Run test",
            on_click=start_test,
            disabled=uploaded_algo.value is None,
        )
        solara.Button(
            label="Load agent",
            on_click=load_test,
            disabled=selected_checkpoint_name.value is None,
        )

        if uploaded_algo_status.value:
            solara.Info("Agent is ready for test")
        else:
            solara.Info("Agent is not uploaded")

        solara.SliderInt(label="choose number of steps", min=1, max=500, value=number_of_steps)

    status_plot(test_plot_data.value)
293
+
294
+
295
+
296
+
test/env.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### No heap in state
2
+
3
+ from ray.rllib.env.policy_client import PolicyClient
4
+ import pandas as pd
5
+ from prometheus_api_client import PrometheusConnect
6
+ from kubernetes import client, config
7
+ import time
8
+ import numpy as np
9
+ from collections import OrderedDict
10
+ from gymnasium.spaces import Discrete, Dict, MultiDiscrete, Tuple, Box
11
+ import ssl
12
+ import random
13
+ import logging
14
+ ssl._create_default_https_context = ssl._create_unverified_context
15
+ from itertools import product
16
+ import time
17
+ import gymnasium as gym
18
+ import math
19
+
20
+
21
+
22
class Teastore(gym.Env):
    """Table-driven environment that replays recorded measurements from a CSV.

    The observation is ``[replica, cpu, previous_tps, expected_tps]``, taken
    from the CSV columns of the same names. The five discrete actions change
    the first two components:

    * 0 - keep replica and cpu
    * 1 - replica + 1
    * 2 - replica - 1
    * 3 - cpu + 1
    * 4 (or anything else) - cpu - 1

    After every step the expected load moves by 24 in the current direction.
    The direction flips to "down" when the load equals 144 and to "up" when
    it equals 24. If the CSV has a row for the resulting state, one matching
    row is sampled and supplies the reward and metrics; otherwise the
    replica/cpu values stay unchanged and a penalty is returned.

    ``load``, ``response_time`` and ``num_request`` expose the metrics of the
    current state for callers that want to plot them.
    """

    DATA_PATH = "./all_load_mpa_cpu_and_performance_without_average.csv"
    MAX_STEPS = 500

    def __init__(self) -> None:
        self.data = pd.read_csv(self.DATA_PATH)
        # drop_rows = (df["cpu_usage"] != 0) | (df["memory_usage"] != 0)
        # self.data = df[drop_rows].reset_index(drop=True)
        self.action_space = Discrete(5)
        self.observation_space = Box(
            low=np.array([1, 4, 0, 0]),
            high=np.array([3, 9, 1000, 1000]),
            dtype=np.float32,
        )
        self.count = 0
        self.info = {}
        self.previous_tps = 0
        self.idx = 0
        # Load direction: True = rising, False = falling, None = not yet known.
        self.up = None
        self.load = 0
        self.response_time = 0
        self.num_request = 0

    def find_next_state(self, target, expected_tps):
        """Find the CSV rows that match the state following ``target``.

        Updates ``self.up`` when ``expected_tps`` equals one of the turning
        points (144 or 24), then advances the load by 24 in that direction.

        Args:
            target: The ``[replica, cpu]`` pair to look up.
            expected_tps: The expected load of the current state.

        Returns:
            ``(matched_indexes, new_previous_tps, new_expected_tps)`` where
            ``matched_indexes`` is a possibly empty list of positional row
            indexes into ``self.data``.

        Raises:
            RuntimeError: If the load direction is still unknown, i.e.
                ``reset()`` has not run and ``expected_tps`` is not a
                turning point.
        """
        if expected_tps == 144:
            self.up = False
        elif expected_tps == 24:
            self.up = True

        if self.up is None:
            # Previously this fell through and failed with UnboundLocalError.
            raise RuntimeError(
                "load direction is unknown; call reset() before find_next_state()"
            )

        if self.up:
            new_expected_tps = expected_tps + 24
        else:
            new_expected_tps = expected_tps - 24

        new_previous_tps = expected_tps
        wanted = np.concatenate([target, [new_previous_tps, new_expected_tps]])
        columns = ["replica", "cpu", "previous_tps", "expected_tps"]
        equal_rows = np.all(self.data.loc[:, columns].values == wanted, axis=1)
        matched_indexes = np.where(equal_rows)[0]
        return matched_indexes.tolist(), new_previous_tps, new_expected_tps

    def reset(self, *, seed=None, options=None):
        """Start a new episode from a randomly chosen CSV row.

        Args:
            seed: Optional seed for the environment's random generator,
                making the sampled start row (and later row choices)
                reproducible.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)``.
        """
        # Seeds ``self.np_random`` as the Gymnasium API requires.
        super().reset(seed=seed)

        self.idx = int(self.np_random.integers(len(self.data)))
        self.state = np.array(
            self.data.loc[self.idx, ["replica", "cpu", 'previous_tps', "expected_tps"]]
        )
        # self.state = np.array([3,9,24,48])
        self.previous_tps = self.state[2]
        self.truncated = False
        self.terminated = False
        self.reward = 0
        self.count = 0
        self.info = {}
        # Rising if the expected load is above the previous one.
        self.up = bool(self.state[3] - self.state[2] > 0)
        self.load = self.state[-1]
        self.response_time = self.data.loc[self.idx, "response_time"]
        self.num_request = self.data.loc[self.idx, "num_request"]
        return self.state, self.info

    def step(self, action):
        """Apply ``action`` and advance the load by one step.

        Args:
            action: Integer action, see the class docstring.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. When no
            CSV row matches the requested state the reward is ``-5``,
            ``num_request`` is ``0`` and ``response_time`` is ``200``.
        """
        self.count += 1

        if action == 0:
            delta = np.array([0, 0])
        elif action == 1:  # increase_replica
            delta = np.array([1, 0])
        elif action == 2:  # decrease_replica
            delta = np.array([-1, 0])
        elif action == 3:
            delta = np.array([0, 1])
        else:
            delta = np.array([0, -1])
        temp_state = self.state[0:2] + delta

        idx, new_previous_tps, new_expected_tps = self.find_next_state(
            temp_state, self.state[3]
        )

        if idx:
            # Several recorded rows may share the same state; sample one.
            selected_row_idx = idx[int(self.np_random.integers(len(idx)))]
            # ``idx`` holds positions, so read everything positionally.
            selected_data = self.data.iloc[selected_row_idx]
            self.state = np.array(
                selected_data[["replica", "cpu", 'previous_tps', "expected_tps"]]
            )
            self.reward = selected_data['reward']
            self.previous_tps = selected_data["expected_tps"]
            self.num_request = selected_data["num_request"]
            self.response_time = selected_data["response_time"]
        else:
            # No recorded row for this state: keep replica/cpu, move the load
            # on and penalise. Work on a copy so the observation returned by
            # the previous call is not modified behind the caller's back.
            self.state = self.state.copy()
            self.state[2] = new_previous_tps
            self.state[3] = new_expected_tps
            self.previous_tps = new_expected_tps
            self.reward = -5
            self.num_request = 0
            self.response_time = 200

        self.load = self.state[-1]

        # NOTE(review): the step limit sets both flags; Gymnasium describes a
        # time limit as truncation only. Left unchanged because existing
        # checkpoints were trained with this behaviour - confirm before changing.
        self.terminated = (self.count >= self.MAX_STEPS)
        self.truncated = self.terminated
        return self.state, self.reward, self.terminated, self.truncated, self.info
127
+
128
+
129
+
130
+
131
+
132
+
133
+
test/extracted_files/.DS_Store ADDED
Binary file (6.15 kB). View file
 
test/extracted_files/case5/.is_checkpoint ADDED
File without changes
test/extracted_files/case5/.tune_metadata ADDED
Binary file (11.1 kB). View file
 
test/extracted_files/case5/algorithm_state.pkl ADDED
Binary file (6.57 kB). View file
 
test/extracted_files/case5/policies/default_policy/policy_state.pkl ADDED
Binary file (244 kB). View file
 
test/extracted_files/case5/policies/default_policy/rllib_checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"type": "Policy", "checkpoint_version": "1.1", "format": "cloudpickle", "state_file": "policy_state.pkl", "ray_version": "2.4.0", "ray_commit": "4479f66d4db967d3c9dd0af2572061276ba926ba"}
test/extracted_files/case5/rllib_checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"type": "Algorithm", "checkpoint_version": "1.1", "format": "cloudpickle", "state_file": "/Users/hasan.nayir/ray_results/DQN_teastore_2024-03-27_13-22-15jpy5kzub/checkpoint_010000/algorithm_state.pkl", "policy_ids": ["default_policy"], "ray_version": "2.4.0", "ray_commit": "4479f66d4db967d3c9dd0af2572061276ba926ba"}
test/extracted_files/case6/.is_checkpoint ADDED
File without changes
test/extracted_files/case6/.tune_metadata ADDED
Binary file (11.1 kB). View file
 
test/extracted_files/case6/algorithm_state.pkl ADDED
Binary file (6.57 kB). View file
 
test/extracted_files/case6/policies/default_policy/policy_state.pkl ADDED
Binary file (244 kB). View file
 
test/extracted_files/case6/policies/default_policy/rllib_checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"type": "Policy", "checkpoint_version": "1.1", "format": "cloudpickle", "state_file": "policy_state.pkl", "ray_version": "2.4.0", "ray_commit": "4479f66d4db967d3c9dd0af2572061276ba926ba"}
test/extracted_files/case6/rllib_checkpoint.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"type": "Algorithm", "checkpoint_version": "1.1", "format": "cloudpickle", "state_file": "/Users/hasan.nayir/ray_results/DQN_teastore_2024-03-29_16-52-25n5fvwg_p/checkpoint_010000/algorithm_state.pkl", "policy_ids": ["default_policy"], "ray_version": "2.4.0", "ray_commit": "4479f66d4db967d3c9dd0af2572061276ba926ba"}