Jompatron commited on
Commit
e63fa6f
·
1 Parent(s): c5af328

Add dashboard files

Browse files
airquality/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+
airquality/air_quality_data_retrieval.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import Any, Dict, List
3
+ import datetime
4
+ import pandas as pd
5
+ import hopsworks
6
+ from hsfs.feature import Feature
7
+
8
+ def get_historical_data_for_date(date: str, feature_view, weather_fg, model) -> pd.DataFrame:
9
+ """
10
+ Retrieve data for a specific date from a feature view.
11
+
12
+ Args:
13
+ date (str): The date in the format "%Y-%m-%d".
14
+ feature_view: The feature view object.
15
+ model: The machine learning model used for prediction.
16
+
17
+ Returns:
18
+ pd.DataFrame: A DataFrame containing data for the specified date.
19
+ """
20
+ # Convert date string to datetime object
21
+ date_datetime = datetime.datetime.strptime(date, "%Y-%m-%d").date()
22
+
23
+ features_df, labels_df = feature_view.training_data(
24
+ start_time=date_datetime,
25
+ end_time=date_datetime + datetime.timedelta(days=1),
26
+ # event_time=True,
27
+ statistics_config=False
28
+ )
29
+ # bugfix line, shouldn't need to cast to datetime
30
+ features_df['date'] = pd.to_datetime(features_df['date'])
31
+ batch_data = features_df
32
+ batch_data['pm25'] = labels_df['pm25']
33
+ batch_data['date'] = batch_data['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
34
+
35
+ return batch_data[['date', 'pm25']].sort_values('date').reset_index(drop=True)
36
+
37
+
38
+ def get_historical_data_in_date_range(date_start: str, date_end: str, feature_view, weather_fg, model) -> pd.DataFrame:
39
+ """
40
+ Retrieve data for a specific date range from a time in the past from a feature view.
41
+
42
+ Args:
43
+ date_start (str): The start date in the format "%Y-%m-%d".
44
+ date_end (str): The end date in the format "%Y-%m-%d".
45
+ feature_view: The feature view object.
46
+ model: The machine learning model used for prediction.
47
+
48
+ Returns:
49
+ pd.DataFrame: A DataFrame containing data for the specified date range.
50
+ """
51
+ # Convert date strings to datetime objects
52
+ # date_start_dt = datetime.datetime.strptime(date_start, "%Y-%m-%d").date()
53
+ # date_end_dt = datetime.datetime.strptime(date_end, "%Y-%m-%d").date()
54
+
55
+ batch_data = feature_view.query.read()
56
+ batch_data = batch_data[(batch_data['date'] >= date_start) & (batch_data['date'] <= date_end)]
57
+
58
+ batch_data['date'] = batch_data['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
59
+
60
+ return batch_data[['date', 'pm25']].sort_values('date').reset_index(drop=True)
61
+
62
+ def get_future_data_for_date(date: str, feature_view, weather_fg, model) -> pd.DataFrame:
63
+ """
64
+ Predicts future PM2.5 data for a specified date using a given feature view and model.
65
+
66
+ Args:
67
+ date (str): The date in the format "%Y-%m-%d".
68
+ feature_view: The feature view object.
69
+ model: The machine learning model used for prediction.
70
+
71
+ Returns:
72
+ pd.DataFrame: A DataFrame containing data for the specified date.
73
+ """
74
+ date_start_dt = datetime.datetime.strptime(date, "%Y-%m-%d") #.date()
75
+ fg_data = weather_fg.read()
76
+
77
+ # Couldn't get our filters to work, so filter in memory
78
+ df = fg_data[fg_data.date == date_start_dt]
79
+ batch_data = df.drop(['date', 'city'], axis=1)
80
+
81
+ df['pm25'] = model.predict(batch_data)
82
+
83
+ return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)
84
+
85
+
86
+
87
+ def get_future_data_in_date_range(date_start: str, date_end: str, feature_view, weather_fg, model) -> pd.DataFrame:
88
+ """
89
+ Predicts future PM2.5 data for a specified start and end date range using a given feature view and model.
90
+
91
+ Args:
92
+ date_start (str): The start date in the format "%Y-%m-%d".
93
+ date_end (str): The end date in the format "%Y-%m-%d".
94
+ feature_view: The feature view object.
95
+ model: The machine learning model used for prediction.
96
+
97
+ Returns:
98
+ pd.DataFrame: A DataFrame containing data for the specified date range.
99
+ """
100
+ date_start_dt = datetime.datetime.strptime(date_start, "%Y-%m-%d") #.date()
101
+ if date_end == None:
102
+ date_end = date_start
103
+ date_end_dt = datetime.datetime.strptime(date_end, "%Y-%m-%d") #.date()
104
+
105
+ fg_data = weather_fg.read()
106
+ # Fix bug: Cannot compare tz-naive and tz-aware datetime-like objects
107
+ fg_data['date'] = pd.to_datetime(fg_data['date']).dt.tz_localize(None)
108
+
109
+ # Couldn't get our filters to work, so filter in memory
110
+ df = fg_data[(fg_data['date'] >= date_start_dt) & (fg_data['date'] <= date_end_dt)]
111
+ batch_data = df.drop(['date', 'city'], axis=1)
112
+
113
+ df['pm25'] = model.predict(batch_data)
114
+
115
+ return df[['date', 'pm25']].sort_values('date').reset_index(drop=True)
airquality/util.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datetime
3
+ import time
4
+ import requests
5
+ import pandas as pd
6
+ import json
7
+ from geopy.geocoders import Nominatim
8
+ import matplotlib.pyplot as plt
9
+ from matplotlib.patches import Patch
10
+ from matplotlib.ticker import MultipleLocator
11
+ import openmeteo_requests
12
+ import requests_cache
13
+ from retry_requests import retry
14
+ import hopsworks
15
+ import hsfs
16
+ from pathlib import Path
17
+
18
+ def get_historical_weather(city, start_date, end_date, latitude, longitude):
19
+ # latitude, longitude = get_city_coordinates(city)
20
+
21
+ # Setup the Open-Meteo API client with cache and retry on error
22
+ cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
23
+ retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
24
+ openmeteo = openmeteo_requests.Client(session = retry_session)
25
+
26
+ # Make sure all required weather variables are listed here
27
+ # The order of variables in hourly or daily is important to assign them correctly below
28
+ url = "https://archive-api.open-meteo.com/v1/archive"
29
+ params = {
30
+ "latitude": latitude,
31
+ "longitude": longitude,
32
+ "start_date": start_date,
33
+ "end_date": end_date,
34
+ "daily": ["temperature_2m_mean", "precipitation_sum", "wind_speed_10m_max", "wind_direction_10m_dominant"]
35
+ }
36
+ responses = openmeteo.weather_api(url, params=params)
37
+
38
+ # Process first location. Add a for-loop for multiple locations or weather models
39
+ response = responses[0]
40
+ print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
41
+ print(f"Elevation {response.Elevation()} m asl")
42
+ print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
43
+ print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
44
+
45
+ # Process daily data. The order of variables needs to be the same as requested.
46
+ daily = response.Daily()
47
+ daily_temperature_2m_mean = daily.Variables(0).ValuesAsNumpy()
48
+ daily_precipitation_sum = daily.Variables(1).ValuesAsNumpy()
49
+ daily_wind_speed_10m_max = daily.Variables(2).ValuesAsNumpy()
50
+ daily_wind_direction_10m_dominant = daily.Variables(3).ValuesAsNumpy()
51
+
52
+ daily_data = {"date": pd.date_range(
53
+ start = pd.to_datetime(daily.Time(), unit = "s"),
54
+ end = pd.to_datetime(daily.TimeEnd(), unit = "s"),
55
+ freq = pd.Timedelta(seconds = daily.Interval()),
56
+ inclusive = "left"
57
+ )}
58
+ daily_data["temperature_2m_mean"] = daily_temperature_2m_mean
59
+ daily_data["precipitation_sum"] = daily_precipitation_sum
60
+ daily_data["wind_speed_10m_max"] = daily_wind_speed_10m_max
61
+ daily_data["wind_direction_10m_dominant"] = daily_wind_direction_10m_dominant
62
+
63
+ daily_dataframe = pd.DataFrame(data = daily_data)
64
+ daily_dataframe = daily_dataframe.dropna()
65
+ daily_dataframe['city'] = city
66
+ return daily_dataframe
67
+
68
+ def get_hourly_weather_forecast(city, latitude, longitude):
69
+
70
+ # latitude, longitude = get_city_coordinates(city)
71
+
72
+ # Setup the Open-Meteo API client with cache and retry on error
73
+ cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
74
+ retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
75
+ openmeteo = openmeteo_requests.Client(session = retry_session)
76
+
77
+ # Make sure all required weather variables are listed here
78
+ # The order of variables in hourly or daily is important to assign them correctly below
79
+ url = "https://api.open-meteo.com/v1/ecmwf"
80
+ params = {
81
+ "latitude": latitude,
82
+ "longitude": longitude,
83
+ "hourly": ["temperature_2m", "precipitation", "wind_speed_10m", "wind_direction_10m"]
84
+ }
85
+ responses = openmeteo.weather_api(url, params=params)
86
+
87
+ # Process first location. Add a for-loop for multiple locations or weather models
88
+ response = responses[0]
89
+ print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
90
+ print(f"Elevation {response.Elevation()} m asl")
91
+ print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
92
+ print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
93
+
94
+ # Process hourly data. The order of variables needs to be the same as requested.
95
+
96
+ hourly = response.Hourly()
97
+ hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
98
+ hourly_precipitation = hourly.Variables(1).ValuesAsNumpy()
99
+ hourly_wind_speed_10m = hourly.Variables(2).ValuesAsNumpy()
100
+ hourly_wind_direction_10m = hourly.Variables(3).ValuesAsNumpy()
101
+
102
+ hourly_data = {"date": pd.date_range(
103
+ start = pd.to_datetime(hourly.Time(), unit = "s"),
104
+ end = pd.to_datetime(hourly.TimeEnd(), unit = "s"),
105
+ freq = pd.Timedelta(seconds = hourly.Interval()),
106
+ inclusive = "left"
107
+ )}
108
+ hourly_data["temperature_2m_mean"] = hourly_temperature_2m
109
+ hourly_data["precipitation_sum"] = hourly_precipitation
110
+ hourly_data["wind_speed_10m_max"] = hourly_wind_speed_10m
111
+ hourly_data["wind_direction_10m_dominant"] = hourly_wind_direction_10m
112
+
113
+ hourly_dataframe = pd.DataFrame(data = hourly_data)
114
+ hourly_dataframe = hourly_dataframe.dropna()
115
+ return hourly_dataframe
116
+
117
+
118
+
119
+ def get_city_coordinates(city_name: str):
120
+ """
121
+ Takes city name and returns its latitude and longitude (rounded to 2 digits after dot).
122
+ """
123
+ # Initialize Nominatim API (for getting lat and long of the city)
124
+ geolocator = Nominatim(user_agent="Johannes-MLFS-Lab (jdunkars@kth.se)")
125
+ city = geolocator.geocode(city_name)
126
+
127
+ latitude = round(city.latitude, 2)
128
+ longitude = round(city.longitude, 2)
129
+
130
+ return latitude, longitude
131
+
132
+ def trigger_request(url:str):
133
+ response = requests.get(url)
134
+ if response.status_code == 200:
135
+ # Extract the JSON content from the response
136
+ data = response.json()
137
+ else:
138
+ print("Failed to retrieve data. Status Code:", response.status_code)
139
+ raise requests.exceptions.RequestException(response.status_code)
140
+
141
+ return data
142
+
143
+
144
+ def get_pm25(aqicn_url: str, country: str, city: str, street: str, day: datetime.date, AQI_API_KEY: str):
145
+ """
146
+ Returns DataFrame with air quality (pm25) as dataframe
147
+ """
148
+ # The API endpoint URL
149
+ url = f"{aqicn_url}/?token={AQI_API_KEY}"
150
+
151
+ # Make a GET request to fetch the data from the API
152
+ data = trigger_request(url)
153
+
154
+ # if we get 'Unknown station' response then retry with city in url
155
+ if data['data'] == "Unknown station":
156
+ url1 = f"https://api.waqi.info/feed/{country}/{street}/?token={AQI_API_KEY}"
157
+ data = trigger_request(url1)
158
+
159
+ if data['data'] == "Unknown station":
160
+ url2 = f"https://api.waqi.info/feed/{country}/{city}/{street}/?token={AQI_API_KEY}"
161
+ data = trigger_request(url2)
162
+
163
+
164
+ # Check if the API response contains the data
165
+ if data['status'] == 'ok':
166
+ # Extract the air quality data
167
+ aqi_data = data['data']
168
+ aq_today_df = pd.DataFrame()
169
+ aq_today_df['pm25'] = [aqi_data['iaqi'].get('pm25', {}).get('v', None)]
170
+ aq_today_df['pm25'] = aq_today_df['pm25'].astype('float32')
171
+
172
+ aq_today_df['country'] = country
173
+ aq_today_df['city'] = city
174
+ aq_today_df['street'] = street
175
+ aq_today_df['date'] = day
176
+ aq_today_df['date'] = pd.to_datetime(aq_today_df['date'])
177
+ aq_today_df['url'] = aqicn_url
178
+ else:
179
+ print("Error: There may be an incorrect URL for your Sensor or it is not contactable right now. The API response does not contain data. Error message:", data['data'])
180
+ raise requests.exceptions.RequestException(data['data'])
181
+
182
+ return aq_today_df
183
+
184
+ def get_pm25_test(aqicn_url: str, country: str, city: str, street: str, day: datetime.date, AQI_API_KEY: str):
185
+ """
186
+ Returns DataFrame with air quality (pm25) as dataframe
187
+ """
188
+
189
+ print("▶ Starting get_pm25()")
190
+ print(f"URL base: {aqicn_url}")
191
+ print(f"Country={country}, City={city}, Street={street}, Date={day}")
192
+
193
+ # 1️⃣ First try
194
+ url = f"{aqicn_url}/?token={AQI_API_KEY}"
195
+ print(f"Trying main URL: {url}")
196
+ try:
197
+ data = trigger_request(url)
198
+ print("✔ First request succeeded")
199
+ except Exception as e:
200
+ print("❌ First request failed:", e)
201
+ raise
202
+
203
+ # 2️⃣ Retry with other URLs if “Unknown station”
204
+ if data.get("data") == "Unknown station":
205
+ print("⚠ Unknown station, retrying with country/street...")
206
+ url1 = f"https://api.waqi.info/feed/{country}/{street}/?token={AQI_API_KEY}"
207
+ data = trigger_request(url1)
208
+ print("✔ Second request done")
209
+
210
+ if data.get("data") == "Unknown station":
211
+ print("⚠ Still unknown, retrying with country/city/street...")
212
+ url2 = f"https://api.waqi.info/feed/{country}/{city}/{street}/?token={AQI_API_KEY}"
213
+ data = trigger_request(url2)
214
+ print("✔ Third request done")
215
+
216
+ # 3️⃣ Check result
217
+ if data.get("status") == "ok":
218
+ print("✅ API responded OK")
219
+ aqi_data = data["data"]
220
+ aq_today_df = pd.DataFrame()
221
+ aq_today_df["pm25"] = [aqi_data["iaqi"].get("pm25", {}).get("v", None)]
222
+ aq_today_df["pm25"] = aq_today_df["pm25"].astype("float32")
223
+ aq_today_df["country"] = country
224
+ aq_today_df["city"] = city
225
+ aq_today_df["street"] = street
226
+ aq_today_df["date"] = pd.to_datetime(day)
227
+ aq_today_df["url"] = aqicn_url
228
+ print("✅ DataFrame created successfully")
229
+ return aq_today_df
230
+ else:
231
+ print("❌ Error: API response invalid or no data.")
232
+ print("Response content:", data)
233
+ raise requests.exceptions.RequestException(data.get("data"))
234
+
235
+
236
+
237
+ def plot_air_quality_forecast(city: str, street: str, df: pd.DataFrame, file_path: str, hindcast=False):
238
+ fig, ax = plt.subplots(figsize=(10, 6))
239
+
240
+ day = pd.to_datetime(df['date']).dt.date
241
+ # Plot each column separately in matplotlib
242
+ ax.plot(day, df['predicted_pm25'], label='Predicted PM2.5', color='red', linewidth=2, marker='o', markersize=5, markerfacecolor='blue')
243
+
244
+ # Set the y-axis to a logarithmic scale
245
+ ax.set_yscale('log')
246
+ ax.set_yticks([0, 10, 25, 50, 100, 250, 500])
247
+ ax.get_yaxis().set_major_formatter(plt.ScalarFormatter())
248
+ ax.set_ylim(bottom=1)
249
+
250
+ # Set the labels and title
251
+ ax.set_xlabel('Date')
252
+ ax.set_title(f"PM2.5 Predicted (Logarithmic Scale) for {city}, {street}")
253
+ ax.set_ylabel('PM2.5')
254
+
255
+ colors = ['green', 'yellow', 'orange', 'red', 'purple', 'darkred']
256
+ labels = ['Good', 'Moderate', 'Unhealthy for Some', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
257
+ ranges = [(0, 49), (50, 99), (100, 149), (150, 199), (200, 299), (300, 500)]
258
+ for color, (start, end) in zip(colors, ranges):
259
+ ax.axhspan(start, end, color=color, alpha=0.3)
260
+
261
+ # Add a legend for the different Air Quality Categories
262
+ patches = [Patch(color=colors[i], label=f"{labels[i]}: {ranges[i][0]}-{ranges[i][1]}") for i in range(len(colors))]
263
+ legend1 = ax.legend(handles=patches, loc='upper right', title="Air Quality Categories", fontsize='x-small')
264
+
265
+ # Aim for ~10 annotated values on x-axis, will work for both forecasts ans hindcasts
266
+ if len(df.index) > 11:
267
+ every_x_tick = len(df.index) / 10
268
+ ax.xaxis.set_major_locator(MultipleLocator(every_x_tick))
269
+
270
+ plt.xticks(rotation=45)
271
+
272
+ if hindcast == True:
273
+ ax.plot(day, df['pm25'], label='Actual PM2.5', color='black', linewidth=2, marker='^', markersize=5, markerfacecolor='grey')
274
+ legend2 = ax.legend(loc='upper left', fontsize='x-small')
275
+ ax.add_artist(legend1)
276
+
277
+ # Ensure everything is laid out neatly
278
+ plt.tight_layout()
279
+
280
+ # # Save the figure, overwriting any existing file with the same name
281
+ plt.savefig(file_path)
282
+ return plt
283
+
284
+
285
+ def delete_feature_groups(fs, name):
286
+ try:
287
+ for fg in fs.get_feature_groups(name):
288
+ fg.delete()
289
+ print(f"Deleted {fg.name}/{fg.version}")
290
+ except hsfs.client.exceptions.RestAPIError:
291
+ print(f"No {name} feature group found")
292
+
293
+ def delete_feature_views(fs, name):
294
+ try:
295
+ for fv in fs.get_feature_views(name):
296
+ fv.delete()
297
+ print(f"Deleted {fv.name}/{fv.version}")
298
+ except hsfs.client.exceptions.RestAPIError:
299
+ print(f"No {name} feature view found")
300
+
301
+ def delete_models(mr, name):
302
+ models = mr.get_models(name)
303
+ if not models:
304
+ print(f"No {name} model found")
305
+ for model in models:
306
+ model.delete()
307
+ print(f"Deleted model {model.name}/{model.version}")
308
+
309
+ def delete_secrets(proj, name):
310
+ secrets = secrets_api(proj.name)
311
+ try:
312
+ secret = secrets.get_secret(name)
313
+ secret.delete()
314
+ print(f"Deleted secret {name}")
315
+ except hopsworks.client.exceptions.RestAPIError:
316
+ print(f"No {name} secret found")
317
+
318
+ # WARNING - this will wipe out all your feature data and models
319
+ def purge_project(proj):
320
+ fs = proj.get_feature_store()
321
+ mr = proj.get_model_registry()
322
+
323
+ # Delete Feature Views before deleting the feature groups
324
+ delete_feature_views(fs, "air_quality_fv")
325
+
326
+ # Delete ALL Feature Groups
327
+ delete_feature_groups(fs, "air_quality")
328
+ delete_feature_groups(fs, "weather")
329
+ delete_feature_groups(fs, "aq_predictions")
330
+
331
+ # Delete all Models
332
+ delete_models(mr, "air_quality_xgboost_model")
333
+ delete_secrets(proj, "SENSOR_LOCATION_JSON")
334
+
335
+ def check_file_path(file_path):
336
+ my_file = Path(file_path)
337
+ if my_file.is_file() == False:
338
+ print(f"Error. File not found at the path: {file_path} ")
339
+ else:
340
+ print(f"File successfully found at the path: {file_path}")
341
+
342
+ def backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, model):
343
+ features_df = weather_fg.read()
344
+ features_df = features_df.sort_values(by=['date'], ascending=True)
345
+ features_df = features_df.tail(10)
346
+ features_df['predicted_pm25'] = model.predict(features_df[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
347
+ df = pd.merge(features_df, air_quality_df[['date','pm25','street','country']], on="date")
348
+ df['days_before_forecast_day'] = 1
349
+ hindcast_df = df
350
+ df = df.drop('pm25', axis=1)
351
+ monitor_fg.insert(df, write_options={"wait_for_job": True})
352
+ return hindcast_df
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import numpy as np
4
+ import hopsworks
5
+ from xgboost import XGBRegressor
6
+ import joblib
7
+ from openai import OpenAI
8
+ from functions.llm_chain import (
9
+ load_model,
10
+ get_llm_chain,
11
+ generate_response,
12
+ generate_response_openai,
13
+ )
14
+ # Initialize the ASR pipeline
15
+ transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
16
+
17
+ def connect_to_hopsworks():
18
+ # Initialize Hopsworks feature store connection
19
+ project = hopsworks.login()
20
+ fs = project.get_feature_store()
21
+
22
+ # Retrieve the model registry
23
+ mr = project.get_model_registry()
24
+
25
+ # Retrieve the 'air_quality_fv' feature view
26
+ feature_view = fs.get_feature_view(
27
+ name="air_quality_fv",
28
+ version=1,
29
+ )
30
+
31
+ # Initialize batch scoring
32
+ feature_view.init_batch_scoring(1)
33
+
34
+ # Retrieve the 'air_quality_xgboost_model' from the model registry
35
+ retrieved_model = mr.get_model(name="air_quality_xgboost_model", version=1)
36
+
37
+ # Download the saved model artifacts to a local directory
38
+ saved_model_dir = retrieved_model.download()
39
+
40
+ # Load the XGBoost regressor model and label encoder from the saved model directory
41
+ # model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
42
+ # Loading the XGBoost regressor model and label encoder from the saved model directory
43
+ # retrieved_xgboost_model = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
44
+ model_air_quality = XGBRegressor()
45
+
46
+ model_air_quality.load_model(saved_model_dir + "/model.json")
47
+
48
+ return feature_view, model_air_quality
49
+
50
+
51
+ def retrieve_llm_chain():
52
+ model_llm, tokenizer = load_model()
53
+ llm_chain = get_llm_chain(
54
+ model_llm,
55
+ tokenizer,
56
+ )
57
+ return model_llm, tokenizer, llm_chain
58
+
59
+ # Setup the models and feature view
60
+ feature_view, model_air_quality = connect_to_hopsworks()
61
+
62
+ def transcribe(audio):
63
+ sr, y = audio
64
+ y = y.astype(np.float32)
65
+ if y.ndim > 1 and y.shape[1] > 1:
66
+ y = np.mean(y, axis=1)
67
+ y /= np.max(np.abs(y))
68
+ return transcriber({"sampling_rate": sr, "raw": y})["text"]
69
+
70
+
71
+ def generate_query_response(user_query, method, openai_api_key=None):
72
+ if method == 'Hermes LLM':
73
+ # Load the LLM and its corresponding tokenizer and configure a language model chain
74
+ model_llm, tokenizer, llm_chain = retrieve_llm_chain()
75
+
76
+ response = generate_response(
77
+ user_query,
78
+ feature_view,
79
+ model_air_quality,
80
+ model_llm,
81
+ tokenizer,
82
+ llm_chain,
83
+ verbose=False,
84
+ )
85
+ return response
86
+
87
+ elif method == 'OpenAI API' and openai_api_key:
88
+ client = OpenAI(
89
+ api_key=openai_api_key
90
+ )
91
+
92
+ response = generate_response_openai(
93
+ user_query,
94
+ feature_view,
95
+ model_air_quality,
96
+ client,
97
+ verbose=False,
98
+ )
99
+ return response
100
+
101
+ else:
102
+ return "Invalid method or missing API key."
103
+
104
+
105
+ def handle_input(text_input=None, audio_input=None, method='Hermes LLM', openai_api_key=""):
106
+ if audio_input is not None:
107
+ user_query = transcribe(audio_input)
108
+ else:
109
+ user_query = text_input
110
+
111
+ # Check if OpenAI API key is required but not provided
112
+ if method == 'OpenAI API' and not openai_api_key.strip():
113
+ return "OpenAI API key is required for this method."
114
+
115
+ if user_query:
116
+ return generate_query_response(user_query, method, openai_api_key)
117
+ else:
118
+ return "Please provide input either via text or voice."
119
+
120
+ # Setting up the Gradio Interface
121
+ iface = gr.Interface(
122
+ fn=handle_input,
123
+ inputs=[
124
+ gr.Textbox(placeholder="Type here or use voice input..."),
125
+ gr.Audio(),
126
+ gr.Radio(["Hermes LLM", "OpenAI API"], label="Choose the response generation method"),
127
+ gr.Textbox(label="Enter your OpenAI API key (only if you selected OpenAI API):", type="password") # Removed `optional=True`
128
+ ],
129
+ outputs="text",
130
+ title="🌤️ AirQuality AI Assistant 💬",
131
+ description="Ask your questions about air quality or use your voice to interact. Select the response generation method and provide an OpenAI API key if necessary."
132
+ )
133
+
134
+ iface.launch(share=True)
requirements.txt.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hopsworks[python,great-expectations]
2
+ streamlit==1.28.2
3
+ email-validator==2.2.0
4
+ pydantic-settings>=2.6.1
5
+ geopy==2.4.1
6
+ openmeteo-requests
7
+ requests-cache==1.2.0
8
+ retry-requests==2.0.0
9
+ xgboost==2.0.3
10
+ scikit-learn==1.2.2
11
+ matplotlib==3.8.3
12
+ plotly
13
+ seaborn
14
+ nbformat
15
+ Faker
16
+ invoke
17
+ python-dotenv
18
+ #feldera==0.41