|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.optim as optim |
|
|
import torch.nn.functional as F |
|
|
from torch.utils.data import Dataset, DataLoader |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import requests |
|
|
import json |
|
|
from datetime import datetime, timedelta |
|
|
import warnings |
|
|
import time |
|
|
import os |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
import textwrap |
|
|
|
|
|
|
|
|
try: |
|
|
from huggingface_hub import HfApi, HfFolder, create_repo, whoami |
|
|
import gradio as gr |
|
|
HUGGINGFACE_LIBS_INSTALLED = True |
|
|
except ImportError: |
|
|
HUGGINGFACE_LIBS_INSTALLED = False |
|
|
|
|
|
|
|
|
# Blanket-suppress library warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Choose the compute device once at import time: CUDA when present, else CPU.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Using device: {device}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RealAirQualityCollector:
    """Fetch air-pollution measurements from OpenWeatherMap, the WHO and the EPA.

    Every network helper is best-effort: a failure is printed and an empty
    result (``[]``, ``None``, or the generated fallback sample) is returned
    instead of raising.
    """

    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    # Unit attached to every OpenWeatherMap record ("µg/m³").
    OWM_UNIT = '\u00b5g/m\u00b3'

    def __init__(self):
        # Placeholder key; the OpenWeatherMap helpers short-circuit on it.
        self.owm_api_key = "demo_key"
        # HTTPS so the API key in the query string is not sent in clear text.
        self.owm_base_url = "https://api.openweathermap.org/data/2.5/air_pollution"

        self.who_database_url = "https://cdn.who.int/media/docs/default-source/air-pollution-documents/air-quality-and-health/who_database_2024.xlsx"

        self.epa_base_url = "https://aqs.epa.gov/aqsweb/airdata"

    def get_openweathermap_data(self, lat, lon, api_key):
        """Get air pollution data from OpenWeatherMap API.

        Args:
            lat: Latitude of the location.
            lon: Longitude of the location.
            api_key: OpenWeatherMap key; ``"demo_key"`` prints sign-up
                instructions and skips the request.

        Returns:
            A list of record dicts (one per pollutant per time step), or
            ``[]`` on any error or non-200 response.
        """
        if api_key == "demo_key":
            print("π Please get a FREE API key from OpenWeatherMap:")
            print(" 1. Visit: https://openweathermap.org/api/air-pollution")
            print(" 2. Sign up for free account")
            print(" 3. Get API key (1M calls/month free)")
            return []

        try:
            response = requests.get(
                self.owm_base_url,
                params={"lat": lat, "lon": lon, "appid": api_key},
                timeout=self.REQUEST_TIMEOUT,  # never hang on a stalled connection
            )
            if response.status_code != 200:
                print(f"OpenWeatherMap API error: {response.status_code}")
                return []
            return self._parse_owm_data(response.json(), lat, lon)
        except Exception as e:
            # Deliberate best-effort boundary: report and carry on without data.
            print(f"Error fetching OpenWeatherMap data: {e}")
            return []

    def get_openweathermap_historical(self, lat, lon, start_timestamp, end_timestamp, api_key):
        """Get historical air pollution data from OpenWeatherMap.

        Args:
            lat: Latitude of the location.
            lon: Longitude of the location.
            start_timestamp: Start of the window, passed as the ``start``
                query parameter.
            end_timestamp: End of the window, passed as the ``end`` query
                parameter.
            api_key: OpenWeatherMap key; ``"demo_key"`` skips the request.

        Returns:
            A list of record dicts, or ``[]`` on any error or non-200 response.
        """
        if api_key == "demo_key":
            return []

        try:
            response = requests.get(
                f"{self.owm_base_url}/history",
                params={
                    "lat": lat,
                    "lon": lon,
                    "start": start_timestamp,
                    "end": end_timestamp,
                    "appid": api_key,
                },
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print(f"Historical API error: {response.status_code}")
                return []
            return self._parse_owm_historical_data(response.json(), lat, lon)
        except Exception as e:
            # Deliberate best-effort boundary: report and carry on without data.
            print(f"Error fetching historical data: {e}")
            return []

    def _owm_records(self, data, lat, lon):
        """Flatten an OpenWeatherMap payload into one record per pollutant.

        Shared by the current and historical parsers, which consume the same
        ``list`` / ``components`` / ``dt`` layout.
        """
        if 'list' not in data:
            return []

        records = []
        for item in data['list']:
            # NOTE(review): fromtimestamp() yields a naive local-time datetime;
            # confirm whether UTC is intended.
            timestamp = datetime.fromtimestamp(item['dt'])
            for pollutant, value in item['components'].items():
                records.append({
                    'datetime': timestamp,
                    'latitude': lat,
                    'longitude': lon,
                    'parameter': pollutant,
                    'value': value,
                    'unit': self.OWM_UNIT,
                    'source': 'OpenWeatherMap',
                })
        return records

    def _parse_owm_data(self, data, lat, lon):
        """Parse OpenWeatherMap current data into a list of record dicts."""
        return self._owm_records(data, lat, lon)

    def _parse_owm_historical_data(self, data, lat, lon):
        """Parse OpenWeatherMap historical data into a list of record dicts."""
        return self._owm_records(data, lat, lon)

    def download_who_database(self):
        """Download WHO Ambient Air Quality Database.

        Tries each candidate URL in turn, saves the first successful download
        as ``who_air_quality_2024.xlsx`` and loads its first sheet.

        Returns:
            A DataFrame read from the workbook, or the generated sample from
            ``_create_representative_sample`` when every URL fails.
        """
        try:
            print("π Downloading WHO Air Quality Database (V6.1 - 7,182 cities)...")

            who_urls = [
                self.who_database_url,
                "https://www.who.int/docs/default-source/air-pollution/air-quality-and-health/who_aaq_database_2024_v6_1.xlsx",
                "https://cdn.who.int/media/docs/default-source/air-pollution-documents/air-quality-and-health/who_aaq_database_2024_v6_1.xlsx",
            ]

            for url in who_urls:
                try:
                    response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                    if response.status_code == 200:
                        print(f"✅ Successfully downloaded WHO database from: {url}")

                        with open('who_air_quality_2024.xlsx', 'wb') as f:
                            f.write(response.content)

                        df = pd.read_excel('who_air_quality_2024.xlsx', sheet_name=0)
                        print(f"π WHO Database: {len(df)} records loaded")
                        return df
                except Exception as e:
                    # One bad mirror must not stop us from trying the next.
                    print(f"Failed to download from {url}: {e}")
                    continue

            print("π WHO database download failed. Creating sample with real city coordinates...")
            return self._create_representative_sample()

        except Exception as e:
            print(f"WHO database error: {e}")
            return self._create_representative_sample()

    def download_epa_data(self, year=2023):
        """Download EPA AirData files.

        Fetches ``daily_88101_<year>.zip``, stores it as
        ``epa_pm25_<year>.zip`` and extracts the expected CSV member into the
        working directory.

        Args:
            year: Year of the daily file to download.

        Returns:
            A DataFrame read from the CSV, or ``None`` on any failure.
        """
        try:
            print(f"π Downloading EPA AirData for {year}...")

            epa_url = f"{self.epa_base_url}/daily_88101_{year}.zip"
            response = requests.get(epa_url, timeout=self.REQUEST_TIMEOUT)

            if response.status_code != 200:
                print(f"EPA download failed: {response.status_code}")
                return None

            archive_path = f'epa_pm25_{year}.zip'
            with open(archive_path, 'wb') as f:
                f.write(response.content)

            import zipfile
            csv_file = f'daily_88101_{year}.csv'
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                # Extract only the member we read; no blanket extractall()
                # on a downloaded archive.
                zip_ref.extract(csv_file)

            df = pd.read_csv(csv_file)
            print(f"✅ EPA data: {len(df)} records loaded")
            return df

        except Exception as e:
            print(f"EPA data error: {e}")
            return None

    def _create_representative_sample(self):
        """Create representative sample data with real city coordinates.

        Generates one synthetic value per pollutant, per city, for the first
        day of every month from 2020-01 through 2023-12, by scaling the
        baseline concentrations below with a seasonal sine term, a winter
        multiplier and Gaussian noise.

        Returns:
            A DataFrame with columns ``city``, ``country``, ``year``,
            ``latitude``, ``longitude``, ``who_ms``, ``concentration_ugm3``
            and ``date``.
        """
        # (city, country, lat, lon, pm2.5 base, pm10 base, no2 base, o3 base)
        cities_data = [
            ("Delhi", "India", 28.7041, 77.1025, 89.1, 130.4, 45.2, 25.3),
            ("Beijing", "China", 39.9042, 116.4074, 52.9, 78.2, 40.1, 48.7),
            ("Mumbai", "India", 19.0760, 72.8777, 64.2, 92.5, 38.9, 32.1),
            ("Jakarta", "Indonesia", -6.2088, 106.8456, 45.3, 61.8, 28.4, 15.2),
            ("Manila", "Philippines", 14.5995, 120.9842, 38.7, 55.3, 25.6, 18.9),
            ("Cairo", "Egypt", 30.0444, 31.2357, 84.1, 98.7, 42.3, 39.2),
            ("Dhaka", "Bangladesh", 23.8103, 90.4125, 97.3, 118.6, 48.5, 22.1),
            ("Mexico City", "Mexico", 19.4326, -99.1332, 24.8, 45.2, 35.7, 51.3),
            ("S\u00e3o Paulo", "Brazil", -23.5505, -46.6333, 28.3, 39.1, 32.4, 44.8),
            ("Los Angeles", "USA", 34.0522, -118.2437, 15.7, 28.9, 29.3, 61.2),
            ("London", "UK", 51.5074, -0.1278, 11.4, 18.7, 23.1, 42.6),
            ("Sydney", "Australia", -33.8688, 151.2093, 8.9, 16.4, 18.9, 38.4),
            ("Tokyo", "Japan", 35.6762, 139.6503, 12.1, 19.8, 21.7, 45.3),
            ("Berlin", "Germany", 52.5200, 13.4050, 9.8, 17.3, 19.4, 41.8),
            ("Lagos", "Nigeria", 6.5244, 3.3792, 68.4, 89.7, 35.2, 28.6),
        ]

        print("π Creating representative dataset with 15 major global cities...")

        all_data = []
        start_date = datetime(2020, 1, 1)
        end_date = datetime(2024, 1, 1)
        current_date = start_date

        while current_date < end_date:
            # Date-only factors are the same for every city in this step.
            day_of_year = current_date.timetuple().tm_yday
            seasonal_factor = 1 + 0.3 * np.sin(2 * np.pi * day_of_year / 365)
            month_factor = 1.2 if current_date.month in [11, 12, 1, 2] else 0.9
            date_label = current_date.strftime('%Y-%m-%d')

            for city_name, country, lat, lon, pm25_base, pm10_base, no2_base, o3_base in cities_data:
                # One noise draw per city and step, shared by all pollutants.
                noise = np.random.normal(0, 0.15)

                pm25_val = max(1, pm25_base * seasonal_factor * month_factor * (1 + noise))
                pm10_val = max(1, pm10_base * seasonal_factor * month_factor * (1 + noise))
                no2_val = max(1, no2_base * seasonal_factor * month_factor * (1 + noise))
                # Ozone skips the winter multiplier and gets half the noise.
                o3_val = max(1, o3_base * seasonal_factor * (1 + noise * 0.5))

                for param, value in [("pm2_5", pm25_val), ("pm10", pm10_val), ("no2", no2_val), ("o3", o3_val)]:
                    all_data.append({
                        'city': city_name,
                        'country': country,
                        'year': current_date.year,
                        'latitude': lat,
                        'longitude': lon,
                        'who_ms': param.upper(),
                        'concentration_ugm3': value,
                        'date': date_label,
                    })

            # Advance to the first day of the next month.
            if current_date.month == 12:
                current_date = current_date.replace(year=current_date.year + 1, month=1)
            else:
                current_date = current_date.replace(month=current_date.month + 1)

        df = pd.DataFrame(all_data)
        print(f"✅ Representative dataset created: {len(df)} records from {len(cities_data)} cities")
        return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _who_rows_to_records(who_data):
    """Convert WHO-style rows into the common record schema.

    Rows are only used when the frame has ``city`` and ``concentration_ugm3``
    columns. Malformed rows are skipped rather than aborting the run.
    """
    # NOTE(review): the generated fallback sample has these columns; confirm
    # the downloaded WHO workbook uses the same schema.
    if not {'city', 'concentration_ugm3'}.issubset(who_data.columns):
        return []

    records = []
    for row in who_data.to_dict('records'):
        try:
            # Prefer the per-row date so monthly rows keep distinct timestamps;
            # fall back to mid-year when only a year is known.
            date_value = row.get('date')
            if date_value is not None and not pd.isna(date_value):
                timestamp = pd.to_datetime(date_value)
            else:
                timestamp = datetime(int(row.get('year', 2023)), 6, 15)

            records.append({
                'datetime': timestamp,
                'city': row.get('city', 'Unknown'),
                'country': row.get('country', 'Unknown'),
                'parameter': row.get('who_ms', 'pm2_5').lower().replace('.', '_'),
                'value': float(row.get('concentration_ugm3', 0)),
                'latitude': row.get('latitude', 0),
                'longitude': row.get('longitude', 0),
                'source': 'WHO_Database',
            })
        except (AttributeError, TypeError, ValueError):
            # e.g. NaN year or a non-string parameter name.
            continue
    return records


def _owm_city_records(collector, api_key):
    """Fetch current and 30-day historical OpenWeatherMap data for five cities."""
    major_cities = [
        (28.7041, 77.1025, "Delhi"),
        (39.9042, 116.4074, "Beijing"),
        (34.0522, -118.2437, "LA"),
        (-33.8688, 151.2093, "Sydney"),
        (51.5074, -0.1278, "London"),
    ]

    records = []
    for lat, lon, city_name in major_cities:
        current_data = collector.get_openweathermap_data(lat, lon, api_key)

        end_time = int(time.time())
        start_time = end_time - (30 * 24 * 3600)
        hist_data = collector.get_openweathermap_historical(lat, lon, start_time, end_time, api_key)

        for record in current_data + hist_data:
            record['city'] = city_name
        records.extend(current_data)
        records.extend(hist_data)

        # Pause between cities to stay gentle on the API.
        time.sleep(1)
    return records


def _epa_rows_to_records(epa_data, limit=1000):
    """Convert the first ``limit`` EPA daily rows into the common record schema."""
    records = []
    for _, row in epa_data.head(limit).iterrows():
        try:
            records.append({
                'datetime': pd.to_datetime(row['Date Local']),
                'city': row.get('City Name', 'Unknown'),
                'country': 'USA',
                'parameter': 'pm2_5',
                'value': float(row.get('Arithmetic Mean', 0)),
                'latitude': float(row.get('Latitude', 0)),
                'longitude': float(row.get('Longitude', 0)),
                'source': 'EPA_AirData',
            })
        except (KeyError, TypeError, ValueError):
            # Skip rows with a missing date column or unparseable numbers.
            continue
    return records


def collect_real_pollution_data(api_key="demo_key"):
    """Collect a combined air-pollution dataset from WHO, OpenWeatherMap and EPA.

    Args:
        api_key: OpenWeatherMap key. With the default ``"demo_key"`` the
            OpenWeatherMap step is skipped entirely.

    Returns:
        A DataFrame of cleaned records (positive values only, parameters
        limited to ``pm2_5``, ``pm10``, ``no2`` and ``o3``), or ``None``
        when no source produced any record.
    """
    collector = RealAirQualityCollector()
    all_data = []

    print("π Collecting REAL Air Pollution Data from Multiple Sources")
    print("=" * 60)

    # --- WHO database (the collector falls back to a generated sample) ---
    who_data = collector.download_who_database()
    if who_data is not None and len(who_data) > 0:
        print(f"✅ WHO Database: {len(who_data)} records")
        all_data.extend(_who_rows_to_records(who_data))

    # --- OpenWeatherMap (only with a real key) ---
    if api_key != "demo_key":
        print("π Using OpenWeatherMap API...")
        all_data.extend(_owm_city_records(collector, api_key))

    # --- EPA AirData ---
    epa_data = collector.download_epa_data(2023)
    if epa_data is not None and len(epa_data) > 0:
        print(f"✅ EPA Data: {len(epa_data)} records")
        all_data.extend(_epa_rows_to_records(epa_data))

    if not all_data:
        print("β No real data collected. Please check your API keys and internet connection.")
        return None

    df = pd.DataFrame(all_data)

    # Normalise types and drop unusable rows.
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.dropna(subset=['value', 'parameter'])
    df = df[df['value'] > 0]

    # Harmonise parameter spellings across sources.
    param_mapping = {
        'pm2.5': 'pm2_5',
        'pm10': 'pm10',
        'no2': 'no2',
        'o3': 'o3',
        'so2': 'so2',
        'co': 'co',
    }
    df['parameter'] = df['parameter'].str.lower().map(lambda name: param_mapping.get(name, name))
    df = df[df['parameter'].isin(['pm2_5', 'pm10', 'no2', 'o3'])]

    print(f"✅ Total REAL data collected: {len(df)} records")
    print(f" - Sources: {df['source'].value_counts().to_dict()}")
    print(f" - Parameters: {df['parameter'].value_counts().to_dict()}")
    print(f" - Cities: {len(df['city'].unique())} unique cities")
    print(f" - Date range: {df['datetime'].min()} to {df['datetime'].max()}")

    return df
|
|
|
|
|
def prepare_time_series_data(df, sequence_length=168, prediction_length=24):
    """Turn long-format pollution records into sliding-window training arrays.

    For each city the records are pivoted to one column per pollutant,
    gaps are forward/backward filled, each column is z-score normalised per
    city, and every ``sequence_length``-step window is paired with the
    ``prediction_length`` steps that follow it.

    Args:
        df: DataFrame with at least ``city``, ``datetime``, ``parameter``
            and ``value`` columns.
        sequence_length: Number of time steps in each input window.
        prediction_length: Number of time steps in each target window.

    Returns:
        ``(sequences, targets, metadata)`` where ``sequences`` and
        ``targets`` are NumPy arrays and ``metadata`` is a list with one
        dict per window; ``(None, None, None)`` when no window could be built.
    """
    print("π Preparing time series sequences...")

    required_params = ['pm2_5', 'pm10', 'no2', 'o3']
    window = sequence_length + prediction_length

    df = df.sort_values(['city', 'datetime'])

    city_sequences = []
    city_targets = []
    city_metadata = []

    for city, city_data in df.groupby('city', sort=False):
        # One row per timestamp, one column per pollutant (duplicates averaged).
        city_pivot = city_data.pivot_table(
            index='datetime',
            columns='parameter',
            values='value',
            aggfunc='mean',
        )

        available_params = [p for p in required_params if p in city_pivot.columns]
        if len(available_params) < 2:
            continue

        # NOTE(review): cities may end up with different column counts here,
        # which would make the final np.array() ragged; confirm all cities in
        # the input expose the same pollutants.
        city_pivot = city_pivot[available_params].ffill().bfill()

        # Sparse series are stretched onto a daily grid to reach the window size.
        if len(city_pivot) < window:
            city_pivot = city_pivot.resample('D').mean().ffill()

        param_data = city_pivot.values
        if len(param_data) < window:
            continue

        # Per-city z-score; the epsilon guards against constant columns.
        param_data = (param_data - np.nanmean(param_data, axis=0)) / (np.nanstd(param_data, axis=0) + 1e-8)

        # Metadata that does not change from window to window.
        first_row = city_data.iloc[0]
        base_meta = {
            'city': city,
            'country': first_row.get('country', 'Unknown'),
            'latitude': float(first_row.get('latitude', 0)),
            'longitude': float(first_row.get('longitude', 0)),
            'source': first_row.get('source', 'Unknown'),
        }

        # +1 so the last full window (ending on the final row) is included.
        for start in range(len(param_data) - window + 1):
            split = start + sequence_length
            seq = param_data[start:split]
            target = param_data[split:split + prediction_length]

            if np.isnan(seq).any() or np.isnan(target).any():
                continue

            city_sequences.append(seq)
            city_targets.append(target)
            city_metadata.append({
                'city': base_meta['city'],
                'country': base_meta['country'],
                'timestamp_str': str(city_pivot.index[split]),
                'latitude': base_meta['latitude'],
                'longitude': base_meta['longitude'],
                'source': base_meta['source'],
            })

    if len(city_sequences) == 0:
        print("β No valid sequences created. Data may be insufficient.")
        return None, None, None

    sequences = np.array(city_sequences)
    targets = np.array(city_targets)

    print(f"✅ Created {len(sequences)} training sequences")
    print(f" - Input shape: {sequences.shape}")
    print(f" - Target shape: {targets.shape}")

    return sequences, targets, city_metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PrimitiveOperator(nn.Module):
    """A single operator of the Knowledge Reservoir.

    ``operator_type`` selects the layer: ``"temporal_conv"``,
    ``"attention"``, ``"rnn"``, ``"fourier"``; any other value builds a
    two-layer MLP.
    """

    def __init__(self, input_dim, hidden_dim, operator_type):
        super().__init__()
        self.operator_type = operator_type
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        kind = operator_type
        if kind == "temporal_conv":
            self.op = nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1)
            return
        if kind == "attention":
            self.op = nn.MultiheadAttention(input_dim, num_heads=4, batch_first=True)
            # Attention keeps input_dim; this layer maps it to hidden_dim.
            self.linear = nn.Linear(input_dim, hidden_dim)
            return
        if kind == "rnn":
            self.op = nn.LSTM(input_dim, hidden_dim, batch_first=True)
            return
        if kind == "fourier":
            self.op = nn.Linear(input_dim, hidden_dim)
            return
        # Fallback: plain feed-forward stack.
        self.op = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )

    def _transform(self, x):
        """Run the operator-specific layer on ``x`` without modulation."""
        kind = self.operator_type
        if kind == "temporal_conv":
            # Conv1d convolves over the last axis, so swap axes 1 and 2 around it.
            return self.op(x.transpose(1, 2)).transpose(1, 2)
        if kind == "attention":
            attended, _ = self.op(x, x, x)
            return self.linear(attended)
        if kind == "rnn":
            hidden_states, _ = self.op(x)
            return hidden_states
        if kind == "fourier":
            # Only the real part of the FFT along dim 1 is fed to the linear layer.
            spectrum = torch.fft.fft(x.float(), dim=1).real
            return self.op(spectrum)
        return self.op(x)

    def forward(self, x, modulation_signal=None):
        """Apply the operator, then an optional scale-and-shift modulation.

        ``modulation_signal`` is split in half along its last axis into a
        scale (gamma) and a shift (beta), each broadcast over dim 1.
        """
        output = self._transform(x)
        if modulation_signal is None:
            return output

        gamma, beta = modulation_signal.chunk(2, dim=-1)
        return gamma.unsqueeze(1) * output + beta.unsqueeze(1)
|
|
class KnowledgeReservoir(nn.Module):
    """Frozen bank of primitive operators.

    Operator types are assigned round-robin, and every parameter has
    ``requires_grad`` switched off right after construction.
    """

    def __init__(self, input_dim, hidden_dim, num_primitives=16):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_primitives = num_primitives

        kinds = ["temporal_conv", "attention", "rnn", "fourier", "mlp"]
        bank = []
        for idx in range(num_primitives):
            bank.append(PrimitiveOperator(input_dim, hidden_dim, kinds[idx % len(kinds)]))
        self.primitives = nn.ModuleList(bank)

        # Freeze the whole reservoir.
        for weight in self.parameters():
            weight.requires_grad = False

    def forward(self, x, modulation_signals, gating_weights):
        """Run every primitive, gate each output, and sum the results.

        ``modulation_signals`` is indexed as ``[:, i]`` and
        ``gating_weights`` as ``[:, i:i+1, :]`` for primitive ``i``;
        ``modulation_signals`` may be ``None`` to skip modulation.
        """
        # Unpacking requires x to have exactly three dimensions.
        _batch, _steps, _ = x.shape

        gated_outputs = []
        for idx, primitive in enumerate(self.primitives):
            if modulation_signals is None:
                signal = None
            else:
                signal = modulation_signals[:, idx]

            gate = gating_weights[:, idx:idx + 1, :]
            gated_outputs.append(gate * primitive(x, signal))

        # Blend: stack along a new axis and sum it away.
        return torch.stack(gated_outputs, dim=1).sum(dim=1)
|
|
|
|
|
class ModulatorNetwork(nn.Module):
    """Small trainable network producing modulation signals and gates.

    A two-layer LSTM summarises the input; two linear heads turn that
    summary into per-primitive modulation signals and gating weights.
    """

    def __init__(self, input_dim, hidden_dim, num_primitives):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_primitives = num_primitives

        # Context encoder.
        self.context_encoder = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
        )

        # Width is doubled so each signal can be split into two halves downstream.
        modulation_width = num_primitives * hidden_dim * 2
        self.modulation_generator = nn.Linear(hidden_dim, modulation_width)

        gating_width = num_primitives * hidden_dim
        self.gating_generator = nn.Linear(hidden_dim, gating_width)

    def forward(self, x):
        """Return ``(mod_signals, gating_weights, context_summary)`` for ``x``.

        ``mod_signals`` is viewed as ``(batch, num_primitives, 2 * hidden_dim)``
        and ``gating_weights`` as ``(batch, num_primitives, hidden_dim)``,
        soft-maxed across the primitive axis.
        """
        batch_size, _steps, _ = x.shape

        # Final hidden state of the top LSTM layer is the context summary.
        _, (hidden, _cell) = self.context_encoder(x)
        context_summary = hidden[-1]

        mod_signals = self.modulation_generator(context_summary).view(
            batch_size, self.num_primitives, self.hidden_dim * 2
        )

        gate_logits = self.gating_generator(context_summary).view(
            batch_size, self.num_primitives, self.hidden_dim
        )
        gating_weights = torch.softmax(gate_logits, dim=1)

        return mod_signals, gating_weights, context_summary
|
|
class ProbSolSpaceAirPollutionModel(nn.Module):
    """Complete ProbSolSpace vX-DeepLearn model for air pollution prediction.

    Combines an input projection, a modulator network, a frozen knowledge
    reservoir and three heads: forecast, severity classification and a
    supervisor projection of the context summary.
    """

    def __init__(self, input_dim=4, hidden_dim=64, num_primitives=16,
                 sequence_length=168, prediction_length=24):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_primitives = num_primitives
        self.sequence_length = sequence_length
        self.prediction_length = prediction_length

        # Lift raw pollutant features to the hidden width.
        self.input_projection = nn.Linear(input_dim, hidden_dim)

        # Core components; both operate at hidden_dim.
        self.knowledge_reservoir = KnowledgeReservoir(hidden_dim, hidden_dim, num_primitives)
        self.modulator_network = ModulatorNetwork(hidden_dim, hidden_dim, num_primitives)

        self.cognitive_supervisor = nn.Linear(hidden_dim, input_dim)

        # Forecast head: one flat vector, reshaped in forward().
        forecast_width = prediction_length * input_dim
        self.output_projection = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, forecast_width),
        )

        # Severity head: eight classes.
        self.severity_classifier = nn.Sequential(
            nn.Linear(hidden_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 8),
        )

    def forward(self, x):
        """Return ``(predictions, severity_logits, supervisor_output)`` for ``x``.

        ``predictions`` is viewed as
        ``(batch, prediction_length, x.shape[-1])``.
        """
        batch_size, _steps, feature_dim = x.shape

        hidden = self.input_projection(x)

        mod_signals, gating_weights, context_summary = self.modulator_network(hidden)
        blended = self.knowledge_reservoir(hidden, mod_signals, gating_weights)

        # Only the last time step of the blended output feeds the heads.
        final_representation = blended[:, -1, :]

        flat_forecast = self.output_projection(final_representation)
        predictions = flat_forecast.view(batch_size, self.prediction_length, feature_dim)

        severity_logits = self.severity_classifier(final_representation)
        supervisor_output = self.cognitive_supervisor(context_summary)

        return predictions, severity_logits, supervisor_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AirPollutionDataset(Dataset):
    """Dataset of (sequence, target) windows with per-window metadata.

    Metadata values are coerced to plain ``str``/``float`` on construction,
    and an integer severity label in ``0..7`` is precomputed per window.
    """

    # Lower bounds of severity classes 1..7; values below the first map to 0.
    _SEVERITY_BOUNDS = (5, 10, 15, 25, 35, 50, 75)

    def __init__(self, sequences, targets, metadata):
        self.sequences = torch.FloatTensor(sequences)
        self.targets = torch.FloatTensor(targets)

        # Normalise metadata to basic Python types.
        self.metadata_processed = [
            {
                'city': str(entry['city']),
                'country': str(entry['country']),
                'timestamp_str': str(entry['timestamp_str']),
                'latitude': float(entry['latitude']),
                'longitude': float(entry['longitude']),
                'source': str(entry['source']),
            }
            for entry in metadata
        ]

        self.severity_labels = self._calculate_severity_labels()

    def _calculate_severity_labels(self):
        """Bucket the mean of target feature 0 into severity classes 0-7.

        The label is the number of bounds (5, 10, 15, 25, 35, 50, 75) the
        mean has reached; anything below 5 gets label 0.
        """
        # NOTE(review): feature 0 is treated as PM2.5 and compared with
        # absolute thresholds; confirm the targets are not normalised upstream.
        pm25_avg = self.targets[:, :, 0].mean(dim=1)

        bounds = torch.tensor(self._SEVERITY_BOUNDS, dtype=pm25_avg.dtype)
        reached = pm25_avg.unsqueeze(1) >= bounds
        return reached.sum(dim=1).to(torch.long)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        meta = self.metadata_processed[idx]
        return {
            'sequence': self.sequences[idx],
            'target': self.targets[idx],
            'severity': self.severity_labels[idx],
            'city': meta['city'],
            'country': meta['country'],
            'source': meta['source'],
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_model():
    """Collect data, build the model and train it for 30 epochs.

    The OpenWeatherMap key is read from the ``OPENWEATHERMAP_API_KEY``
    environment variable and defaults to ``"demo_key"``. Whenever the
    validation loss improves, the weights are written to
    ``best_air_pollution_model.pth`` and the constructor arguments to
    ``model_config.json``.

    Returns:
        ``(model, dataset)`` after the final epoch, or ``(None, None)`` when
        no data, no sequences, or too few sequences for a train/validation
        split are available.
    """
    print("π Starting ProbSolSpace Air Pollution Prediction Training - REAL DATA ONLY")
    print("=" * 60)

    print("π To get REAL-TIME data, get a FREE OpenWeatherMap API key:")
    print(" 1. Visit: https://openweathermap.org/api/air-pollution")
    print(" 2. Sign up (free)")
    print(" 3. Get API key (1,000,000 calls/month FREE)")
    print(" 4. Replace 'demo_key' in the code below")

    # An environment variable avoids editing the source to supply a real key.
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "demo_key")
    df = collect_real_pollution_data(api_key)

    if df is None or len(df) == 0:
        print("β No real data collected. Please check your internet connection and API keys.")
        return None, None

    sequences, targets, metadata = prepare_time_series_data(df)

    if sequences is None:
        print("β Could not create training sequences from the data.")
        return None, None

    dataset = AirPollutionDataset(sequences, targets, metadata)

    # 80/20 random split.
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    if train_size == 0 or val_size == 0:
        # An empty split would break the shuffled loader or divide by zero
        # when the epoch losses are averaged.
        print("Not enough sequences to build both training and validation splits.")
        return None, None

    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    model_params = {
        "input_dim": sequences.shape[2],
        "hidden_dim": 128,
        "num_primitives": 32,
        "sequence_length": sequences.shape[1],
        "prediction_length": targets.shape[1],
    }

    model = ProbSolSpaceAirPollutionModel(**model_params).to(device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("🧠 ProbSolSpace Model Initialized:")
    print(f" - Total parameters: {total_params:,}")
    print(f" - Trainable parameters: {trainable_params:,}")
    print(f" - Frozen Knowledge Reservoir: {total_params - trainable_params:,}")

    # Frozen parameters never receive gradients, so only trainable ones are optimised.
    optimizer = optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=1e-3,
        weight_decay=1e-4,
    )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    mse_loss = nn.MSELoss()
    ce_loss = nn.CrossEntropyLoss()

    num_epochs = 30
    best_val_loss = float('inf')

    train_losses = []
    val_losses = []

    print("π Training ProbSolSpace on REAL air pollution data...")
    print("=" * 60)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            sequences_b = batch['sequence'].to(device)
            targets_b = batch['target'].to(device)
            severity_labels = batch['severity'].to(device)

            optimizer.zero_grad()

            predictions, severity_logits, supervisor_output = model(sequences_b)

            prediction_loss = mse_loss(predictions, targets_b)
            severity_loss = ce_loss(severity_logits, severity_labels)

            # The supervisor head is trained towards the per-feature mean of the target window.
            target_representation = targets_b.mean(dim=1)
            supervisor_loss = mse_loss(supervisor_output, target_representation)

            total_loss = prediction_loss + 0.1 * severity_loss + 0.1 * supervisor_loss

            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += total_loss.item()

        # ---- validation pass (no supervisor term) ----
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                sequences_b = batch['sequence'].to(device)
                targets_b = batch['target'].to(device)
                severity_labels = batch['severity'].to(device)

                predictions, severity_logits, _ = model(sequences_b)

                prediction_loss = mse_loss(predictions, targets_b)
                severity_loss = ce_loss(severity_logits, severity_labels)

                val_loss += (prediction_loss + 0.1 * severity_loss).item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        scheduler.step(val_loss)

        # Checkpoint on improvement, together with the config needed to rebuild the model.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_air_pollution_model.pth')

            with open('model_config.json', 'w') as f:
                json.dump(model_params, f)

        if epoch % 5 == 0:
            print(f"Epoch {epoch:3d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")

    print("=" * 60)
    print("✅ Training completed with REAL air pollution data!")
    print(f"π Best validation loss: {best_val_loss:.4f}")

    return model, dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_requirements_file(space_dir: Path):
    """Creates a requirements.txt file for the Hugging Face Space.

    Args:
        space_dir: Directory that receives ``requirements.txt``; it must
            already exist.
    """
    requirements = [
        "torch",
        "numpy",
        "matplotlib",
        "gradio",
    ]
    # One package per line, no trailing newline.
    with open(space_dir / "requirements.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(requirements))
    print(f"✅ Created requirements.txt in {space_dir}")
|
|
|
|
|
def create_readme_file(space_dir: Path, repo_id: str):
    """Creates a README.md file with metadata for the Space.

    The file starts with the YAML front matter (title, emoji, SDK, app file,
    license), followed by a short description and usage notes.

    Args:
        space_dir: Directory that receives ``README.md``; it must already exist.
        repo_id: Space identifier, used as both the text and the target of
            the link at the end of the README.
    """
    readme_template = """
        ---
        title: Air Pollution Forecaster
        emoji: 💨
        colorFrom: blue
        colorTo: green
        sdk: gradio
        sdk_version: 4.19.2
        app_file: app.py
        pinned: false
        license: mit
        ---

        # 💨 Air Pollution Forecasting Demo

        This Space demonstrates a deep learning model for forecasting air pollution levels.
        The model, **ProbSolSpace vX-DeepLearn**, predicts the concentration of four major pollutants (PM2.5, PM10, NO2, O3) for the next 24 hours based on the previous week's data.

        ### How to Use
        1. Select a sample pollution scenario from the dropdown menu.
        2. Click "Submit".
        3. The model will generate a 24-hour forecast plot and a severity assessment.

        This model was trained on a diverse dataset from the WHO, EPA, and OpenWeatherMap.
        Check out the training code and model architecture at [{repo_id}](https://huggingface.co/spaces/{repo_id}).
    """
    # Dedent before substituting so the value of repo_id cannot affect the
    # common-indent detection; strip() puts the opening '---' on line one.
    readme_content = textwrap.dedent(readme_template).strip().format(repo_id=repo_id)

    # The README contains emoji, so the encoding must not depend on the platform default.
    with open(space_dir / "README.md", "w", encoding="utf-8") as f:
        f.write(readme_content)
    print(f"✅ Created README.md in {space_dir}")
|
|
|
|
|
def create_gradio_app_file(space_dir: Path):
    """Generate a self-contained ``app.py`` with the Gradio UI and model loading logic.

    The model's class definitions are embedded directly in the generated file so
    the Space can load the ``.pth`` state dict without any separate ``.py``
    modules. The generated app reads ``model_config.json`` and
    ``best_air_pollution_model.pth`` from its working directory at import time.

    Args:
        space_dir: Directory in which ``app.py`` is written.
    """
    # The embedded source is indented to match this function body and is
    # dedented before being written. It is a regular (non-raw) string, so the
    # escaped \"\"\" sequences become real docstring delimiters in app.py.
    app_code = """
    import gradio as gr
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import numpy as np
    import matplotlib.pyplot as plt
    import json
    from pathlib import Path

    # --- Model Architecture (Copied from training script) ---
    # NOTE: It is crucial that this architecture matches the one used for training.

    class PrimitiveOperator(nn.Module):
        def __init__(self, input_dim, hidden_dim, operator_type):
            super().__init__()
            self.operator_type = operator_type
            self.input_dim = input_dim
            self.hidden_dim = hidden_dim
            if operator_type == "temporal_conv":
                self.op = nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1)
            elif operator_type == "attention":
                self.op = nn.MultiheadAttention(input_dim, num_heads=4, batch_first=True)
                self.linear = nn.Linear(input_dim, hidden_dim)
            elif operator_type == "rnn":
                self.op = nn.LSTM(input_dim, hidden_dim, batch_first=True)
            elif operator_type == "fourier":
                self.op = nn.Linear(input_dim, hidden_dim)
            else:
                self.op = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        def forward(self, x, modulation_signal=None):
            if self.operator_type == "temporal_conv":
                output = self.op(x.transpose(1, 2)).transpose(1, 2)
            elif self.operator_type == "attention":
                attn_output, _ = self.op(x, x, x)
                output = self.linear(attn_output)
            elif self.operator_type == "rnn":
                output, _ = self.op(x)
            elif self.operator_type == "fourier":
                output = self.op(torch.fft.fft(x.float(), dim=1).real)
            else:
                output = self.op(x)
            if modulation_signal is not None:
                gamma, beta = modulation_signal.chunk(2, dim=-1)
                output = gamma.unsqueeze(1) * output + beta.unsqueeze(1)
            return output

    class KnowledgeReservoir(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_primitives=16):
            super().__init__()
            operator_types = ["temporal_conv", "attention", "rnn", "fourier", "mlp"]
            self.primitives = nn.ModuleList([PrimitiveOperator(input_dim, hidden_dim, operator_types[i % len(operator_types)]) for i in range(num_primitives)])
            for param in self.parameters():
                param.requires_grad = False
        def forward(self, x, modulation_signals, gating_weights):
            outputs = []
            for i, primitive in enumerate(self.primitives):
                mod_signal = modulation_signals[:, i] if modulation_signals is not None else None
                primitive_output = primitive(x, mod_signal)
                gated_output = gating_weights[:, i:i+1, :] * primitive_output
                outputs.append(gated_output)
            return torch.stack(outputs, dim=1).sum(dim=1)

    class ModulatorNetwork(nn.Module):
        def __init__(self, input_dim, hidden_dim, num_primitives):
            super().__init__()
            self.input_dim, self.hidden_dim, self.num_primitives = input_dim, hidden_dim, num_primitives
            self.context_encoder = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.1)
            self.modulation_generator = nn.Linear(hidden_dim, num_primitives * hidden_dim * 2)
            self.gating_generator = nn.Linear(hidden_dim, num_primitives * hidden_dim)
        def forward(self, x):
            _, (h_n, c_n) = self.context_encoder(x)
            context_summary = h_n[-1]
            mod_signals = self.modulation_generator(context_summary).view(-1, self.num_primitives, self.hidden_dim * 2)
            gating_weights = torch.softmax(self.gating_generator(context_summary).view(-1, self.num_primitives, self.hidden_dim), dim=1)
            return mod_signals, gating_weights, context_summary

    class ProbSolSpaceAirPollutionModel(nn.Module):
        def __init__(self, input_dim=4, hidden_dim=64, num_primitives=16, sequence_length=168, prediction_length=24):
            super().__init__()
            self.input_dim, self.hidden_dim, self.num_primitives, self.sequence_length, self.prediction_length = input_dim, hidden_dim, num_primitives, sequence_length, prediction_length
            self.input_projection = nn.Linear(input_dim, hidden_dim)
            self.knowledge_reservoir = KnowledgeReservoir(hidden_dim, hidden_dim, num_primitives)
            self.modulator_network = ModulatorNetwork(hidden_dim, hidden_dim, num_primitives)
            self.cognitive_supervisor = nn.Linear(hidden_dim, input_dim)
            self.output_projection = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(0.1), nn.Linear(hidden_dim, prediction_length * input_dim))
            self.severity_classifier = nn.Sequential(nn.Linear(hidden_dim, 32), nn.ReLU(), nn.Dropout(0.1), nn.Linear(32, 8))
        def forward(self, x):
            x_projected = self.input_projection(x)
            mod_signals, gating_weights, context_summary = self.modulator_network(x_projected)
            blended_output = self.knowledge_reservoir(x_projected, mod_signals, gating_weights)
            final_representation = blended_output[:, -1, :]
            predictions = self.output_projection(final_representation).view(-1, self.prediction_length, self.input_dim)
            severity_logits = self.severity_classifier(final_representation)
            supervisor_output = self.cognitive_supervisor(context_summary)
            return predictions, severity_logits, supervisor_output

    # --- Gradio App Logic ---

    # Configuration
    MODEL_PATH = Path("best_air_pollution_model.pth")
    CONFIG_PATH = Path("model_config.json")
    DEVICE = torch.device("cpu")
    POLLUTANT_NAMES = ['PM2.5', 'PM10', 'NO2', 'O3']
    SEVERITY_LEVELS = ['Very Low', 'Low', 'Fair', 'Medium', 'High', 'Very High', 'Extreme', 'Catastrophic']
    SEVERITY_COLORS = ['#4CAF50', '#8BC34A', '#CDDC39', '#FFEB3B', '#FFC107', '#FF9800', '#F44336', '#B71C1C']

    # Load model configuration
    with open(CONFIG_PATH, 'r') as f:
        model_config = json.load(f)

    # Instantiate and load the model
    model = ProbSolSpaceAirPollutionModel(**model_config)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    print("✅ Model loaded successfully on CPU.")

    def generate_sample_input(scenario, seq_len=168, n_features=4):
        \"\"\"Creates a sample input tensor based on a scenario name.\"\"\"
        base = np.zeros((seq_len, n_features))
        time_axis = np.linspace(0, 4 * np.pi, seq_len)

        if scenario == "Clean Day":
            # Low values with slight daily variation
            base[:, 0] = 5 + 2 * np.sin(time_axis) + np.random.rand(seq_len) * 2   # PM2.5
            base[:, 1] = 10 + 4 * np.sin(time_axis) + np.random.rand(seq_len) * 4  # PM10
            base[:, 2] = 15 + 5 * np.sin(time_axis) + np.random.rand(seq_len) * 5  # NO2
            base[:, 3] = 30 + 10 * np.sin(time_axis) + np.random.rand(seq_len) * 8 # O3
        elif scenario == "Moderate Pollution":
            # Medium values
            base[:, 0] = 20 + 8 * np.sin(time_axis) + np.random.rand(seq_len) * 5   # PM2.5
            base[:, 1] = 40 + 15 * np.sin(time_axis) + np.random.rand(seq_len) * 10 # PM10
            base[:, 2] = 35 + 10 * np.sin(time_axis) + np.random.rand(seq_len) * 10 # NO2
            base[:, 3] = 50 + 15 * np.sin(time_axis) + np.random.rand(seq_len) * 12 # O3
        elif scenario == "High Pollution Event":
            # High values with a spike
            spike = np.exp(-((np.arange(seq_len) - seq_len * 0.8) ** 2) / 100) * 50
            base[:, 0] = 50 + 15 * np.sin(time_axis) + np.random.rand(seq_len) * 10 + spike # PM2.5
            base[:, 1] = 90 + 25 * np.sin(time_axis) + np.random.rand(seq_len) * 20 + spike*1.5 # PM10
            base[:, 2] = 60 + 20 * np.sin(time_axis) + np.random.rand(seq_len) * 15 + spike*0.8 # NO2
            base[:, 3] = 40 + 10 * np.sin(time_axis) + np.random.rand(seq_len) * 10 # O3

        # Simple normalization (the model was trained on normalized data)
        normalized_data = (base - base.mean(axis=0)) / (base.std(axis=0) + 1e-8)
        return torch.FloatTensor(normalized_data).unsqueeze(0), base

    def predict(scenario):
        \"\"\"Main prediction function for the Gradio interface.\"\"\"
        # 1. Generate sample input
        input_tensor, unnormalized_input = generate_sample_input(
            scenario,
            seq_len=model_config['sequence_length'],
            n_features=model_config['input_dim']
        )
        input_tensor = input_tensor.to(DEVICE)

        # 2. Run model inference
        with torch.no_grad():
            predictions_normalized, severity_logits, _ = model(input_tensor)

        # 3. De-normalize predictions for plotting
        # Simple de-normalization using the input's scale and mean
        mean = unnormalized_input.mean(axis=0)
        std = unnormalized_input.std(axis=0) + 1e-8
        predictions_denormalized = predictions_normalized.cpu().numpy().squeeze(0) * std + mean

        # 4. Create the forecast plot
        plt.style.use('seaborn-v0_8-whitegrid')
        fig, ax = plt.subplots(figsize=(10, 6))
        hours_ahead = np.arange(1, model_config['prediction_length'] + 1)

        for i in range(model_config['input_dim']):
            ax.plot(hours_ahead, predictions_denormalized[:, i], label=f'Predicted {POLLUTANT_NAMES[i]}', marker='o', linestyle='-')

        ax.set_title(f'24-Hour Air Pollution Forecast for: {scenario}', fontsize=16)
        ax.set_xlabel('Hours from Now', fontsize=12)
        ax.set_ylabel('Pollutant Concentration (µg/m³)', fontsize=12)
        ax.legend()
        ax.set_xticks(hours_ahead[::2])
        plt.tight_layout()
        # Detach the figure from pyplot's global registry so repeated requests do
        # not accumulate open figures; the Figure object stays valid for gr.Plot.
        plt.close(fig)

        # 5. Get severity prediction
        predicted_severity_idx = torch.argmax(severity_logits, dim=1).item()
        severity_label = SEVERITY_LEVELS[predicted_severity_idx]
        severity_color = SEVERITY_COLORS[predicted_severity_idx]

        severity_html = f"<div style='background-color:{severity_color}; padding: 10px; border-radius: 5px; text-align: center; color: #000; font-size: 1.2em; font-weight: bold;'>Predicted Severity: {severity_label}</div>"

        return fig, gr.HTML(severity_html)

    # --- Define the Gradio Interface ---
    iface = gr.Interface(
        fn=predict,
        inputs=gr.Dropdown(
            choices=["Clean Day", "Moderate Pollution", "High Pollution Event"],
            label="Select a Pollution Scenario",
            value="Moderate Pollution"
        ),
        outputs=[
            gr.Plot(label="Forecast Plot"),
            gr.HTML(label="Severity Assessment")
        ],
        title="💨 ProbSolSpace Air Pollution Forecaster",
        description="Select a sample scenario to generate a 24-hour air pollution forecast. The model predicts PM2.5, PM10, NO2, and O3 levels.",
        examples=[["Clean Day"], ["High Pollution Event"]],
        allow_flagging="never"
    )

    if __name__ == "__main__":
        iface.launch()
    """
    # UTF-8 is required: the generated source contains emoji and the µ/³ symbols.
    with open(space_dir / "app.py", "w", encoding="utf-8") as f:
        f.write(textwrap.dedent(app_code).strip() + "\n")
    print(f"✅ Created self-contained app.py in {space_dir}")
|
|
|
|
|
def deploy_to_huggingface_space(repo_id: str):
    """Build the Space files and upload them to a new or existing Hugging Face Space.

    The function stages ``requirements.txt``, ``README.md``, ``app.py``, the
    trained weights and the model config in a temporary ``huggingface_space_temp``
    directory, creates the Space repository if needed, uploads the folder and
    removes the staging directory again, even when the upload fails.

    Prints a message and returns without deploying when no Hugging Face token is
    stored locally or when the trained model / config file is missing.

    Args:
        repo_id (str): The ID for the space, in the format "username/space-name".
    """
    token = HfFolder.get_token()
    if token is None:
        print("❌ Hugging Face token not found.")
        print("Please log in using 'huggingface-cli login' in your terminal.")
        return

    print(f"\n🚀 Starting deployment to Hugging Face Space: {repo_id}")

    # Verify the training artifacts before staging anything, so an early exit
    # does not leave a half-populated staging directory on disk.
    model_path = Path("best_air_pollution_model.pth")
    config_path = Path("model_config.json")
    if not model_path.exists() or not config_path.exists():
        print(f"❌ Error: '{model_path}' or '{config_path}' not found. Was the model trained successfully?")
        return

    # Start from a clean staging directory (a previous run may have left one).
    space_dir = Path("huggingface_space_temp")
    if space_dir.exists():
        shutil.rmtree(space_dir)
    space_dir.mkdir(exist_ok=True)

    try:
        # Generate the files that define the Space.
        create_requirements_file(space_dir)
        create_readme_file(space_dir, repo_id)
        create_gradio_app_file(space_dir)

        # Ship the trained weights and their configuration alongside the app.
        shutil.copy(model_path, space_dir / model_path.name)
        shutil.copy(config_path, space_dir / config_path.name)
        print(f"✅ Copied '{model_path.name}' and '{config_path.name}' to {space_dir}")

        api = HfApi()

        print(f"Creating repository '{repo_id}' on the Hub...")
        repo_url = create_repo(
            repo_id=repo_id,
            repo_type="space",
            space_sdk="gradio",
            token=token,
            exist_ok=True,  # re-deploying to an existing Space is allowed
        )

        print(f"Uploading files from '{space_dir}' to the Space...")
        api.upload_folder(
            folder_path=str(space_dir),
            repo_id=repo_id,
            repo_type="space",
            token=token,
        )
    finally:
        # Always remove the staging directory, including when the Hub calls raise.
        shutil.rmtree(space_dir, ignore_errors=True)

    print("\n" + "="*60)
    print("🎉 DEPLOYMENT SUCCESSFUL! 🎉")
    print(f"Your Gradio app is now live at: {repo_url.url}")
    print("="*60)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: train the model, then optionally deploy it to a
    # Hugging Face Space after asking the user interactively.

    # google.colab is importable only inside a Colab runtime, so the import
    # doubles as an environment probe.
    try:
        import google.colab  # noqa: F401
        print("🚀 Running in Google Colab with T4 GPU")
    except ImportError:
        print("💻 Running locally")

    model, dataset = train_model()

    if model is not None:
        print("\n🎉 ProbSolSpace Air Pollution Model Training Complete!")
        print("💾 Model saved as 'best_air_pollution_model.pth'")

        if not HUGGINGFACE_LIBS_INSTALLED:
            print("\n⚠️ To deploy to Hugging Face Spaces, please install the required libraries:")
            print(" pip install huggingface_hub gradio")
        else:
            # Strip surrounding whitespace so answers such as "y " or " Y" count.
            deploy_prompt = input("\n🚀 Do you want to deploy this model to a Hugging Face Space? (y/n): ").strip().lower()
            if deploy_prompt in ('y', 'yes'):
                try:
                    user_info = whoami()
                    hf_username = user_info['name']
                    print(f"✅ Logged in as Hugging Face user: {hf_username}")
                except Exception as e:
                    # Best-effort lookup: any failure (not logged in, network
                    # error, unexpected payload) falls back to asking the user.
                    print(f"❌ Could not get Hugging Face username. Have you logged in with 'huggingface-cli login'? Error: {e}")
                    hf_username = input("Please enter your Hugging Face username: ").strip()

                if not hf_username:
                    # An empty namespace would produce the invalid repo id "/<space>".
                    print("❌ No Hugging Face username provided. Skipping deployment.")
                else:
                    default_space_name = "air-pollution-forecaster"
                    space_name = input(f"Enter a name for your Space (or press Enter for '{default_space_name}'): ").strip()
                    if not space_name:
                        space_name = default_space_name

                    repo_id = f"{hf_username}/{space_name}"
                    deploy_to_huggingface_space(repo_id)

    else:
        print("\n❌ Training failed. Cannot proceed to deployment.")