Spaces:
Sleeping
Sleeping
import training code from main branch
Browse files
{src/ml → app}/jedha_final_project.ipynb
RENAMED
|
File without changes
|
src/ml/process_for_ml.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/ml/requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ pyarrow
|
|
| 2 |
pandas
|
| 3 |
scikit-learn
|
| 4 |
mlflow
|
| 5 |
-
boto3
|
|
|
|
|
|
| 2 |
pandas
|
| 3 |
scikit-learn
|
| 4 |
mlflow
|
| 5 |
+
boto3
|
| 6 |
+
python-dotenv
|
src/train_docker/2024_semester2_merged_v2.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a46526982934366fbae9da1f4b0abcea000140c6070ae1f2ce966c7eec8869a
|
| 3 |
+
size 1302875
|
src/train_docker/Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python 3.12.11 image as base
FROM python:3.12.11-slim

# Set working directory inside the container
WORKDIR /src/train_docker

# Copy requirements.txt first (for better caching)
COPY requirements.txt .

# Install dependencies.
# --no-cache-dir keeps the image small: pip's download cache would
# otherwise be baked into this layer for no runtime benefit.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Default command to run your training script
CMD ["python", "train.py"]
|
src/train_docker/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pyarrow>=20.0.0
|
| 2 |
+
pandas>=2.3.1
|
| 3 |
+
scikit-learn>=1.7.0
|
| 4 |
+
mlflow>=3.1.1
|
| 5 |
+
boto3>=1.39.3
|
| 6 |
+
python-dotenv
|
src/train_docker/train.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This script is used to train a model and save it to S3.
|
| 3 |
+
It will require the following environment variables:
|
| 4 |
+
- AWS_ACCESS_KEY_ID
|
| 5 |
+
- AWS_SECRET_ACCESS_KEY
|
| 6 |
+
- AWS_REGION
|
| 7 |
+
- MLFLOW_TRACKING_URI
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import mlflow
|
| 13 |
+
import mlflow.sklearn
|
| 14 |
+
import pickle
|
| 15 |
+
import boto3
|
| 16 |
+
import datetime
|
| 17 |
+
from sklearn.model_selection import train_test_split
|
| 18 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 19 |
+
from sklearn.model_selection import GridSearchCV
|
| 20 |
+
import os
|
| 21 |
+
from dotenv import load_dotenv
|
| 22 |
+
|
| 23 |
+
def average_columns_with_suffix(df, suffix):
    """Return the row-wise mean of all columns whose name ends with *suffix*.

    Non-numeric cell values are coerced to NaN first, so each row's mean is
    taken over its numeric entries only.

    Args:
        df: Input DataFrame.
        suffix: Literal column-name suffix to select. Regex metacharacters
            are escaped and the match is anchored at the end of the name,
            so e.g. "_T" no longer matches a "_T" occurring mid-name.

    Returns:
        pd.Series: Row-wise mean of the selected columns (NaN for rows with
        no numeric value in any selected column).
    """
    import re  # local import: keeps the module's top-level import block untouched

    # Escape the suffix and anchor it at the end of the column name.
    # The original pattern f".*{suffix}" was unanchored (df.filter uses
    # re.search), so it matched the suffix anywhere in the name.
    matching = df.filter(regex=f"{re.escape(suffix)}$")

    # Coerce everything to numeric; non-numeric values become NaN and are
    # thereby excluded from the mean.
    matching = matching.apply(pd.to_numeric, errors='coerce')

    # Row-wise mean across the selected sensor columns.
    return matching.mean(axis=1)
|
| 33 |
+
|
| 34 |
+
def load_and_process_data(parquet_path):
    """Load the raw parquet dump and condense it into the modelling table.

    Reads the merged sensor export, treats "Inconnu" as missing, converts
    the French traffic-state labels to numeric congestion levels, averages
    each sensor family into a single column, and finally drops the
    timestamp column and any incomplete rows.

    Args:
        parquet_path: Path of the parquet file to read.

    Returns:
        pd.DataFrame: NaN-free frame with one averaged column per
        feature/target.
    """
    raw = pd.read_parquet(parquet_path)
    # "Inconnu" ("unknown") is the exporter's marker for a missing reading.
    raw = raw.apply(lambda col: col.replace("Inconnu", None))

    # Numeric congestion levels mirroring the realtime API categories.
    congestion_levels = {
        None: None,
        "Fluide": 0.,      # freeflow in realtime api
        "Pré-saturé": 1.,  # heavy in realtime api
        "Saturé": 1.,      # heavy in realtime api
        "Bloqué": 2.       # congested in realtime api
    }

    # Recode every per-sensor traffic-state column.
    for name in raw.columns:
        if name.endswith("Etat trafic"):
            raw[name] = raw[name].map(congestion_levels)

    # Collapse each sensor family into one averaged column.
    averaged = {
        "Timestamp": raw["Timestamp"].copy(),
        "Pressure": average_columns_with_suffix(raw, "_PSTAT"),
        "Temperature": average_columns_with_suffix(raw, "_T"),
        "Wind Speed": average_columns_with_suffix(raw, "_FF"),
        "Humidity": average_columns_with_suffix(raw, "_U"),
        "Traffic Status": average_columns_with_suffix(raw, "_Etat trafic"),
        "NOX": average_columns_with_suffix(raw, "NOX"),
        "PM10": average_columns_with_suffix(raw, "PM10"),
        "PM25": average_columns_with_suffix(raw, "PM25"),
        "O3": average_columns_with_suffix(raw, "O3"),
    }

    result = pd.concat(averaged, axis=1)
    # Timestamp was only carried along while assembling; the model itself
    # is time-agnostic.
    result.drop(columns=["Timestamp"], inplace=True)
    result.dropna(inplace=True)

    return result
|
| 73 |
+
|
| 74 |
+
def train_and_save_model(data_path="2024_semester2_merged_v2.parquet",
                         bucket_name="jedha-quality-air"):
    """Train a RandomForest air-quality model and publish it to S3.

    Loads the processed dataset, grid-searches the forest size with 3-fold
    CV, logs the run to MLflow via autolog, pickles the best estimator
    locally and uploads a timestamped copy to S3.

    Requires AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION and
    MLFLOW_TRACKING_URI in the environment (or a local .env file).

    Args:
        data_path: Parquet file with the raw merged sensor data.
            Defaults to the file shipped alongside this script.
        bucket_name: Target S3 bucket for the serialized model.
    """
    # Load environment variables from a .env file, if one is present.
    load_dotenv()

    # AWS S3 session built from the environment credentials.
    session = boto3.Session(
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        region_name=os.getenv("AWS_REGION")
    )
    s3 = session.client('s3')

    # Configure MLflow tracking server and experiment.
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
    mlflow.set_experiment("air_quality_prediction")

    # Autolog records params, metrics and the fitted model automatically.
    mlflow.sklearn.autolog()

    with mlflow.start_run():
        # Load and prepare data.
        final_df = load_and_process_data(data_path)

        # Features and multi-output targets.
        x_columns = ["Pressure", "Temperature", "Wind Speed", "Humidity", "Traffic Status"]
        y_columns = ["NOX", "PM10", "PM25", "O3"]

        X = final_df[x_columns].copy()
        y = final_df[y_columns].copy()

        # shuffle=False preserves chronological order: train on the past,
        # evaluate on the most recent 20%.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

        # Only the forest size is tuned; every other hyperparameter stays
        # at the scikit-learn default.
        param_grid = {
            "n_estimators": [5, 10, 20, 50, 100, 200, 300],
        }

        base_model = RandomForestRegressor(random_state=42)
        grid_search = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=3,
            n_jobs=-1,
            scoring="r2"
        )

        model_base_name = "random_forest_grid_search"
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_

        # Local filename is stable; the S3 key is timestamped so earlier
        # uploads are never overwritten.
        model_filename = model_base_name + ".pkl"
        model_filename_for_s3 = model_base_name + "_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + ".pkl"

        # Save locally before uploading.
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)

        # Upload the pickled model to S3 under models/.
        s3.upload_file(model_filename, bucket_name, f"models/{model_filename_for_s3}")
        print(f"Model saved to S3 as {model_filename_for_s3}")

        # R² of the best estimator on the held-out (most recent) slice.
        score = model.score(X_test, y_test)
        print(f"\nTest Score: {score:.4f}")

        # Quick sanity-check prediction on one plausible input row.
        test_values = {
            "Pressure": 999,
            "Temperature": 22,
            "Wind Speed": 10,
            "Humidity": 50,
            "Traffic Status": 0,
        }
        prediction = model.predict(pd.DataFrame([test_values]))
        print("\nTest prediction:", prediction)
        print("\nModel Parameters:", model.get_params())

if __name__ == "__main__":
    train_and_save_model()
|