martper56 committed on
Commit
c65b1a0
·
1 Parent(s): 9acaa60

import training code from main branch

Browse files
{src/ml → app}/jedha_final_project.ipynb RENAMED
File without changes
src/ml/process_for_ml.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
src/ml/requirements.txt CHANGED
@@ -2,4 +2,5 @@ pyarrow
2
  pandas
3
  scikit-learn
4
  mlflow
5
- boto3
 
 
2
  pandas
3
  scikit-learn
4
  mlflow
5
+ boto3
6
+ python-dotenv
src/train_docker/2024_semester2_merged_v2.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a46526982934366fbae9da1f4b0abcea000140c6070ae1f2ce966c7eec8869a
3
+ size 1302875
src/train_docker/Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official slim Python 3.12.11 image as base
FROM python:3.12.11-slim

# Set working directory inside the container
WORKDIR /src/train_docker

# Copy requirements.txt first so the dependency layer is cached
# and only rebuilt when the requirements change
COPY requirements.txt .

# Install dependencies.
# --no-cache-dir keeps pip's download cache out of the image layer,
# noticeably reducing the final image size.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Default command to run the training script
CMD ["python", "train.py"]
src/train_docker/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pyarrow>=20.0.0
2
+ pandas>=2.3.1
3
+ scikit-learn>=1.7.0
4
+ mlflow>=3.1.1
5
+ boto3>=1.39.3
6
+ python-dotenv
src/train_docker/train.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script is used to train a model and save it to S3.
3
+ It will require the following environment variables:
4
+ - AWS_ACCESS_KEY_ID
5
+ - AWS_SECRET_ACCESS_KEY
6
+ - AWS_REGION
7
+ - MLFLOW_TRACKING_URI
8
+ """
9
+
10
+
11
+ import pandas as pd
12
+ import mlflow
13
+ import mlflow.sklearn
14
+ import pickle
15
+ import boto3
16
+ import datetime
17
+ from sklearn.model_selection import train_test_split
18
+ from sklearn.ensemble import RandomForestRegressor
19
+ from sklearn.model_selection import GridSearchCV
20
+ import os
21
+ from dotenv import load_dotenv
22
+
23
def average_columns_with_suffix(df, suffix):
    """Return the row-wise mean of every column whose name ends with *suffix*.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are scanned for the suffix.
    suffix : str
        Literal column-name suffix (treated verbatim, not as a regex).

    Returns
    -------
    pandas.Series
        Row-wise mean of the matching columns. Non-numeric cells are
        coerced to NaN first, so they are skipped by the mean instead
        of raising.
    """
    # Select columns by literal suffix. The previous implementation used
    # df.filter(regex=f".*{suffix}"), which matched the suffix anywhere in
    # the name and did not escape regex metacharacters in `suffix`;
    # str.endswith is both safer and matches the function's documented
    # contract.
    cols = df.loc[:, df.columns.str.endswith(suffix)]

    # Coerce all values to numeric; strings such as "Inconnu" become NaN
    # (errors='coerce') and are ignored by mean().
    cols = cols.apply(pd.to_numeric, errors="coerce")

    # Row-wise average across the matching sensor/station columns.
    return cols.mean(axis=1)
33
+
34
def load_and_process_data(parquet_path):
    """Load the merged parquet file and build the modelling DataFrame.

    Parameters
    ----------
    parquet_path : str
        Path to the parquet file produced by the data-merge step.

    Returns
    -------
    pandas.DataFrame
        One averaged feature per group of sensor columns (Pressure,
        Temperature, Wind Speed, Humidity, Traffic Status) plus the four
        pollutant targets (NOX, PM10, PM25, O3), with all-NaN rows dropped.
    """
    df = pd.read_parquet(parquet_path)

    # Whole-frame replace instead of the previous per-column
    # apply(lambda x: x.replace(...)). The dict form maps "Inconnu" -> None
    # unambiguously (the scalar form with value=None historically triggered
    # pandas' deprecated pad/ffill behavior).
    df = df.replace({"Inconnu": None})

    # Ordinal encoding of the textual traffic state, aligned with the
    # realtime API vocabulary.
    traffic_status = {
        None: None,
        "Fluide": 0.,       # freeflow in realtime api
        "Pré-saturé": 1.,   # heavy in realtime api
        "Saturé": 1.,       # heavy in realtime api
        "Bloqué": 2.        # congested in realtime api
    }

    # Replace values in columns ending with 'Etat trafic'
    for col in df.columns:
        if col.endswith("Etat trafic"):
            df[col] = df[col].map(traffic_status)

    # Aggregate each group of per-station columns into a single averaged
    # feature. (The old code also copied Timestamp into the concat and then
    # immediately dropped it — that dead work is removed.)
    dict_of_columns = {
        "Pressure": average_columns_with_suffix(df, "_PSTAT"),
        "Temperature": average_columns_with_suffix(df, "_T"),
        "Wind Speed": average_columns_with_suffix(df, "_FF"),
        "Humidity": average_columns_with_suffix(df, "_U"),
        "Traffic Status": average_columns_with_suffix(df, "_Etat trafic"),
        "NOX": average_columns_with_suffix(df, "NOX"),
        "PM10": average_columns_with_suffix(df, "PM10"),
        "PM25": average_columns_with_suffix(df, "PM25"),
        "O3": average_columns_with_suffix(df, "O3"),
    }

    final_df = pd.concat(dict_of_columns, axis=1)
    # Drop rows where any averaged feature or target is missing.
    final_df.dropna(inplace=True)

    return final_df
73
+
74
def train_and_save_model(
    parquet_path="2024_semester2_merged_v2.parquet",
    bucket_name="jedha-quality-air",
):
    """Train a multi-output RandomForest regressor and upload it to S3.

    The run is tracked in MLflow (experiment "air_quality_prediction")
    with sklearn autologging enabled. The best grid-search estimator is
    pickled locally, then uploaded to ``s3://<bucket_name>/models/`` with
    a timestamped filename.

    Parameters
    ----------
    parquet_path : str, optional
        Path to the training parquet file. Defaults to the file shipped
        alongside this script (previously hard-coded).
    bucket_name : str, optional
        Target S3 bucket for the model artifact (previously hard-coded).

    Required environment variables (loaded via dotenv):
    AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION,
    MLFLOW_TRACKING_URI.
    """
    # Load environment variables from a .env file if present.
    load_dotenv()

    # AWS S3 session built from the environment credentials.
    session = boto3.Session(
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        region_name=os.getenv("AWS_REGION"),
    )
    s3 = session.client("s3")

    # Configure MLflow tracking and sklearn autologging.
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
    mlflow.set_experiment("air_quality_prediction")
    mlflow.sklearn.autolog()

    with mlflow.start_run():
        # Load and prepare the feature/target DataFrame.
        final_df = load_and_process_data(parquet_path)

        # Features and multi-output targets.
        x_columns = ["Pressure", "Temperature", "Wind Speed", "Humidity", "Traffic Status"]
        y_columns = ["NOX", "PM10", "PM25", "O3"]
        X = final_df[x_columns].copy()
        y = final_df[y_columns].copy()

        # shuffle=False preserves chronological order — presumably because
        # this is time-series data; confirm before changing.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, shuffle=False
        )

        # Grid search over forest size only; other hyperparameters keep
        # sklearn defaults.
        param_grid = {
            "n_estimators": [5, 10, 20, 50, 100, 200, 300],
        }
        grid_search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_grid=param_grid,
            cv=3,
            n_jobs=-1,
            scoring="r2",
        )

        model_base_name = "random_forest_grid_search"
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_

        # Local filename is stable; the S3 key is timestamped so successive
        # runs never overwrite each other.
        model_filename = model_base_name + ".pkl"
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        model_filename_for_s3 = f"{model_base_name}_{timestamp}.pkl"

        # Save locally.
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)

        # Upload to S3.
        s3.upload_file(model_filename, bucket_name, f"models/{model_filename_for_s3}")
        print(f"Model saved to S3 as {model_filename_for_s3}")

        # Evaluate on the held-out (most recent) slice.
        score = model.score(X_test, y_test)
        print(f"\nTest Score: {score:.4f}")

        # Smoke-test prediction on a plausible hand-picked observation.
        test_values = {
            "Pressure": 999,
            "Temperature": 22,
            "Wind Speed": 10,
            "Humidity": 50,
            "Traffic Status": 0,
        }
        prediction = model.predict(pd.DataFrame([test_values]))
        print("\nTest prediction:", prediction)
        print("\nModel Parameters:", model.get_params())
153
+
154
# Script entry point: run the training pipeline only when executed
# directly (e.g. as the Docker CMD), not when imported as a module.
if __name__ == "__main__":
    train_and_save_model()