Alex LASNIER commited on
Commit
26a413c
·
unverified ·
1 Parent(s): 0bebcc8

creation du dataset avec une ligne par timestamp

Browse files

creation du dataset avec une ligne par timestamp, inclut meteo, traffic et polluants

Files changed (1) hide show
  1. app/jedha_final_project.ipynb +580 -0
app/jedha_final_project.ipynb ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {},
5
+ "cell_type": "markdown",
6
+ "source": "# Libs",
7
+ "id": "dae9db5e62cec5e9"
8
+ },
9
+ {
10
+ "cell_type": "code",
11
+ "id": "initial_id",
12
+ "metadata": {
13
+ "collapsed": true,
14
+ "ExecuteTime": {
15
+ "end_time": "2025-07-09T19:43:39.841918Z",
16
+ "start_time": "2025-07-09T19:43:39.401113Z"
17
+ }
18
+ },
19
+ "source": [
20
+ "import os\n",
21
+ "\n",
22
+ "import boto3\n",
23
+ "import pandas as pd\n",
24
+ "# Charger les variables\n",
25
+ "from dotenv import load_dotenv\n"
26
+ ],
27
+ "outputs": [],
28
+ "execution_count": 1
29
+ },
30
+ {
31
+ "metadata": {},
32
+ "cell_type": "markdown",
33
+ "source": "# All",
34
+ "id": "8c0c6c3d85f13653"
35
+ },
36
+ {
37
+ "metadata": {
38
+ "ExecuteTime": {
39
+ "end_time": "2025-07-09T19:38:42.289222Z",
40
+ "start_time": "2025-07-09T19:38:16.883228Z"
41
+ }
42
+ },
43
+ "cell_type": "code",
44
+ "source": [
45
+ "df_traffic = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/comptages-routiers-permanents.csv',\n",
46
+ "                         sep=';', on_bad_lines='skip')\n",
47
+ "df_nox = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_NOX.csv', sep=',', on_bad_lines='skip')\n",
48
+ "df_O3 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_O3.csv', sep=',', on_bad_lines='skip')\n",
49
+ "df_pm10 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_pm10.csv', sep=',', on_bad_lines='skip')\n",
50
+ "df_pm25 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_pm25.csv', sep=',', on_bad_lines='skip')\n",
51
+ "df_meteo = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/H_75_latest-2024-2025.csv', sep=';')\n"
52
+ ],
53
+ "id": "96738dbb6b0524b6",
54
+ "outputs": [],
55
+ "execution_count": 2
56
+ },
57
+ {
58
+ "metadata": {},
59
+ "cell_type": "markdown",
60
+ "source": "# Meteo",
61
+ "id": "8a0a89e2100fc626"
62
+ },
63
+ {
64
+ "metadata": {},
65
+ "cell_type": "markdown",
66
+ "source": "## Clean",
67
+ "id": "84ec54a1e60f633"
68
+ },
69
+ {
70
+ "metadata": {},
71
+ "cell_type": "code",
72
+ "source": [
73
+ "# Convertir en format Date et renommer la colonne AAAAMMJJHH\n",
74
+ "df_meteo['AAAAMMJJHH'] = pd.to_datetime(df_meteo[\"AAAAMMJJHH\"], format=\"%Y%m%d%H\", utc=True)\n",
75
+ "df_meteo = df_meteo.rename(columns={\"AAAAMMJJHH\": \"Timestamp\"})\n",
76
+ "\n",
77
+ "# Supprimer toutes les colonnes où toutes les valeurs sont NaN\n",
78
+ "# Permet de passer de 204 colonnes a 98\n",
79
+ "df_meteo = df_meteo.dropna(how=\"all\", axis=1)\n",
80
+ "\n",
81
+ "# Supprimer les lignes où \"PARIS-MONTSOURIS-DOUBLE\" est dans la colonne \"NOM_USUEL\"\n",
82
+ "# Permet de passer de 80 k lignes à 65 k\n",
83
+ "df_meteo = df_meteo[~df_meteo['NOM_USUEL'].str.contains(\"PARIS-MONTSOURIS-DOUBLE\", na=False)]\n",
84
+ "\n",
85
+ "df_meteo.reset_index(inplace=True)\n",
86
+ "df_meteo = df_meteo.sort_values(by=['Timestamp'])"
87
+ ],
88
+ "id": "11f81e08321616c7",
89
+ "outputs": [],
90
+ "execution_count": null
91
+ },
92
+ {
93
+ "metadata": {},
94
+ "cell_type": "markdown",
95
+ "source": "## Pivot",
96
+ "id": "c4c59f29f647cd51"
97
+ },
98
+ {
99
+ "metadata": {},
100
+ "cell_type": "code",
101
+ "source": [
102
+ "# Pivoter le DataFrame\n",
103
+ "df_meteo_pivoted = df_meteo.set_index(['Timestamp', 'NOM_USUEL']).unstack()\n",
104
+ "df_meteo_pivoted = df_meteo_pivoted.drop(['index', 'NUM_POSTE', 'LAT', 'LON', 'ALTI'], axis=1)\n",
105
+ "\n",
106
+ "df_meteo_pivoted.columns = [f\"{station}_{var}\" for var, station in df_meteo_pivoted.columns]\n",
107
+ "df_meteo_pivoted = df_meteo_pivoted.reset_index()\n",
108
+ "\n",
109
+ "# Extraire les identifiants de station uniques\n",
110
+ "station_ids = sorted({col.split('_')[0] for col in df_meteo_pivoted.columns if '_' in col})\n",
111
+ "\n",
112
+ "# Réorganiser les colonnes\n",
113
+ "sorted_columns = ['Timestamp'] + [col for station in station_ids for col in df_meteo_pivoted.columns if\n",
114
+ " col.startswith(station + '_')]\n",
115
+ "\n",
116
+ "# Réorganiser le DataFrame\n",
117
+ "df_meteo_pivoted = df_meteo_pivoted[sorted_columns]\n",
118
+ "\n",
119
+ "df_meteo_pivoted"
120
+ ],
121
+ "id": "a0d4f42370a2cdca",
122
+ "outputs": [],
123
+ "execution_count": null
124
+ },
125
+ {
126
+ "metadata": {},
127
+ "cell_type": "code",
128
+ "source": [
129
+ "df_meteo_pivoted.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet',\n",
130
+ " index=False)"
131
+ ],
132
+ "id": "196b3e20978976ec",
133
+ "outputs": [],
134
+ "execution_count": null
135
+ },
136
+ {
137
+ "metadata": {},
138
+ "cell_type": "markdown",
139
+ "source": "# Pollutants",
140
+ "id": "c075b0ecc2339caa"
141
+ },
142
+ {
143
+ "metadata": {},
144
+ "cell_type": "markdown",
145
+ "source": "## Clean",
146
+ "id": "f0ea5ee496220a8c"
147
+ },
148
+ {
149
+ "metadata": {},
150
+ "cell_type": "code",
151
+ "source": [
152
+ "##################################################\n",
153
+ "# NOX\n",
154
+ "##################################################\n",
155
+ "\n",
156
+ "# Rename col Unnamed: 0 et convertir en format Date\n",
157
+ "df_nox = df_nox.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
158
+ "df_nox['Timestamp'] = pd.to_datetime(df_nox[\"Timestamp\"], utc=True)\n",
159
+ "\n",
160
+ "# 8 800 ; 40 columns vers 7 columns\n",
161
+ "# Liste des chaînes à rechercher dans les noms de colonnes\n",
162
+ "colonnes_a_garder = ['Timestamp', 'PA18', 'EIFF3', 'PA13', 'NEUIL', 'BONAP']\n",
163
+ "\n",
164
+ "# Filtrer les colonnes du DataFrame\n",
165
+ "df_nox = df_nox.loc[:,\n",
166
+ " df_nox.columns.isin(colonnes_a_garder) | df_nox.columns.str.contains('|'.join(colonnes_a_garder))]\n",
167
+ "\n",
168
+ "# Supprimer les lignes contenant NaN dans la colonne \"Timestamp\"\n",
169
+ "df_nox = df_nox.dropna(subset=['Timestamp'])\n",
170
+ "\n",
171
+ "# df_nox.reset_index(inplace=True)\n",
172
+ "df_nox = df_nox.sort_values(by=['Timestamp'])\n",
173
+ "\n",
174
+ "##################################################\n",
175
+ "# O3\n",
176
+ "##################################################\n",
177
+ "# Rename col Unnamed: 0 et convertir en format Date\n",
178
+ "df_O3 = df_O3.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
179
+ "df_O3['Timestamp'] = pd.to_datetime(df_O3[\"Timestamp\"], utc=True)\n",
180
+ "\n",
181
+ "# Liste des chaînes à rechercher dans les noms de colonnes\n",
182
+ "colonnes_a_garder = ['Timestamp', 'PA18', 'EIFF3', 'PA13', 'NEUIL', 'PA01H']\n",
183
+ "\n",
184
+ "# Filtrer les colonnes du DataFrame\n",
185
+ "df_O3 = df_O3.loc[:, df_O3.columns.isin(colonnes_a_garder) | df_O3.columns.str.contains('|'.join(colonnes_a_garder))]\n",
186
+ "\n",
187
+ "# Supprimer les lignes contenant NaN dans la colonne \"Timestamp\"\n",
188
+ "df_O3 = df_O3.dropna(subset=['Timestamp'])\n",
189
+ "\n",
190
+ "# df_O3.reset_index(inplace=True)\n",
191
+ "df_O3 = df_O3.sort_values(by=['Timestamp'])\n",
192
+ "\n",
193
+ "##################################################\n",
194
+ "# pm10\n",
195
+ "##################################################\n",
196
+ "# Rename col Unnamed: 0 et convertir en format Date\n",
197
+ "df_pm10 = df_pm10.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
198
+ "df_pm10['Timestamp'] = pd.to_datetime(df_pm10[\"Timestamp\"], utc=True)\n",
199
+ "\n",
200
+ "# Liste des chaînes à rechercher dans les noms de colonnes\n",
201
+ "colonnes_a_garder = ['Timestamp', 'PA18', 'ELYS', 'BASCH', 'AUT', 'PA01H']\n",
202
+ "\n",
203
+ "# Filtrer les colonnes du DataFrame\n",
204
+ "df_pm10 = df_pm10.loc[:,\n",
205
+ " df_pm10.columns.isin(colonnes_a_garder) | df_pm10.columns.str.contains('|'.join(colonnes_a_garder))]\n",
206
+ "\n",
207
+ "# Supprimer les lignes contenant NaN dans la colonne \"Timestamp\"\n",
208
+ "df_pm10 = df_pm10.dropna(subset=['Timestamp'])\n",
209
+ "\n",
210
+ "# df_pm10.reset_index(inplace=True)\n",
211
+ "df_pm10 = df_pm10.sort_values(by=['Timestamp'])\n",
212
+ "\n",
213
+ "##################################################\n",
214
+ "# pm25\n",
215
+ "##################################################\n",
216
+ "# Rename col Unnamed: 0 et convertir en format Date\n",
217
+ "df_pm25 = df_pm25.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
218
+ "df_pm25['Timestamp'] = pd.to_datetime(df_pm25[\"Timestamp\"], utc=True)\n",
219
+ "\n",
220
+ "# Liste des chaînes à rechercher dans les noms de colonnes\n",
221
+ "colonnes_a_garder = ['Timestamp', 'PA18', 'ELYS', 'AUT', 'PA01H']\n",
222
+ "\n",
223
+ "# Filtrer les colonnes du DataFrame\n",
224
+ "df_pm25 = df_pm25.loc[:,\n",
225
+ " df_pm25.columns.isin(colonnes_a_garder) | df_pm25.columns.str.contains('|'.join(colonnes_a_garder))]\n",
226
+ "\n",
227
+ "# Supprimer les lignes contenant NaN dans la colonne \"Timestamp\"\n",
228
+ "df_pm25 = df_pm25.dropna(subset=['Timestamp'])\n",
229
+ "\n",
230
+ "# df_pm25.reset_index(inplace=True)\n",
231
+ "df_pm25 = df_pm25.sort_values(by=['Timestamp'])\n"
232
+ ],
233
+ "id": "20e9485dea763097",
234
+ "outputs": [],
235
+ "execution_count": null
236
+ },
237
+ {
238
+ "metadata": {},
239
+ "cell_type": "markdown",
240
+ "source": "## Merge",
241
+ "id": "96cf48a9f7521fcd"
242
+ },
243
+ {
244
+ "metadata": {},
245
+ "cell_type": "code",
246
+ "source": [
247
+ "df_merged = pd.merge_asof(df_nox,\n",
248
+ " df_O3,\n",
249
+ " left_on='Timestamp',\n",
250
+ " right_on='Timestamp',\n",
251
+ " direction='nearest')\n",
252
+ "\n",
253
+ "df_merged = pd.merge_asof(df_merged,\n",
254
+ " df_pm10,\n",
255
+ " left_on='Timestamp',\n",
256
+ " right_on='Timestamp',\n",
257
+ " direction='nearest')\n",
258
+ "\n",
259
+ "df_merged = pd.merge_asof(df_merged,\n",
260
+ " df_pm25,\n",
261
+ " left_on='Timestamp',\n",
262
+ " right_on='Timestamp',\n",
263
+ " direction='nearest')\n"
264
+ ],
265
+ "id": "2db2ed91c9efda4b",
266
+ "outputs": [],
267
+ "execution_count": null
268
+ },
269
+ {
270
+ "metadata": {},
271
+ "cell_type": "markdown",
272
+ "source": "## Extract",
273
+ "id": "f13105d20628b7b0"
274
+ },
275
+ {
276
+ "metadata": {},
277
+ "cell_type": "code",
278
+ "source": [
279
+ "df_merged.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/pollutants_cleaned_pivoted.parquet',\n",
280
+ " index=False)\n"
281
+ ],
282
+ "id": "eaccdaee3f90298a",
283
+ "outputs": [],
284
+ "execution_count": null
285
+ },
286
+ {
287
+ "metadata": {},
288
+ "cell_type": "markdown",
289
+ "source": "# Traffic",
290
+ "id": "bb30b5e28f65c9bc"
291
+ },
292
+ {
293
+ "metadata": {},
294
+ "cell_type": "code",
295
+ "source": [
296
+ "# Convertir la colonne \"Date et heure de comptage\" en format Date\n",
297
+ "df_traffic['Date et heure de comptage'] = pd.to_datetime(df_traffic[\"Date et heure de comptage\"], utc=True)\n",
298
+ "df_traffic = df_traffic.rename(columns={\"Date et heure de comptage\": \"Timestamp\"})"
299
+ ],
300
+ "id": "de7fc1da2bf02136",
301
+ "outputs": [],
302
+ "execution_count": null
303
+ },
304
+ {
305
+ "metadata": {},
306
+ "cell_type": "markdown",
307
+ "source": "## Clean",
308
+ "id": "126558e93cf2c2a6"
309
+ },
310
+ {
311
+ "metadata": {
312
+ "ExecuteTime": {
313
+ "end_time": "2025-07-09T19:38:46.212441Z",
314
+ "start_time": "2025-07-09T19:38:42.317055Z"
315
+ }
316
+ },
317
+ "cell_type": "code",
318
+ "source": [
319
+ "# Rename \"Date et heure de comptage\" to Timestamp, then parse it as UTC; idempotent, so it is safe if the cell above already ran\n",
320
+ "df_traffic = df_traffic.rename(columns={\"Date et heure de comptage\": \"Timestamp\"})\n",
321
+ "df_traffic['Timestamp'] = pd.to_datetime(df_traffic[\"Timestamp\"], utc=True)\n",
322
+ "\n",
323
+ "# Filtrer les lignes contenant certaines valeurs dans la colonne \"Identifiant arc\"\n",
324
+ "ids = [1572, 1573, 4434, 4440, 728, 737, 5442, 5455, 615, 616]\n",
325
+ "\n",
326
+ "# Filtrer uniquement sur les identifiants\n",
327
+ "df_traffic = df_traffic[df_traffic['Identifiant arc'].isin(ids)]\n",
328
+ "\n",
329
+ "df_traffic = df_traffic.sort_values(by=['Timestamp'])"
330
+ ],
331
+ "id": "9c0ea39992c0f566",
332
+ "outputs": [],
333
+ "execution_count": 3
334
+ },
335
+ {
336
+ "metadata": {},
337
+ "cell_type": "code",
338
+ "source": [
339
+ "# NOTE: superseded draft, disabled. `df_traffic_filtered` is never defined in this notebook (NameError on Run All); the cells below build df_traffic_pivoted instead.\n",
340
+ "# df_traffic_filtered[\"ID_Libelle\"] = df_traffic_filtered[\"Identifiant arc\"].astype(str) + \"_\" + df_traffic_filtered[\"Libelle\"]\n",
341
+ "\n",
342
+ "# df_traffic_filtered_pivoted = df_traffic_filtered.set_index(['Timestamp', 'ID_Libelle']).unstack()\n",
343
+ "\n",
344
+ "# df_traffic_filtered_pivoted.columns = [f\"{station}_{var}\" for var, station in df_traffic_filtered_pivoted.columns]\n",
345
+ "# df_traffic_filtered_pivoted = df_traffic_filtered_pivoted.reset_index()\n",
346
+ "\n",
347
+ "# Unique station identifiers (the draft read df_meteo_pivoted here, not the traffic frame)\n",
348
+ "# ids_libelles = sorted({col.split('_')[0] for col in df_meteo_pivoted.columns if '_' in col})\n",
349
+ "\n",
350
+ "# Reorder the columns (the draft reused station_ids / df_meteo_pivoted from the meteo section)\n",
351
+ "# sorted_columns = ['Timestamp'] + [col for station in station_ids for col in df_meteo_pivoted.columns if\n",
352
+ "#  col.startswith(station)]\n",
353
+ "\n",
354
+ "# Reorder the DataFrame\n",
355
+ "# df_meteo_pivoted = df_meteo_pivoted[sorted_columns]\n",
356
+ "\n",
357
+ "\n",
358
+ "# # On \"pivot\" le DataFrame pour avoir une seule ligne par timestamp\n",
359
+ "# df_traffic_filtered_pivot = df_traffic_filtered.melt(id_vars=[\"Timestamp\", \"site_id\"],\n",
360
+ "# value_vars=[col for col in df_traffic_filtered.columns if col not in [\"Timestamp\", \"code_site\", \"Libelle\", \"Identifiant arc\"]],\n",
361
+ "# var_name=\"variable\", value_name=\"valeur\")\n",
362
+ "#\n",
363
+ "# # Création des noms de colonnes finaux\n",
364
+ "# df_traffic_filtered_pivot[\"colonne_finale\"] = df_traffic_filtered_pivot[\"site_id\"] + \"_\" + df_traffic_filtered_pivot[\"variable\"]\n",
365
+ "#\n",
366
+ "# # Restructuration du tableau\n",
367
+ "# df_traffic_filtered_final = df_traffic_filtered_pivot.pivot_table(index=\"Timestamp\", columns=\"colonne_finale\", values=\"valeur\").reset_index()\n"
368
+ ],
369
+ "id": "af9f5b3120eeb1d",
370
+ "outputs": [],
371
+ "execution_count": null
372
+ },
373
+ {
374
+ "metadata": {},
375
+ "cell_type": "code",
376
+ "source": [
377
+ "# Création d’un identifiant unique par site\n",
378
+ "df_traffic[\"ID_Libelle\"] = df_traffic[\"Identifiant arc\"].astype(str) + \"_\" + df_traffic[\n",
379
+ " \"Libelle\"]\n",
380
+ "df_traffic = df_traffic.drop(['Identifiant arc', 'Libelle'], axis=1)\n",
381
+ "\n",
382
+ "# Pivoter le DataFrame\n",
383
+ "df_traffic_pivoted = df_traffic.set_index(['Timestamp', 'ID_Libelle']).unstack()"
384
+ ],
385
+ "id": "70ccccef23c73b1b",
386
+ "outputs": [],
387
+ "execution_count": null
388
+ },
389
+ {
390
+ "metadata": {},
391
+ "cell_type": "code",
392
+ "source": "df_traffic_pivoted.columns",
393
+ "id": "d37d22c8734776fe",
394
+ "outputs": [],
395
+ "execution_count": null
396
+ },
397
+ {
398
+ "metadata": {},
399
+ "cell_type": "code",
400
+ "source": [
401
+ "\n",
402
+ "df_traffic_pivoted.columns = [f\"{station}_{var}\" for var, station in df_traffic_pivoted.columns]\n",
403
+ "df_traffic_pivoted = df_traffic_pivoted.reset_index()"
404
+ ],
405
+ "id": "e911bbf7d54cf3c8",
406
+ "outputs": [],
407
+ "execution_count": null
408
+ },
409
+ {
410
+ "metadata": {},
411
+ "cell_type": "code",
412
+ "source": [
413
+ "\n",
414
+ "# Extraire les identifiants de station uniques sans couper à chaque underscore\n",
415
+ "ids_libelles = sorted({col.rsplit('_', 1)[0] for col in df_traffic_pivoted.columns if col != 'Timestamp'})\n",
416
+ "\n",
417
+ "# Réorganiser les colonnes par station\n",
418
+ "sorted_columns = ['Timestamp'] + [\n",
419
+ " col for station in ids_libelles\n",
420
+ " for col in df_traffic_pivoted.columns\n",
421
+ " if col.startswith(station + \"_\")\n",
422
+ "]\n",
423
+ "\n",
424
+ "df_traffic_pivoted = df_traffic_pivoted[sorted_columns]\n"
425
+ ],
426
+ "id": "ad16251433d93b49",
427
+ "outputs": [],
428
+ "execution_count": null
429
+ },
430
+ {
431
+ "metadata": {},
432
+ "cell_type": "markdown",
433
+ "source": "## Extract",
434
+ "id": "973e0774ef72f46"
435
+ },
436
+ {
437
+ "metadata": {},
438
+ "cell_type": "code",
439
+ "source": [
440
+ "from collections import Counter\n",
441
+ "\n",
442
+ "# Liste des colonnes en double\n",
443
+ "col_counts = Counter(df_traffic_pivoted.columns)\n",
444
+ "duplicate_cols = [col for col, count in col_counts.items() if count > 1]\n",
445
+ "\n",
446
+ "print(\"Colonnes dupliquées :\", duplicate_cols)\n"
447
+ ],
448
+ "id": "c67fce5edffdd474",
449
+ "outputs": [],
450
+ "execution_count": null
451
+ },
452
+ {
453
+ "metadata": {},
454
+ "cell_type": "code",
455
+ "source": "df_traffic_pivoted = df_traffic_pivoted.loc[:, ~df_traffic_pivoted.columns.duplicated()]\n",
456
+ "id": "7f1085cb636fca55",
457
+ "outputs": [],
458
+ "execution_count": null
459
+ },
460
+ {
461
+ "metadata": {},
462
+ "cell_type": "code",
463
+ "source": [
464
+ "df_traffic_pivoted.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/traffic_cleaned_pivoted.parquet',\n",
465
+ " index=False)"
466
+ ],
467
+ "id": "c9d8fc584837b7cb",
468
+ "outputs": [],
469
+ "execution_count": null
470
+ },
471
+ {
472
+ "metadata": {},
473
+ "cell_type": "code",
474
+ "source": "df_traffic_pivoted.shape",
475
+ "id": "bc51ff55f46b1d09",
476
+ "outputs": [],
477
+ "execution_count": null
478
+ },
479
+ {
480
+ "metadata": {},
481
+ "cell_type": "markdown",
482
+ "source": "# Merge final",
483
+ "id": "9971bbc11c27dbdf"
484
+ },
485
+ {
486
+ "metadata": {
487
+ "ExecuteTime": {
488
+ "end_time": "2025-07-09T19:44:09.673695Z",
489
+ "start_time": "2025-07-09T19:44:09.463956Z"
490
+ }
491
+ },
492
+ "cell_type": "code",
493
+ "source": [
494
+ "df_traffic = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/traffic_cleaned_pivoted.parquet')\n",
495
+ "df_meteo = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet')\n",
496
+ "df_pollutants = pd.read_parquet(\n",
497
+ " '/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/pollutants_cleaned_pivoted.parquet')\n",
498
+ "\n",
499
+ "df_traffic = df_traffic.sort_values(by=['Timestamp'])\n",
500
+ "df_meteo = df_meteo.sort_values(by=['Timestamp'])\n",
501
+ "df_pollutants = df_pollutants.sort_values(by=['Timestamp'])\n",
502
+ "\n",
503
+ "# Merge on the nearest time values\n",
504
+ "df_merged = pd.merge_asof(df_traffic,\n",
505
+ " df_meteo,\n",
506
+ " left_on='Timestamp',\n",
507
+ " right_on='Timestamp',\n",
508
+ " direction='nearest')\n",
509
+ "\n",
510
+ "df_merged = pd.merge_asof(df_merged,\n",
511
+ " df_pollutants,\n",
512
+ " left_on='Timestamp',\n",
513
+ " right_on='Timestamp',\n",
514
+ " direction='nearest')\n",
515
+ "\n",
516
+ "df_merged = df_merged.sort_values(by=['Timestamp'])"
517
+ ],
518
+ "id": "ed106c330d7fe155",
519
+ "outputs": [],
520
+ "execution_count": 2
521
+ },
522
+ {
523
+ "metadata": {},
524
+ "cell_type": "markdown",
525
+ "source": "# Upload to S3",
526
+ "id": "72d1e27b43e8f51"
527
+ },
528
+ {
529
+ "metadata": {
530
+ "ExecuteTime": {
531
+ "end_time": "2025-07-09T19:44:54.064492Z",
532
+ "start_time": "2025-07-09T19:44:53.921133Z"
533
+ }
534
+ },
535
+ "cell_type": "code",
536
+ "source": "df_merged.to_parquet('2024_semester2_merged_v2.parquet', engine='pyarrow')\n",
537
+ "id": "c5f2ca648dc532e0",
538
+ "outputs": [],
539
+ "execution_count": 3
540
+ },
541
+ {
542
+ "metadata": {},
543
+ "cell_type": "markdown",
544
+ "source": "# CURIOSITY",
545
+ "id": "e83dca08dee6a881"
546
+ },
547
+ {
548
+ "metadata": {},
549
+ "cell_type": "code",
550
+ "source": [
551
+ "meteo = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet')\n",
552
+ "pollutants = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/pollutants_cleaned_pivoted.parquet')"
553
+ ],
554
+ "id": "346726ba01317db",
555
+ "outputs": [],
556
+ "execution_count": null
557
+ }
558
+ ],
559
+ "metadata": {
560
+ "kernelspec": {
561
+ "display_name": "Python 3",
562
+ "language": "python",
563
+ "name": "python3"
564
+ },
565
+ "language_info": {
566
+ "codemirror_mode": {
567
+ "name": "ipython",
568
+ "version": 2
569
+ },
570
+ "file_extension": ".py",
571
+ "mimetype": "text/x-python",
572
+ "name": "python",
573
+ "nbconvert_exporter": "python",
574
+ "pygments_lexer": "ipython2",
575
+ "version": "2.7.6"
576
+ }
577
+ },
578
+ "nbformat": 4,
579
+ "nbformat_minor": 5
580
+ }