Alquilar78 committed on
Commit 7b61a9b · 1 Parent(s): b7ff5e0

Removed obsolete files + updated requirements

airflow/logs/test DELETED
File without changes
airflow/plugins/test DELETED
File without changes
app/jedha_final_project.ipynb DELETED
@@ -1,580 +0,0 @@
- {
-  "cells": [
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Libs",
-    "id": "dae9db5e62cec5e9"
-   },
-   {
-    "cell_type": "code",
-    "id": "initial_id",
-    "metadata": {
-     "collapsed": true,
-     "ExecuteTime": {
-      "end_time": "2025-07-09T19:43:39.841918Z",
-      "start_time": "2025-07-09T19:43:39.401113Z"
-     }
-    },
-    "source": [
-     "import os\n",
-     "\n",
-     "import boto3\n",
-     "import pandas as pd\n",
-     "# Load the environment variables\n",
-     "from dotenv import load_dotenv\n"
-    ],
-    "outputs": [],
-    "execution_count": 1
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# All",
-    "id": "8c0c6c3d85f13653"
-   },
-   {
-    "metadata": {
-     "ExecuteTime": {
-      "end_time": "2025-07-09T19:38:42.289222Z",
-      "start_time": "2025-07-09T19:38:16.883228Z"
-     }
-    },
-    "cell_type": "code",
-    "source": [
-     "# df_traffic = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/comptages-routiers-permanents.csv',\n",
-     "#                          sep=';', on_bad_lines='skip')\n",
-     "# df_nox = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_NOX.csv', sep=',', on_bad_lines='skip')\n",
-     "# df_O3 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_O3.csv', sep=',', on_bad_lines='skip')\n",
-     "# df_pm10 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_pm10.csv', sep=',', on_bad_lines='skip')\n",
-     "# df_pm25 = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/2024_pm25.csv', sep=',', on_bad_lines='skip')\n",
-     "# df_meteo = pd.read_csv('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/H_75_latest-2024-2025.csv', sep=';')\n"
-    ],
-    "id": "96738dbb6b0524b6",
-    "outputs": [],
-    "execution_count": 2
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Meteo",
-    "id": "8a0a89e2100fc626"
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Clean",
-    "id": "84ec54a1e60f633"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "# Convert to datetime and rename the AAAAMMJJHH column\n",
-     "df_meteo['AAAAMMJJHH'] = pd.to_datetime(df_meteo[\"AAAAMMJJHH\"], format=\"%Y%m%d%H\", utc=True)\n",
-     "df_meteo = df_meteo.rename(columns={\"AAAAMMJJHH\": \"Timestamp\"})\n",
-     "\n",
-     "# Drop every column whose values are all NaN\n",
-     "# Goes from 204 columns down to 98\n",
-     "df_meteo = df_meteo.dropna(how=\"all\", axis=1)\n",
-     "\n",
-     "# Drop the rows where \"PARIS-MONTSOURIS-DOUBLE\" appears in the \"NOM_USUEL\" column\n",
-     "# Goes from 80k rows down to 65k\n",
-     "df_meteo = df_meteo[~df_meteo['NOM_USUEL'].str.contains(\"PARIS-MONTSOURIS-DOUBLE\", na=False)]\n",
-     "\n",
-     "df_meteo.reset_index(inplace=True)\n",
-     "df_meteo = df_meteo.sort_values(by=['Timestamp'])"
-    ],
-    "id": "11f81e08321616c7",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Pivot",
-    "id": "c4c59f29f647cd51"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "# Pivot the DataFrame\n",
-     "df_meteo_pivoted = df_meteo.set_index(['Timestamp', 'NOM_USUEL']).unstack()\n",
-     "df_meteo_pivoted = df_meteo_pivoted.drop(['index', 'NUM_POSTE', 'LAT', 'LON', 'ALTI'], axis=1)\n",
-     "\n",
-     "df_meteo_pivoted.columns = [f\"{station}_{var}\" for var, station in df_meteo_pivoted.columns]\n",
-     "df_meteo_pivoted = df_meteo_pivoted.reset_index()\n",
-     "\n",
-     "# Extract the unique station identifiers\n",
-     "station_ids = sorted({col.split('_')[0] for col in df_meteo_pivoted.columns if '_' in col})\n",
-     "\n",
-     "# Reorder the columns\n",
-     "sorted_columns = ['Timestamp'] + [col for station in station_ids for col in df_meteo_pivoted.columns if\n",
-     "                                  col.startswith(station)]\n",
-     "\n",
-     "# Reorder the DataFrame\n",
-     "df_meteo_pivoted = df_meteo_pivoted[sorted_columns]\n",
-     "\n",
-     "df_meteo_pivoted"
-    ],
-    "id": "a0d4f42370a2cdca",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "df_meteo_pivoted.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet',\n",
-     "                            index=False)"
-    ],
-    "id": "196b3e20978976ec",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Pollutants",
-    "id": "c075b0ecc2339caa"
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Clean",
-    "id": "f0ea5ee496220a8c"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "##################################################\n",
-     "# NOX\n",
-     "##################################################\n",
-     "\n",
-     "# Rename the Unnamed: 0 column and convert to datetime\n",
-     "df_nox = df_nox.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
-     "df_nox['Timestamp'] = pd.to_datetime(df_nox[\"Timestamp\"], utc=True)\n",
-     "\n",
-     "# 8,800 rows; from 40 columns down to 7\n",
-     "# List of substrings to look for in the column names\n",
-     "colonnes_a_garder = ['Timestamp', 'PA18', 'EIFF3', 'PA13', 'NEUIL', 'BONAP']\n",
-     "\n",
-     "# Filter the DataFrame columns\n",
-     "df_nox = df_nox.loc[:,\n",
-     "         df_nox.columns.isin(colonnes_a_garder) | df_nox.columns.str.contains('|'.join(colonnes_a_garder))]\n",
-     "\n",
-     "# Drop the rows with NaN in the \"Timestamp\" column\n",
-     "df_nox = df_nox.dropna(subset=['Timestamp'])\n",
-     "\n",
-     "# df_nox.reset_index(inplace=True)\n",
-     "df_nox = df_nox.sort_values(by=['Timestamp'])\n",
-     "\n",
-     "##################################################\n",
-     "# O3\n",
-     "##################################################\n",
-     "# Rename the Unnamed: 0 column and convert to datetime\n",
-     "df_O3 = df_O3.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
-     "df_O3['Timestamp'] = pd.to_datetime(df_O3[\"Timestamp\"], utc=True)\n",
-     "\n",
-     "# List of substrings to look for in the column names\n",
-     "colonnes_a_garder = ['Timestamp', 'PA18', 'EIFF3', 'PA13', 'NEUIL', 'PA01H']\n",
-     "\n",
-     "# Filter the DataFrame columns\n",
-     "df_O3 = df_O3.loc[:, df_O3.columns.isin(colonnes_a_garder) | df_O3.columns.str.contains('|'.join(colonnes_a_garder))]\n",
-     "\n",
-     "# Drop the rows with NaN in the \"Timestamp\" column\n",
-     "df_O3 = df_O3.dropna(subset=['Timestamp'])\n",
-     "\n",
-     "# df_O3.reset_index(inplace=True)\n",
-     "df_O3 = df_O3.sort_values(by=['Timestamp'])\n",
-     "\n",
-     "##################################################\n",
-     "# pm10\n",
-     "##################################################\n",
-     "# Rename the Unnamed: 0 column and convert to datetime\n",
-     "df_pm10 = df_pm10.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
-     "df_pm10['Timestamp'] = pd.to_datetime(df_pm10[\"Timestamp\"], utc=True)\n",
-     "\n",
-     "# List of substrings to look for in the column names\n",
-     "colonnes_a_garder = ['Timestamp', 'PA18', 'ELYS', 'BASCH', 'AUT', 'PA01H']\n",
-     "\n",
-     "# Filter the DataFrame columns\n",
-     "df_pm10 = df_pm10.loc[:,\n",
-     "          df_pm10.columns.isin(colonnes_a_garder) | df_pm10.columns.str.contains('|'.join(colonnes_a_garder))]\n",
-     "\n",
-     "# Drop the rows with NaN in the \"Timestamp\" column\n",
-     "df_pm10 = df_pm10.dropna(subset=['Timestamp'])\n",
-     "\n",
-     "# df_pm10.reset_index(inplace=True)\n",
-     "df_pm10 = df_pm10.sort_values(by=['Timestamp'])\n",
-     "\n",
-     "##################################################\n",
-     "# pm25\n",
-     "##################################################\n",
-     "# Rename the Unnamed: 0 column and convert to datetime\n",
-     "df_pm25 = df_pm25.rename(columns={\"Unnamed: 0\": \"Timestamp\"})\n",
-     "df_pm25['Timestamp'] = pd.to_datetime(df_pm25[\"Timestamp\"], utc=True)\n",
-     "\n",
-     "# List of substrings to look for in the column names\n",
-     "colonnes_a_garder = ['Timestamp', 'PA18', 'ELYS', 'AUT', 'PA01H']\n",
-     "\n",
-     "# Filter the DataFrame columns\n",
-     "df_pm25 = df_pm25.loc[:,\n",
-     "          df_pm25.columns.isin(colonnes_a_garder) | df_pm25.columns.str.contains('|'.join(colonnes_a_garder))]\n",
-     "\n",
-     "# Drop the rows with NaN in the \"Timestamp\" column\n",
-     "df_pm25 = df_pm25.dropna(subset=['Timestamp'])\n",
-     "\n",
-     "# df_pm25.reset_index(inplace=True)\n",
-     "df_pm25 = df_pm25.sort_values(by=['Timestamp'])\n"
-    ],
-    "id": "20e9485dea763097",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Merge",
-    "id": "96cf48a9f7521fcd"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "df_merged = pd.merge_asof(df_nox,\n",
-     "                          df_O3,\n",
-     "                          left_on='Timestamp',\n",
-     "                          right_on='Timestamp',\n",
-     "                          direction='nearest')\n",
-     "\n",
-     "df_merged = pd.merge_asof(df_merged,\n",
-     "                          df_pm10,\n",
-     "                          left_on='Timestamp',\n",
-     "                          right_on='Timestamp',\n",
-     "                          direction='nearest')\n",
-     "\n",
-     "df_merged = pd.merge_asof(df_merged,\n",
-     "                          df_pm25,\n",
-     "                          left_on='Timestamp',\n",
-     "                          right_on='Timestamp',\n",
-     "                          direction='nearest')\n"
-    ],
-    "id": "2db2ed91c9efda4b",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Extract",
-    "id": "f13105d20628b7b0"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "df_merged.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/df_pollutants_cleaned_pivoted.parquet',\n",
-     "                     index=False)\n"
-    ],
-    "id": "eaccdaee3f90298a",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Traffic",
-    "id": "bb30b5e28f65c9bc"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "# Convert the \"Date et heure de comptage\" column to datetime\n",
-     "df_traffic['Date et heure de comptage'] = pd.to_datetime(df_traffic[\"Date et heure de comptage\"], utc=True)\n",
-     "df_traffic = df_traffic.rename(columns={\"Date et heure de comptage\": \"Timestamp\"})"
-    ],
-    "id": "de7fc1da2bf02136",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Clean",
-    "id": "126558e93cf2c2a6"
-   },
-   {
-    "metadata": {
-     "ExecuteTime": {
-      "end_time": "2025-07-09T19:38:46.212441Z",
-      "start_time": "2025-07-09T19:38:42.317055Z"
-     }
-    },
-    "cell_type": "code",
-    "source": [
-     "# Convert the \"Date et heure de comptage\" column to datetime\n",
-     "df_traffic['Date et heure de comptage'] = pd.to_datetime(df_traffic[\"Date et heure de comptage\"], utc=True)\n",
-     "df_traffic = df_traffic.rename(columns={\"Date et heure de comptage\": \"Timestamp\"})\n",
-     "\n",
-     "# Keep only the rows whose \"Identifiant arc\" value is in the list below\n",
-     "ids = [1572, 1573, 4434, 4440, 728, 737, 5442, 5455, 615, 616]\n",
-     "\n",
-     "# Filter on the identifiers only\n",
-     "df_traffic = df_traffic[df_traffic['Identifiant arc'].isin(ids)]\n",
-     "\n",
-     "df_traffic = df_traffic.sort_values(by=['Timestamp'])"
-    ],
-    "id": "9c0ea39992c0f566",
-    "outputs": [],
-    "execution_count": 3
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "# Build a unique identifier per site\n",
-     "df_traffic_filtered[\"ID_Libelle\"] = df_traffic_filtered[\"Identifiant arc\"].astype(str) + \"_\" + df_traffic_filtered[\"Libelle\"]\n",
-     "\n",
-     "df_traffic_filtered_pivoted = df_traffic_filtered.set_index(['Timestamp', 'ID_Libelle']).unstack()\n",
-     "\n",
-     "df_traffic_filtered_pivoted.columns = [f\"{station}_{var}\" for var, station in df_traffic_filtered_pivoted.columns]\n",
-     "df_traffic_filtered_pivoted = df_traffic_filtered_pivoted.reset_index()\n",
-     "\n",
-     "# Extract the unique station identifiers\n",
-     "ids_libelles = sorted({col.split('_')[0] for col in df_meteo_pivoted.columns if '_' in col})\n",
-     "\n",
-     "# Reorder the columns\n",
-     "sorted_columns = ['Timestamp'] + [col for station in station_ids for col in df_meteo_pivoted.columns if\n",
-     "                                  col.startswith(station)]\n",
-     "\n",
-     "# Reorder the DataFrame\n",
-     "df_meteo_pivoted = df_meteo_pivoted[sorted_columns]\n",
-     "\n",
-     "\n",
-     "# # \"Pivot\" the DataFrame to get a single row per timestamp\n",
-     "# df_traffic_filtered_pivot = df_traffic_filtered.melt(id_vars=[\"Timestamp\", \"site_id\"],\n",
-     "#                                                      value_vars=[col for col in df_traffic_filtered.columns if col not in [\"Timestamp\", \"code_site\", \"Libelle\", \"Identifiant arc\"]],\n",
-     "#                                                      var_name=\"variable\", value_name=\"valeur\")\n",
-     "#\n",
-     "# # Build the final column names\n",
-     "# df_traffic_filtered_pivot[\"colonne_finale\"] = df_traffic_filtered_pivot[\"site_id\"] + \"_\" + df_traffic_filtered_pivot[\"variable\"]\n",
-     "#\n",
-     "# # Restructure the table\n",
-     "# df_traffic_filtered_final = df_traffic_filtered_pivot.pivot_table(index=\"Timestamp\", columns=\"colonne_finale\", values=\"valeur\").reset_index()\n"
-    ],
-    "id": "af9f5b3120eeb1d",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "# Build a unique identifier per site\n",
-     "df_traffic[\"ID_Libelle\"] = df_traffic[\"Identifiant arc\"].astype(str) + \"_\" + df_traffic[\n",
-     "    \"Libelle\"]\n",
-     "df_traffic = df_traffic.drop(['Identifiant arc', 'Libelle'], axis=1)\n",
-     "\n",
-     "# Pivot the DataFrame\n",
-     "df_traffic_pivoted = df_traffic.set_index(['Timestamp', 'ID_Libelle']).unstack()"
-    ],
-    "id": "70ccccef23c73b1b",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": "df_traffic_pivoted.columns",
-    "id": "d37d22c8734776fe",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "\n",
-     "df_traffic_pivoted.columns = [f\"{station}_{var}\" for var, station in df_traffic_pivoted.columns]\n",
-     "df_traffic_pivoted = df_traffic_pivoted.reset_index()"
-    ],
-    "id": "e911bbf7d54cf3c8",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "\n",
-     "# Extract unique station identifiers without splitting at every underscore\n",
-     "ids_libelles = sorted({col.rsplit('_', 1)[0] for col in df_traffic_pivoted.columns if col != 'Timestamp'})\n",
-     "\n",
-     "# Reorder the columns by station\n",
-     "sorted_columns = ['Timestamp'] + [\n",
-     "    col for station in ids_libelles\n",
-     "    for col in df_traffic_pivoted.columns\n",
-     "    if col.startswith(station + \"_\")\n",
-     "]\n",
-     "\n",
-     "df_traffic_pivoted = df_traffic_pivoted[sorted_columns]\n"
-    ],
-    "id": "ad16251433d93b49",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "## Extract",
-    "id": "973e0774ef72f46"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "from collections import Counter\n",
-     "\n",
-     "# List the duplicated columns\n",
-     "col_counts = Counter(df_traffic_pivoted.columns)\n",
-     "duplicate_cols = [col for col, count in col_counts.items() if count > 1]\n",
-     "\n",
-     "print(\"Duplicated columns:\", duplicate_cols)\n"
-    ],
-    "id": "c67fce5edffdd474",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": "df_traffic_pivoted = df_traffic_pivoted.loc[:, ~df_traffic_pivoted.columns.duplicated()]\n",
-    "id": "7f1085cb636fca55",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "df_traffic_pivoted.to_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/traffic_cleaned_pivoted.parquet',\n",
-     "                              index=False)"
-    ],
-    "id": "c9d8fc584837b7cb",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": "df_traffic_pivoted.shape",
-    "id": "bc51ff55f46b1d09",
-    "outputs": [],
-    "execution_count": null
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Final merge",
-    "id": "9971bbc11c27dbdf"
-   },
-   {
-    "metadata": {
-     "ExecuteTime": {
-      "end_time": "2025-07-09T19:44:09.673695Z",
-      "start_time": "2025-07-09T19:44:09.463956Z"
-     }
-    },
-    "cell_type": "code",
-    "source": [
-     "df_traffic = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/traffic_cleaned_pivoted.parquet')\n",
-     "df_meteo = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet')\n",
-     "df_pollutants = pd.read_parquet(\n",
-     "    '/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/pollutants_cleaned_pivoted.parquet')\n",
-     "\n",
-     "df_traffic = df_traffic.sort_values(by=['Timestamp'])\n",
-     "df_meteo = df_meteo.sort_values(by=['Timestamp'])\n",
-     "df_pollutants = df_pollutants.sort_values(by=['Timestamp'])\n",
-     "\n",
-     "# Merge on the nearest time values\n",
-     "df_merged = pd.merge_asof(df_traffic,\n",
-     "                          df_meteo,\n",
-     "                          left_on='Timestamp',\n",
-     "                          right_on='Timestamp',\n",
-     "                          direction='nearest')\n",
-     "\n",
-     "df_merged = pd.merge_asof(df_merged,\n",
-     "                          df_pollutants,\n",
-     "                          left_on='Timestamp',\n",
-     "                          right_on='Timestamp',\n",
-     "                          direction='nearest')\n",
-     "\n",
-     "df_merged = df_merged.sort_values(by=['Timestamp'])"
-    ],
-    "id": "ed106c330d7fe155",
-    "outputs": [],
-    "execution_count": 2
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# Upload to S3",
-    "id": "72d1e27b43e8f51"
-   },
-   {
-    "metadata": {
-     "ExecuteTime": {
-      "end_time": "2025-07-09T19:44:54.064492Z",
-      "start_time": "2025-07-09T19:44:53.921133Z"
-     }
-    },
-    "cell_type": "code",
-    "source": "df_merged.to_parquet('2024_semester2_merged_v2.parquet', engine='pyarrow')\n",
-    "id": "c5f2ca648dc532e0",
-    "outputs": [],
-    "execution_count": 3
-   },
-   {
-    "metadata": {},
-    "cell_type": "markdown",
-    "source": "# CURIOSITY",
-    "id": "e83dca08dee6a881"
-   },
-   {
-    "metadata": {},
-    "cell_type": "code",
-    "source": [
-     "meteo = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/meteo_cleaned_pivoted.parquet')\n",
-     "pollutants = pd.read_parquet('/Users/a.lasnier/Desktop/dsl_ft_32/quality-air/data/pollutants_cleaned_pivoted.parquet')"
-    ],
-    "id": "346726ba01317db",
-    "outputs": [],
-    "execution_count": null
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "Python 3",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 2
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython2",
-    "version": "2.7.6"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
- }
etl/__init__.py DELETED
File without changes
etl/traffic_rennes.py DELETED
@@ -1,63 +0,0 @@
- import requests
- import pandas as pd
- import logging
-
- # Configure the logger at module level
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- def fetch_trafic_data():
-     """Fetches traffic data from Rennes Métropole"""
-     url = "https://data.rennesmetropole.fr/api/explore/v2.1/catalog/datasets/etat-du-trafic-en-temps-reel/records"
-     params = {
-         "select": "datetime,denomination,averagevehiclespeed,traveltime,trafficstatus",
-         "where": "averagevehiclespeed > 0 and trafficstatus != 'unknown'",
-         "order_by": "datetime desc",
-         "limit": 100,
-         "timezone": "Europe/Paris"
-     }
-     try:
-         response = requests.get(url, params=params)
-         response.raise_for_status()
-         logger.info("✅ Data successfully fetched from the Rennes Métropole API.")
-         return response.json()["results"]
-     except Exception as e:
-         logger.error(f"❌ Error while fetching the data: {e}")
-         raise
-
- def process_data(data):
-     """Cleans the data and aggregates the latest snapshot per road"""
-     df = pd.DataFrame(data)
-     df["datetime"] = pd.to_datetime(df["datetime"])
-     df["averagevehiclespeed"] = pd.to_numeric(df["averagevehiclespeed"], errors="coerce")
-     df["traveltime"] = pd.to_numeric(df["traveltime"], errors="coerce")
-
-     latest_datetime = df["datetime"].max()
-     df_latest = df[df["datetime"] == latest_datetime]
-
-     agg_df = (
-         df_latest.groupby(["denomination", "datetime"], as_index=False)
-         .agg({
-             "averagevehiclespeed": "mean",
-             "traveltime": "mean",
-             "trafficstatus": "first"
-         })
-         .sort_values(by="trafficstatus", ascending=False)
-         .reset_index(drop=True)  # <-- reset the index here
-     )
-     logger.info(f"✅ Data from {latest_datetime} processed successfully. {len(agg_df)} rows.")
-     return agg_df, latest_datetime
-
- def main():
-     try:
-         data = fetch_trafic_data()
-         agg_df, latest_datetime = process_data(data)
-         logger.info("Preview of the processed data:")
-         logger.info(agg_df.head().to_string(index=False))
-         return agg_df, latest_datetime
-     except Exception as e:
-         logger.error(f"❌ Pipeline failed: {e}")
-         return None, None
-
- if __name__ == "__main__":
-     main()
requirements.txt CHANGED
@@ -1,3 +1,8 @@
  pandas
  pytest
- requests
+ requests
+ apache-airflow-providers-postgres
+ apache-airflow-providers-amazon
+ scikit-learn
+ psycopg[binary]
+ python-dotenv