Spaces:

p2ov
/

streamlit_app

Sleeping

App Files Files Community

p2ov committed on Jul 10, 2025

Commit

21a79b8

1 Parent(s): 11d74b7

DAG sans modele et à séparer

Browse files

Files changed (1) hide show

airflow/dags/quality_air_etl.py +107 -61

airflow/dags/quality_air_etl.py CHANGED Viewed

@@ -9,6 +9,7 @@ Also set the connection for the Postgres database and the AWS account.
 import json
 import logging
 from datetime import datetime
 import pandas as pd
 import requests
@@ -21,18 +22,24 @@ from airflow.providers.postgres.operators.postgres import PostgresOperator
 from s3_to_postgres import S3ToPostgresOperator
 from airflow.utils.task_group import TaskGroup
 default_args = {
     "owner": "airflow",
     "start_date": datetime(2022, 6, 1),
 }
 def _fetch_weather_data(**context):
-    """Fetches data from WeatherBit API and save it to S3.
     """
-    logging.info(f"Fetching weather data")
     # Get the API key from the Variables
     api_key = Variable.get("OpenWeatherApiKey")
-    # Fetch WeatherBit
     full_url = f"https://api.openweathermap.org/data/2.5/weather?q=Paris&appid={api_key}&units=metric"
     response = requests.get(full_url)
     # We create a filename like: 20220601-123000_weather_data.json
@@ -52,90 +59,129 @@ def _fetch_weather_data(**context):
     context["task_instance"].xcom_push(key="weather_filename", value=filename)
     logging.info(f"Saved weather data to {filename}")
-def _transform_weather_data(**context):
     """Transforms raw data from JSON file to ingestable data for Postgres.
     """
     # We get the filename from the context
-    filename = context["task_instance"].xcom_pull(key="weather_filename")
     # Connect to our S3 bucket and download the JSON file
     s3_hook = S3Hook(aws_conn_id="aws_default")
     s3_path = 'datasets/input/meteo/'
-    returned_filename = s3_hook.download_file(s3_path+filename, bucket_name=Variable.get("S3BucketName"), local_path="/tmp")
-    with open(returned_filename, "r") as f:
-        raw_data_json = json.load(f)
-    # We transform the data into a pandas DataFrame
-    #raw_data_json = raw_data_json[0]
     transformed_data = {
-        "name" : raw_data_json["name"],
-        "coord_lon": raw_data_json["coord"]["lon"],
-        "coord_lat": raw_data_json["coord"]["lat"],
-        "weather_main" : raw_data_json["weather"][0]["main"],
-        "weather_description" : raw_data_json["weather"][0]["description"],
-        "main_temp" : raw_data_json["main"]["temp"],
-        "main_pressure" : raw_data_json["main"]["pressure"],
-        "main_humidity" : raw_data_json["main"]["humidity"],
-        "wind_speed" : raw_data_json["wind"]["speed"],
-        "wind_deg" : raw_data_json["wind"]["deg"]
     }
     df = pd.DataFrame(transformed_data, index=[0])
     # Keep the same filename between the JSON file and the CSV
     csv_filename = filename.split(".")[0] + ".csv"
     csv_filename_full_path = f"/tmp/{csv_filename}"
-    s3_csv_key = 'datasets/input/meteo/'+ csv_filename
     # Save it temporarily in /tmp folder
     df.to_csv(csv_filename_full_path, index=False, header=False)
     # Load it to S3
     s3_hook.load_file(filename=csv_filename_full_path, key=s3_csv_key, bucket_name=Variable.get("S3BucketName"))
     # Push the filename to the context so that we can use it later
-    context["task_instance"].xcom_push(key="weather_csv_filename", value=s3_csv_key)
 with DAG(dag_id="quality_air_etl_dag", default_args=default_args, schedule_interval="@hourly", catchup=False) as dag:
     start = DummyOperator(task_id="start")
     with TaskGroup(group_id="weather_branch") as weather_branch:
         fetch_weather_data = PythonOperator(task_id="fetch_weather_data", python_callable=_fetch_weather_data)
-        transform_weather_data = PythonOperator(
-            task_id="transform_weather_data",
-            python_callable=_transform_weather_data
-        )
-        create_weather_table = PostgresOperator(
-            task_id="create_weather_table",
-            # In the SQL do not forget to put `IF NOT EXISTS`
-            sql="""
-            CREATE TABLE IF NOT EXISTS weather_data (
-                id SERIAL PRIMARY KEY,
-                observation_time TIMESTAMP,
-                name VARCHAR,
-                weather_main VARCHAR,
-                weather_description VARCHAR,
-                coord_lon DECIMAL(5, 2),
-                coord_lat DECIMAL(5, 2),
-                main_temp DECIMAL(5, 2),
-                main_humidity DECIMAL(5, 2),
-                main_pressure DECIMAL(5, 2),
-                wind_speed DECIMAL(5, 2),
-                wind_deg DECIMAL(5, 2)
-            )
-            """,
-            postgres_conn_id="postgres_default",
-        )
-        transfer_weather_data_to_postgres = S3ToPostgresOperator(
-            task_id="transfer_weather_data_to_postgres",
-            table="weather_data",
-            bucket="{{ var.value.S3BucketName }}",
-            key="{{ task_instance.xcom_pull(key='weather_csv_filename') }}",
-            postgres_conn_id="postgres_default",
-            aws_conn_id="aws_default",
-        )
-        fetch_weather_data >> transform_weather_data >> create_weather_table >> transfer_weather_data_to_postgres
     end = DummyOperator(task_id="end")
-    start >> [weather_branch] >> end

 import json
 import logging
 from datetime import datetime
+from zoneinfo import ZoneInfo
 import pandas as pd
 import requests
 from s3_to_postgres import S3ToPostgresOperator
 from airflow.utils.task_group import TaskGroup
 default_args = {
     "owner": "airflow",
     "start_date": datetime(2022, 6, 1),
 }
+# Configure the logger at module level.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# NOTE(review): computed once, when this module is imported, and not referenced
+# in the code visible here -- confirm it is still needed.
+paris_time = datetime.now(ZoneInfo("Europe/Paris"))
 def _fetch_weather_data(**context):
+    """Fetches data from OpenWeatherMap API and save it to S3.
     """
+    logging.info(f"Fetching Weather data")
     # Get the API key from the Variables
     api_key = Variable.get("OpenWeatherApiKey")
+    # Fetch OpenWeatherMap
     full_url = f"https://api.openweathermap.org/data/2.5/weather?q=Paris&appid={api_key}&units=metric"
     response = requests.get(full_url)
     # We create a filename like: 20220601-123000_weather_data.json
     context["task_instance"].xcom_push(key="weather_filename", value=filename)
     logging.info(f"Saved weather data to {filename}")
+def _fetch_trafic_data():
+    """Fetch real-time traffic records from the Rennes Métropole open-data API.
+
+    Sends a GET request to the ``etat-du-trafic-en-temps-reel`` dataset
+    endpoint, asking for at most 100 records ordered by ``datetime``
+    descending and restricted to rows where ``averagevehiclespeed > 0`` and
+    ``trafficstatus != 'unknown'``.
+
+    Returns:
+        The value stored under the ``"results"`` key of the JSON response.
+
+    Raises:
+        Exception: any error raised by the request, the HTTP status check or
+            the JSON decoding/lookup is logged and then re-raised unchanged.
+    """
+    url = "https://data.rennesmetropole.fr/api/explore/v2.1/catalog/datasets/etat-du-trafic-en-temps-reel/records"
+    # Query parameters: selected columns, row filter, ordering, page size and
+    # the timezone requested from the API.
+    params = {
+        "select": "datetime,denomination,averagevehiclespeed,traveltime,trafficstatus",
+        "where": "averagevehiclespeed > 0 and trafficstatus != 'unknown'",
+        "order_by": "datetime desc",
+        "limit": 100,
+        "timezone": "Europe/Paris"
+    }
+    try:
+        # NOTE(review): no `timeout` is passed, so a stalled connection could
+        # block the task indefinitely -- consider adding one.
+        response = requests.get(url, params=params)
+        # Turn HTTP 4xx/5xx responses into exceptions.
+        response.raise_for_status()
+        logger.info("✅ Données récupérées avec succès depuis l'API Rennes Métropole.")
+        return response.json()["results"]
+    except Exception as e:
+        # Log the failure, then propagate it so the caller sees the error.
+        logger.error(f"❌ Erreur lors de la récupération des données : {e}")
+        raise
+def _process_traffic_data(data):
+    """Reduce raw traffic records to a single mean traffic-status score.
+
+    Only the records carrying the most recent ``datetime`` are kept. They are
+    grouped by ``denomination`` and ``datetime`` (mean speed, mean travel
+    time, first ``trafficstatus``), the status label is mapped to a number
+    (freeFlow=0, heavy=1, congested=2) and the mean of those numbers is
+    logged and returned.
+
+    Args:
+        data: records accepted by ``pd.DataFrame``; the code reads the
+            ``datetime``, ``denomination``, ``averagevehiclespeed``,
+            ``traveltime`` and ``trafficstatus`` columns.
+
+    Returns:
+        The mean of the numeric traffic-status column.
+    """
+    # NOTE(review): an empty `data` presumably produces a frame without a
+    # "datetime" column, so the next line would fail -- confirm the caller
+    # always supplies at least one record.
+    df = pd.DataFrame(data)
+    df["datetime"] = pd.to_datetime(df["datetime"])
+    # errors="coerce": values that cannot be parsed as numbers become NaN.
+    df["averagevehiclespeed"] = pd.to_numeric(df["averagevehiclespeed"], errors="coerce")
+    df["traveltime"] = pd.to_numeric(df["traveltime"], errors="coerce")
+    # Keep only the rows of the most recent timestamp.
+    latest_datetime = df["datetime"].max()
+    df_latest = df[df["datetime"] == latest_datetime]
+    # One row per (denomination, datetime), sorted by status label descending.
+    agg_df = (
+        df_latest.groupby(["denomination", "datetime"], as_index=False)
+        .agg({
+            "averagevehiclespeed": "mean",
+            "traveltime": "mean",
+            "trafficstatus": "first"
+        })
+        .sort_values(by="trafficstatus", ascending=False)
+        .reset_index(drop=True)  # <-- reset the index here
+    )
+    # Replace the textual trafficstatus values with numeric ones.
+    # NOTE(review): labels outside this mapping presumably become NaN and are
+    # then ignored by the mean -- confirm that is intended.
+    agg_df["trafficstatus_numeric"] = agg_df["trafficstatus"].map({"heavy": 1, "freeFlow":0, "congested":2})
+    # Compute the mean; it is logged below and returned to the caller.
+    mean_trafficstatus = agg_df["trafficstatus_numeric"].mean()
+    logger.info(f"📊 Moyenne du trafficstatus (freeFlow=0,heavy=1,congested=2): {mean_trafficstatus:.2f}")
+    return mean_trafficstatus
+def _get_traffic_data(**context):
+    """Fetch the traffic records, reduce them to one value and push it to XCom.
+
+    Args:
+        **context: task context; ``context["task_instance"]`` is used to push
+            the result under the XCom key ``traffic_value``.
+    """
+    data = _fetch_trafic_data()
+    traffic_value = _process_traffic_data(data)
+    # Make the value available to later tasks in the same DAG run.
+    context["task_instance"].xcom_push(key="traffic_value", value=traffic_value)
+    logging.info(f"Saved traffic value : {traffic_value}")
+def _transform_weather_traffic_data(**context):
     """Transforms raw data from JSON file to ingestable data for Postgres.
     """
+    # We create a filename like: 20220601-123000_weather_traffic_data.json
+    # NOTE(review): datetime.now() is called without a timezone here, so the
+    # timestamp follows the worker's local clock -- confirm that is intended.
+    filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_weather_traffic_data.json"
     # We get the filename from the context
+    filename_weather = context["task_instance"].xcom_pull(key="weather_filename")
     # Connect to our S3 bucket and download the JSON file
     s3_hook = S3Hook(aws_conn_id="aws_default")
     s3_path = 'datasets/input/meteo/'
+    returned_filename_weather = s3_hook.download_file(s3_path+filename_weather, bucket_name=Variable.get("S3BucketName"), local_path="/tmp")
+    with open(returned_filename_weather, "r") as f_weather:
+        raw_data_json_weather = json.load(f_weather)
+    # The CSV row (written without a header) holds, in order:
+    # main_temp, main_pressure, main_humidity, wind_speed, traffic
+    # Traffic value: the number pulled from the "traffic_value" XCom key
+    # (expected to lie between 0 and 2).
     transformed_data = {
+        "main_temp" : raw_data_json_weather["main"]["temp"],
+        "main_pressure" : raw_data_json_weather["main"]["pressure"],
+        "main_humidity" : raw_data_json_weather["main"]["humidity"],
+        "wind_speed" : raw_data_json_weather["wind"]["speed"],
+        "traffic" : context["task_instance"].xcom_pull(key="traffic_value")
     }
     df = pd.DataFrame(transformed_data, index=[0])
     # Keep the same filename between the JSON file and the CSV
     csv_filename = filename.split(".")[0] + ".csv"
     csv_filename_full_path = f"/tmp/{csv_filename}"
+    # The combined CSV is stored directly under datasets/input/.
+    s3_csv_key = 'datasets/input/'+ csv_filename
     # Save it temporarily in /tmp folder
     df.to_csv(csv_filename_full_path, index=False, header=False)
     # Load it to S3
     s3_hook.load_file(filename=csv_filename_full_path, key=s3_csv_key, bucket_name=Variable.get("S3BucketName"))
     # Push the filename to the context so that we can use it later
+    context["task_instance"].xcom_push(key="input_data_csv_filename", value=s3_csv_key)
 with DAG(dag_id="quality_air_etl_dag", default_args=default_args, schedule_interval="@hourly", catchup=False) as dag:
     start = DummyOperator(task_id="start")
+    # Weather branch: a single task that runs _fetch_weather_data.
     with TaskGroup(group_id="weather_branch") as weather_branch:
         fetch_weather_data = PythonOperator(task_id="fetch_weather_data", python_callable=_fetch_weather_data)
+        # NOTE(review): the bare name below is an expression statement with no effect.
+        fetch_weather_data
+    # Traffic branch: a single task that runs _get_traffic_data.
+    with TaskGroup(group_id="traffic_branch") as traffic_branch:
+        fetch_traffic_data = PythonOperator(task_id="fetch_traffic_data", python_callable=_get_traffic_data)
+        # NOTE(review): the bare name below is an expression statement with no effect.
+        fetch_traffic_data
+    # ML branch: two placeholder tasks chained one after the other.
+    with TaskGroup(group_id="ml_branch") as ml_branch:
+        get_input_meteo_traffic_csv = DummyOperator(task_id="get_input_meteo_traffic_csv")
+        pull_run_model = DummyOperator(task_id="pull_run_model")
+        get_input_meteo_traffic_csv >> pull_run_model
+    # Task that runs _transform_weather_traffic_data.
+    transform_weather_traffic_data = PythonOperator(
+       task_id="transform_weather_traffic_data", python_callable=_transform_weather_traffic_data
+    )
     end = DummyOperator(task_id="end")
+    # start fans out to both fetch branches; the transform task runs after
+    # them, followed by the ML branch and then end.
+    start >> [weather_branch, traffic_branch] >> transform_weather_traffic_data >> ml_branch >> end