Spaces:

p2ov
/

streamlit_app

Sleeping

App Files Community

Alquilar78 committed on Jul 11, 2025

Commit

b7ff5e0

1 Parent(s): 9b10f02

Cleaning dossier

Browse files

Files changed (4) hide show

airflow/dags/etl_main.py +2 -2
airflow/dags/quality_air_etl.py +0 -187
airflow/dags/tasks/fetch_weather_data.py +7 -2
airflow/dags/test.py +0 -6

airflow/dags/etl_main.py CHANGED Viewed

@@ -44,8 +44,8 @@ def merge_weather_traffic_data(**context):
     filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_weather_traffic_data.json"
-    local_weather_path = f"/tmp/{filename_weather}"
-    result_local_path = f"/tmp/{filename}"
+    local_weather_path = f"/opt/airflow/data/{filename_weather}"
+    result_local_path = f"/opt/airflow/data/{filename}"
     with open(local_weather_path, "r") as f:
         weather_data = json.load(f)

airflow/dags/quality_air_etl.py DELETED Viewed

@@ -1,187 +0,0 @@
-"""
-To use this DAG, you need to set some variables within the Airflow UI:
-- `WeatherBitApiKey`: the API key to use to access the WeatherBit API.
-- `S3BucketName`: the name of the S3 bucket where the data will be stored.
-Also set the connection for the Postgres database and the AWS account.
-"""
-import json
-import logging
-from datetime import datetime
-from zoneinfo import ZoneInfo
-import pandas as pd
-import requests
-from airflow import DAG
-from airflow.hooks.S3_hook import S3Hook
-from airflow.models import Variable
-from airflow.operators.dummy_operator import DummyOperator
-from airflow.operators.python_operator import PythonOperator
-from airflow.providers.postgres.operators.postgres import PostgresOperator
-from s3_to_postgres import S3ToPostgresOperator
-from airflow.utils.task_group import TaskGroup
-default_args = {
-    "owner": "airflow",
-    "start_date": datetime(2022, 6, 1),
-}
-# Configure le logger au niveau du module
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-paris_time = datetime.now(ZoneInfo("Europe/Paris"))
-def _fetch_weather_data(**context):
-    """Fetches data from OpenWeatherMap API and save it to S3.
-    """
-    logging.info(f"Fetching Weather data")
-    # Get the API key from the Variables
-    api_key = Variable.get("OpenWeatherApiKey")
-    # Fetch OpenWeatherMap
-    full_url = f"https://api.openweathermap.org/data/2.5/weather?q=Paris&appid={api_key}&units=metric"
-    response = requests.get(full_url)
-    # We create a filename like: 20220601-123000_weather_data.json
-    filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_openweather_data.json"
-    # Let's temprorary save this file into /tmp folder
-    full_path_to_file = f"/tmp/{filename}"
-    s3_path = 'datasets/input/meteo/'
-    with open(full_path_to_file, "w") as f:
-        json.dump(response.json(), f)
-    # Connect to our S3 bucket and load the file
-    # filename is the path to our file and key is the full path inside the
-    # bucket
-    s3_hook = S3Hook(aws_conn_id="aws_default")
-    s3_hook.load_file(filename=full_path_to_file, key=s3_path+filename, bucket_name=Variable.get("S3BucketName"))
-    # Let's push the filename to the context so that we can use it later
-    context["task_instance"].xcom_push(key="weather_filename", value=filename)
-    logging.info(f"Saved weather data to {filename}")
-def _fetch_trafic_data():
-    """Récupère les données de trafic de Rennes Métropole"""
-    url = "https://data.rennesmetropole.fr/api/explore/v2.1/catalog/datasets/etat-du-trafic-en-temps-reel/records"
-    params = {
-        "select": "datetime,denomination,averagevehiclespeed,traveltime,trafficstatus",
-        "where": "averagevehiclespeed > 0 and trafficstatus != 'unknown'",
-        "order_by": "datetime desc",
-        "limit": 100,
-        "timezone": "Europe/Paris"
-    }
-    try:
-        response = requests.get(url, params=params)
-        response.raise_for_status()
-        logger.info("✅ Données récupérées avec succès depuis l'API Rennes Métropole.")
-        return response.json()["results"]
-    except Exception as e:
-        logger.error(f"❌ Erreur lors de la récupération des données : {e}")
-        raise
-def _process_traffic_data(data):
-    """Nettoie les données sans les agréger"""
-    df = pd.DataFrame(data)
-    df["datetime"] = pd.to_datetime(df["datetime"])
-    df["averagevehiclespeed"] = pd.to_numeric(df["averagevehiclespeed"], errors="coerce")
-    df["traveltime"] = pd.to_numeric(df["traveltime"], errors="coerce")
-    latest_datetime = df["datetime"].max()
-    df_latest = df[df["datetime"] == latest_datetime]
-    agg_df = (
-        df_latest.groupby(["denomination", "datetime"], as_index=False)
-        .agg({
-            "averagevehiclespeed": "mean",
-            "traveltime": "mean",
-            "trafficstatus": "first"
-        })
-        .sort_values(by="trafficstatus", ascending=False)
-        .reset_index(drop=True)  # <-- reset index ici
-    )
-    # Remplacer les valeurs textuelles de trafficstatus par des valeurs numériques
-    agg_df["trafficstatus_numeric"] = agg_df["trafficstatus"].map({"heavy": 1, "freeFlow":0, "congested":2})
-    # Calculer la moyenne (si tu veux l'afficher ou l'utiliser plus tard)
-    mean_trafficstatus = agg_df["trafficstatus_numeric"].mean()
-    logger.info(f"📊 Moyenne du trafficstatus (freeFlow=0,heavy=1,congested=2): {mean_trafficstatus:.2f}")
-    return mean_trafficstatus
-def _get_traffic_data(**context):
-    data = _fetch_trafic_data()
-    traffic_value = _process_traffic_data(data)
-    context["task_instance"].xcom_push(key="traffic_value", value=traffic_value)
-    logging.info(f"Saved traffic value : {traffic_value}")
-def _transform_weather_traffic_data(**context):
-    """Transforms raw data from JSON file to ingestable data for Postgres.
-    """
-    # We create a filename like: 20220601-123000_weather_traffic_data.json
-    filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_weather_traffic_data.json"
-    # We get the filename from the context
-    filename_weather = context["task_instance"].xcom_pull(key="weather_filename")
-    # Connect to our S3 bucket and download the JSON file
-    s3_hook = S3Hook(aws_conn_id="aws_default")
-    s3_path = 'datasets/input/meteo/'
-    returned_filename_weather = s3_hook.download_file(s3_path+filename_weather, bucket_name=Variable.get("S3BucketName"), local_path="/tmp")
-    with open(returned_filename_weather, "r") as f_weather:
-        raw_data_json_weather = json.load(f_weather)
-    #Weather CSV like : Paris,Clear,clear sky,2.3488,48.8534,25.51,1023,43,3.6,360
-    #Traffic value : Between 0 and 2
-    transformed_data = {
-        "main_temp" : raw_data_json_weather["main"]["temp"],
-        "main_pressure" : raw_data_json_weather["main"]["pressure"],
-        "main_humidity" : raw_data_json_weather["main"]["humidity"],
-        "wind_speed" : raw_data_json_weather["wind"]["speed"],
-        "traffic" : context["task_instance"].xcom_pull(key="traffic_value")
-    }
-    df = pd.DataFrame(transformed_data, index=[0])
-    # Keep the same filename between the JSON file and the CSV
-    csv_filename = filename.split(".")[0] + ".csv"
-    csv_filename_full_path = f"/tmp/{csv_filename}"
-    s3_csv_key = 'datasets/input/'+ csv_filename
-    # Save it temporarily in /tmp folder
-    df.to_csv(csv_filename_full_path, index=False, header=False)
-    # Load it to S3
-    s3_hook.load_file(filename=csv_filename_full_path, key=s3_csv_key, bucket_name=Variable.get("S3BucketName"))
-    # Push the filename to the context so that we can use it later
-    context["task_instance"].xcom_push(key="input_data_csv_filename", value=s3_csv_key)
-with DAG(dag_id="quality_air_etl_dag", default_args=default_args, schedule_interval="@hourly", catchup=False) as dag:
-    start = DummyOperator(task_id="start")
-    with TaskGroup(group_id="weather_branch") as weather_branch:
-        fetch_weather_data = PythonOperator(task_id="fetch_weather_data", python_callable=_fetch_weather_data)
-        fetch_weather_data
-    with TaskGroup(group_id="traffic_branch") as traffic_branch:
-        fetch_traffic_data = PythonOperator(task_id="fetch_traffic_data", python_callable=_get_traffic_data)
-        fetch_traffic_data
-    with TaskGroup(group_id="ml_branch") as ml_branch:
-        get_input_meteo_traffic_csv = DummyOperator(task_id="get_input_meteo_traffic_csv")
-        pull_run_model = DummyOperator(task_id="pull_run_model")
-        get_input_meteo_traffic_csv >> pull_run_model
-    transform_weather_traffic_data = PythonOperator(
-       task_id="transform_weather_traffic_data", python_callable=_transform_weather_traffic_data
-    )
-    end = DummyOperator(task_id="end")
-    start >> [weather_branch, traffic_branch] >> transform_weather_traffic_data >> ml_branch >> end

airflow/dags/tasks/fetch_weather_data.py CHANGED Viewed

@@ -34,7 +34,12 @@ def fetch_weather_data(**context):
     response.raise_for_status()
     filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_openweather_data.json"
-    local_path = f"/tmp/{filename}"
+    if context:
+        local_path = f"/opt/airflow/data/{filename}"
+    else:
+        local_path = f"../data/{filename}"
     s3_key = f"datasets/input/meteo/{filename}"
     with open(local_path, "w") as f:
@@ -56,7 +61,7 @@ def fetch_weather_data(**context):
 def main():
     print("▶️ Test local : récupération météo")
     filename = fetch_weather_data()
-    print(f"✅ Fichier météo généré : /tmp/{filename}")
+    print(f"✅ Fichier météo généré : {filename}")
 if __name__ == "__main__":

airflow/dags/test.py DELETED Viewed

@@ -1,6 +0,0 @@
-from airflow import DAG
-from datetime import datetime
-import pandas as pd
-with DAG("crypto_dag", start_date=datetime(2022, 1, 1), schedule_interval="@hourly", catchup=False) as dag:
-    pass