Spaces:

p2ov
/

streamlit_app

Sleeping

App Files Files Community

p2ov committed on Jul 10, 2025

Commit

6b90ff6

1 Parent(s): 21a79b8

Split en tâches unitaires de etl_main.py

Browse files

Files changed (8) hide show

airflow/dags/etl_main.py +105 -0
airflow/dags/tasks/.env +2 -0
airflow/dags/tasks/__init__.py +0 -0
airflow/dags/tasks/config.py +14 -0
airflow/dags/tasks/fetch_traffic_data.py +70 -0
airflow/dags/tasks/fetch_weather_data.py +63 -0
airflow/dags/tasks/run_model.py +74 -0
airflow/requirements.txt +7 -1

airflow/dags/etl_main.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import logging
+import json
+import pandas as pd
+from datetime import datetime
+from airflow import DAG
+from airflow.operators.dummy_operator import DummyOperator
+from airflow.operators.python_operator import PythonOperator
+from airflow.utils.task_group import TaskGroup
+from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+# Import des fonctions de tâches depuis tasks/
+from tasks.fetch_weather_data import fetch_weather_data
+from tasks.fetch_traffic_data import fetch_traffic_data
+from tasks.run_model import run_model
+from tasks.config import get_config
+# Import des fonctions de tâches depuis tasks/
+from tasks.fetch_weather_data import fetch_weather_data
+# Arguments handed to the DAG below through its ``default_args`` parameter.
+default_args = {
+    "owner": "airflow",
+    "start_date": datetime(2022, 6, 1)
+}
+# Module-level logger used by the merge task; logging is configured at INFO.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+def merge_weather_traffic_data(**context):
+    logger.info("🧪 Fusion des données météo + trafic...")
+    # Connect to our S3 bucket and download the JSON file
+    s3_hook = S3Hook(aws_conn_id="aws_default")
+    s3_bucket = get_config("S3BucketName")
+    #Input Weather values and Traffic status value
+    filename_weather = context["task_instance"].xcom_pull(key="weather_filename")
+    traffic_value = context["task_instance"].xcom_pull(key="traffic_value")
+    logger.info(f"filename_weather : {filename_weather}")
+    logger.info(f"traffic_value : {traffic_value}")
+    # Result JSON file for model as input
+    filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_weather_traffic_data.json"
+    local_weather_path = f"/tmp/{filename_weather}"
+    result_local_path = f"/tmp/{filename}"
+    with open(local_weather_path, "r") as f:
+        weather_data = json.load(f)
+    transformed = {"Pressure": weather_data["main"]["pressure"],"Temperature": weather_data["main"]["temp"],"Wind Speed": weather_data["wind"]["speed"],"Humidity": weather_data["main"]["humidity"],"Traffic Status": traffic_value}
+    df = pd.DataFrame([transformed])
+    df.to_json(result_local_path, orient="records", indent=4)
+    #S3 path in the bucket
+    s3_key = f"datasets/input/{filename}"
+    # Load it to S3
+    s3_hook.load_file(filename=result_local_path, key=s3_key, bucket_name=s3_bucket)
+    context["task_instance"].xcom_push(key="input_data_model", value=transformed)
+    logger.info(f"✅ Données transformées et sauvegardées : {s3_bucket}/{s3_key}")
+with DAG(
+    dag_id="etl_main",
+    default_args=default_args,
+    schedule_interval="@hourly",
+    catchup=False,
+) as dag:
+    start = DummyOperator(task_id="start")
+    with TaskGroup(group_id="weather_branch") as weather_branch:
+        fetch_weather_task = PythonOperator(
+            task_id="fetch_weather_data",
+            python_callable=fetch_weather_data,
+            provide_context=True,
+        )
+    with TaskGroup(group_id="traffic_branch") as traffic_branch:
+        fetch_traffic_task = PythonOperator(
+            task_id="fetch_traffic_data",
+            python_callable=fetch_traffic_data,
+            provide_context=True,
+        )
+    merge_data_task = PythonOperator(
+        task_id="merge_weather_traffic_data",
+        python_callable=merge_weather_traffic_data,
+        provide_context=True,
+    )
+    run_model = PythonOperator(
+        task_id="run_model",
+        python_callable=run_model,
+        provide_context=True,
+    )
+    end = DummyOperator(task_id="end")
+    start >> [weather_branch, traffic_branch] >> merge_data_task >> run_model >> end

airflow/dags/tasks/.env ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ OpenWeatherApiKey=e96e90d4363a4f9e15e4c81d08ac6ed6
2	+ S3BucketName=jedha-quality-air

airflow/dags/tasks/__init__.py ADDED Viewed

File without changes

airflow/dags/tasks/config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import os
+def get_config(key: str) -> str:
+    # Priorité à l'environnement (pour les tests locaux avec .env)
+    val = os.getenv(key)
+    if val:
+        return val
+    # Sinon, on tente Airflow (en prod uniquement)
+    try:
+        from airflow.models import Variable
+        return Variable.get(key)
+    except Exception as e:
+        raise RuntimeError(f"❌ Impossible de récupérer la variable '{key}': {e}")

airflow/dags/tasks/fetch_traffic_data.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import logging
+import pandas as pd
+import requests
+# Module-level logger for the traffic task; logging is configured at INFO.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+def _fetch_trafic_data():
+    url = "https://data.rennesmetropole.fr/api/explore/v2.1/catalog/datasets/etat-du-trafic-en-temps-reel/records"
+    params = {
+        "select": "datetime,denomination,averagevehiclespeed,traveltime,trafficstatus",
+        "where": "averagevehiclespeed > 0 and trafficstatus != 'unknown'",
+        "order_by": "datetime desc",
+        "limit": 100,
+        "timezone": "Europe/Paris"
+    }
+    response = requests.get(url, params=params)
+    response.raise_for_status()
+    logger.info("🚗 Données trafic récupérées avec succès.")
+    return response.json()["results"]
+def _process_traffic_data(data):
+    df = pd.DataFrame(data)
+    df["datetime"] = pd.to_datetime(df["datetime"])
+    df["averagevehiclespeed"] = pd.to_numeric(df["averagevehiclespeed"], errors="coerce")
+    df["traveltime"] = pd.to_numeric(df["traveltime"], errors="coerce")
+    latest_datetime = df["datetime"].max()
+    df_latest = df[df["datetime"] == latest_datetime]
+    agg_df = (
+        df_latest.groupby(["denomination", "datetime"], as_index=False)
+        .agg({
+            "averagevehiclespeed": "mean",
+            "traveltime": "mean",
+            "trafficstatus": "first"
+        })
+        .sort_values(by="trafficstatus", ascending=False)
+        .reset_index(drop=True)
+    )
+    agg_df["trafficstatus_numeric"] = agg_df["trafficstatus"].map({
+        "freeFlow": 0,
+        "heavy": 1,
+        "congested": 2
+    })
+    #mean_status = agg_df["trafficstatus_numeric"].mean()
+    mean_status = round(agg_df["trafficstatus_numeric"].mean(), 2)
+    logger.info(f"📊 Moyenne du trafficstatus : {mean_status}")
+    return mean_status
+def fetch_traffic_data(**context):
+    data = _fetch_trafic_data()
+    traffic_value = _process_traffic_data(data)
+    if context and "task_instance" in context:
+        context["task_instance"].xcom_push(key="traffic_value", value=traffic_value)
+    logger.info(f"✅ Traffic value pushed: {traffic_value}")
+    return traffic_value
+# ✅ Fonction main pour tester localement
+def main():
+    print("▶️ Test local : recuperation du taux de traffic entre 0 et 2, 0 aucun trafic, 2 congestionné")
+    traffic_value = fetch_traffic_data()
+    print(f"✅ Moyenne récupérée : {traffic_value}")
+if __name__ == "__main__":
+    main()

airflow/dags/tasks/fetch_weather_data.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+import json
+import logging
+import requests
+from datetime import datetime
+from dotenv import load_dotenv
+load_dotenv()  # charge automatiquement le fichier .env dans os.environ
+# Charger la configuration (depuis Airflow ou .env)
+from tasks.config import get_config
+#print("✅ .env chargé, clé API :", os.getenv("OpenWeatherApiKey"))
+# Use S3Hook only when it can be imported (i.e. when running inside Airflow);
+# otherwise S3Hook is None and fetch_weather_data skips the upload.
+try:
+    from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+except ImportError:
+    S3Hook = None
+# Module-level logger for the weather task; logging is configured at INFO.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+def fetch_weather_data(**context):
+    logger.info("📡 Fetching weather data from OpenWeatherMap")
+    api_key = get_config("OpenWeatherApiKey")
+    s3_bucket = get_config("S3BucketName")
+    url = f"https://api.openweathermap.org/data/2.5/weather?q=Paris&appid={api_key}&units=metric"
+    response = requests.get(url)
+    response.raise_for_status()
+    filename = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_openweather_data.json"
+    local_path = f"/tmp/{filename}"
+    s3_key = f"datasets/input/meteo/{filename}"
+    with open(local_path, "w") as f:
+        json.dump(response.json(), f)
+    if S3Hook is not None:
+        s3_hook = S3Hook(aws_conn_id="aws_default")
+        s3_hook.load_file(filename=local_path, key=s3_key, bucket_name=s3_bucket, replace=True)
+        if context and "task_instance" in context:
+            context["task_instance"].xcom_push(key="weather_filename", value=filename)
+        logger.info(f"✅ Uploaded to S3: {s3_key}")
+    else:
+        logger.info("ℹ️ S3Hook non disponible (hors Airflow). Fichier seulement écrit en local.")
+    return filename
+# ✅ Fonction main pour tester localement
+def main():
+    print("▶️ Test local : récupération météo")
+    filename = fetch_weather_data()
+    print(f"✅ Fichier météo généré : /tmp/{filename}")
+if __name__ == "__main__":
+    main()

airflow/dags/tasks/run_model.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import os
+import logging
+import json
+import pandas as pd
+import requests
+from datetime import datetime
+import pickle
+from sklearn.linear_model import LinearRegression
+from dotenv import load_dotenv
+load_dotenv()  # charge automatiquement le fichier .env dans os.environ
+# Charger la configuration (depuis Airflow ou .env)
+from tasks.config import get_config
+# Use S3Hook only when it can be imported (i.e. when running inside Airflow);
+# otherwise S3Hook is None and run_model takes its local branch.
+try:
+    from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+except ImportError:
+    S3Hook = None
+# Module-level logger for the model task; logging is configured at INFO.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+def run_model(**context):
+    logger.info("📡 Launch model on data")
+    model_filename = "linear_model_2025_07_10_16_28_59.pkl"
+    #Je suis dans Airflow
+    if S3Hook is not None and context is not None:
+        s3_bucket = get_config("S3BucketName")
+        s3_path = 'models/'
+        # Connect to our S3 bucket and download the model file
+        s3_hook = S3Hook(aws_conn_id="aws_default")
+        filename_model = s3_hook.download_file(s3_path+model_filename, bucket_name=s3_bucket, local_path="/tmp")
+        #Input Weather values and Traffic status value
+        input_data_model = context["task_instance"].xcom_pull(key="input_data_model")
+        #input_data_model = {"Pressure": 1021, "Temperature": 22.73, "Wind Speed": 4.12, "Humidity": 59, "Traffic Status": 0.23}
+        print(type(input_data_model))
+        logger.info(f"✅ input data model : {input_data_model}")
+    else:
+        #recuperer le model dans git ou ailleurs
+        logger.info(f"✅ Model local used en local: {model_filename}")
+        filename_model = '../data/'+model_filename
+        # test the model with random values
+        input_data_model = {
+            "Pressure": 999,
+            "Temperature": 22,
+            "Wind Speed": 10,
+            "Humidity": 50,
+            "Traffic Status": 0,
+        }
+    with open(filename_model, "rb") as f:
+        model = pickle.load(f)
+    raw_predictions = model.predict(pd.DataFrame([input_data_model]))
+    # clip the predictions to be above 0
+    clipped_predictions = raw_predictions.clip(0, 1e6).tolist()
+    logger.info(f"✅ Clipped Predictions : {clipped_predictions}")
+    return clipped_predictions
+# ✅ Fonction main pour tester localement (verifier que le fichier model est présent)
+def main():
+    print("▶️ Test local du modele de prediction")
+    result = run_model()
+    print(f"✅ Clipped Predictions : {result}")
+if __name__ == "__main__":
+    main()

airflow/requirements.txt CHANGED Viewed

	@@ -1 +1,7 @@
1	- ~~pandas~~

+apache-airflow-providers-postgres
+apache-airflow-providers-amazon
+scikit-learn
+psycopg[binary]
+pandas
+python-dotenv
+pytest