SlimG committed on
Commit
5ef0c38
·
1 Parent(s): 863ede9

rework the data quality check

Browse files
Files changed (3) hide show
  1. .env.example +0 -1
  2. src/main.py +16 -20
  3. src/service/data_quality.py +58 -24
.env.example CHANGED
@@ -11,4 +11,3 @@ AWS_SECRET_ACCESS_KEY=
11
  # Data quality
12
  EVIDENTLY_API_KEY=
13
  EVIDENTLY_PROJECT_ID=
14
- EVIDENTLY_REF_DATASET_ID=
 
11
  # Data quality
12
  EVIDENTLY_API_KEY=
13
  EVIDENTLY_PROJECT_ID=
 
src/main.py CHANGED
@@ -23,8 +23,7 @@ from dotenv import load_dotenv
23
  from mlflow.exceptions import RestException
24
 
25
  from src.entity.model import ModelInput, ModelOutput
26
- from src.repository.model_data import load_model_data
27
- from src.service.data_quality import DataChecker
28
  from src.service.model import (
29
  run_experiment,
30
  predict,
@@ -172,10 +171,8 @@ async def deploy_model_to_production(
172
  logger.error(e)
173
 
174
  # Return HTTP error 404
175
- return HTTPException(
176
- status_code=HTTP_404_NOT_FOUND,
177
- detail=f"Model {model_name} (version {version}) not found"
178
- )
179
 
180
  return {"message": f"Model {model_name} deployed to production"}
181
 
@@ -191,34 +188,33 @@ async def undeploy_model_from_production(model_name: str = Query(description="Th
191
  logger.error(e)
192
 
193
  # Return HTTP error 404
194
- return HTTPException(
195
- status_code=HTTP_404_NOT_FOUND,
196
- detail=f"Model {model_name} not found or not in production"
197
- )
198
 
199
  return {"message": f"Model {model_name} undeployed from production"}
200
 
201
  @app.get("/check_data_quality", tags=["data"], description="Check the data quality")
202
- async def check_data_quality(background_tasks: BackgroundTasks):
 
 
 
 
203
  """
204
  Check the data quality
205
  """
206
  # Get the API key and project ID from the environment variables
207
  api_key = os.getenv("EVIDENTLY_API_KEY")
208
- project_id = os.getenv("EVIDENTLY_PROJECT_ID")
209
- ref_dataset_id = os.getenv("EVIDENTLY_REF_DATASET_ID")
210
 
211
  # Check if the API key and project ID are set
212
- if not api_key or not project_id or not ref_dataset_id:
213
- return JSONResponse(content={"status": "unhealthy", "detail": "Evidently API key or project ID not set"},
214
  status_code=HTTP_503_SERVICE_UNAVAILABLE)
215
 
216
- # Get the newest data from the database
217
- df = load_model_data()
218
-
219
  # Schedule the data quality check
220
- background_tasks.add_task(func=DataChecker(api_key, project_id, ref_dataset_id).check_data,
221
- df=df)
 
222
 
223
  return {"message": "Data quality check scheduled"}
224
 
 
23
  from mlflow.exceptions import RestException
24
 
25
  from src.entity.model import ModelInput, ModelOutput
26
+ from src.service.data_quality import DataChecker, check_model_data
 
27
  from src.service.model import (
28
  run_experiment,
29
  predict,
 
171
  logger.error(e)
172
 
173
  # Return HTTP error 404
174
+ return JSONResponse(content={"message": f"Model {model_name} (version {version}) not found"},
175
+ status_code=HTTP_404_NOT_FOUND)
 
 
176
 
177
  return {"message": f"Model {model_name} deployed to production"}
178
 
 
188
  logger.error(e)
189
 
190
  # Return HTTP error 404
191
+ return JSONResponse(content={"message": f"Model {model_name} not found or not in production"},
192
+ status_code=HTTP_404_NOT_FOUND)
 
 
193
 
194
  return {"message": f"Model {model_name} undeployed from production"}
195
 
196
  @app.get("/check_data_quality", tags=["data"], description="Check the data quality")
197
+ async def check_data_quality(
198
+ background_tasks: BackgroundTasks,
199
+ model_name: str = Query(description="The name of the model to check"),
200
+ project_id: Optional[str] = Query(default=None, description="The ID of the project to send the data quality report to"),
201
+ ):
202
  """
203
  Check the data quality
204
  """
205
  # Get the API key and project ID from the environment variables
206
  api_key = os.getenv("EVIDENTLY_API_KEY")
207
+ project_id = project_id or os.getenv("EVIDENTLY_PROJECT_ID")
 
208
 
209
  # Check if the API key and project ID are set
210
+ if not api_key or not project_id:
211
+ return JSONResponse(content={"message": "Evidently API key or project ID not set"},
212
  status_code=HTTP_503_SERVICE_UNAVAILABLE)
213
 
 
 
 
214
  # Schedule the data quality check
215
+ background_tasks.add_task(func=check_model_data,
216
+ model_name=model_name,
217
+ checker=DataChecker(api_key, project_id))
218
 
219
  return {"message": "Data quality check scheduled"}
220
 
src/service/data_quality.py CHANGED
@@ -1,14 +1,39 @@
1
  import pandas as pd
2
- from typing import Optional
3
  from evidently import Dataset, DataDefinition, Report
4
  from evidently.presets import DataSummaryPreset, DataDriftPreset
5
  from evidently.ui.workspace import CloudWorkspace
 
 
 
 
 
 
6
 
7
  class DataChecker:
8
- def __init__(self, api_key: str, project_id: str, ref_dataset_id: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  self._api_key = api_key
10
  self._project_id = project_id
11
- self._ref_dataset_id = ref_dataset_id
12
 
13
  self.workspace = CloudWorkspace(
14
  token=api_key,
@@ -23,26 +48,14 @@ class DataChecker:
23
  if not self.project:
24
  raise ValueError("Project not found. Please check your project ID.")
25
 
26
- self.ref_dataset = self.workspace.load_dataset(ref_dataset_id)
27
-
28
- if not self.ref_dataset:
29
- raise ValueError("Reference dataset not found. Please check your reference dataset ID.")
30
-
31
- def check_data(self, df: pd.DataFrame, schema: Optional[DataDefinition] = None) -> str:
32
- if not schema:
33
- schema = DataDefinition(
34
- id_column="match_id",
35
- datetime_columns=["date"],
36
- numerical_columns=["winner_rank", "loser_rank", "winner_points", "loser_points", "w_height_cm", "w_weight_kg",
37
- "w_year_of_birth", "w_pro_year", "l_height_cm", "l_weight_kg", "l_year_of_birth", "l_pro_year"],
38
- categorical_columns=["tournament_name", "tournament_series", "tournament_surface", "tournament_court",
39
- "tournament_location", "winner_name", "w_first_name", "w_last_name", "w_play_hand",
40
- "w_back_hand", "loser_name", "l_first_name", "l_last_name", "l_play_hand", "l_back_hand"],
41
- )
42
-
43
  eval_data = Dataset.from_pandas(
44
  data=df,
45
- data_definition=schema
 
 
 
 
46
  )
47
 
48
  report = Report(
@@ -54,8 +67,29 @@ class DataChecker:
54
  )
55
 
56
  # Run the report
57
- my_eval = report.run(eval_data, self.ref_dataset)
 
58
 
59
  # Save the evaluation to the workspace
60
- return self.workspace.add_run(self.project.id, my_eval, include_data=False)
61
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
+ from typing import Dict, Optional, List
3
  from evidently import Dataset, DataDefinition, Report
4
  from evidently.presets import DataSummaryPreset, DataDriftPreset
5
  from evidently.ui.workspace import CloudWorkspace
6
+ from src.repository.model_data import load_model_data
7
+ from src.service.model import get_training_dataset
8
+
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
 
13
  class DataChecker:
14
+ schemas: Dict[str, DataDefinition] = {
15
+ "raw": DataDefinition(
16
+ id_column="match_id",
17
+ datetime_columns=["date"],
18
+ numerical_columns=["winner_rank", "loser_rank", "winner_points", "loser_points", "w_height_cm", "w_weight_kg",
19
+ "w_year_of_birth", "w_pro_year", "l_height_cm", "l_weight_kg", "l_year_of_birth", "l_pro_year"],
20
+ categorical_columns=["tournament_name", "tournament_series", "tournament_surface", "tournament_court",
21
+ "tournament_location", "winner_name", "w_first_name", "w_last_name", "w_play_hand",
22
+ "w_back_hand", "loser_name", "l_first_name", "l_last_name", "l_play_hand", "l_back_hand"],
23
+ ),
24
+ "cleaned": DataDefinition(
25
+ numerical_columns=["diff_rank", "mean_rank",
26
+ "diff_height_cm", "mean_height_cm",
27
+ "diff_weight_kg", "mean_weight_kg",
28
+ "diff_nb_pro_years", "diff_age",
29
+ "diff_play_hand", "diff_back_hand"],
30
+ categorical_columns=["tournament_series", "tournament_surface", "tournament_court",],
31
+ )
32
+ }
33
+
34
+ def __init__(self, api_key: str, project_id: str):
35
  self._api_key = api_key
36
  self._project_id = project_id
 
37
 
38
  self.workspace = CloudWorkspace(
39
  token=api_key,
 
48
  if not self.project:
49
  raise ValueError("Project not found. Please check your project ID.")
50
 
51
+ def check_data(self, df: pd.DataFrame, ref_df: pd.DataFrame, tags: Optional[List[str]] = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  eval_data = Dataset.from_pandas(
53
  data=df,
54
+ data_definition=self.schemas['cleaned']
55
+ )
56
+ ref_data = Dataset.from_pandas(
57
+ data=ref_df,
58
+ data_definition=self.schemas['cleaned']
59
  )
60
 
61
  report = Report(
 
67
  )
68
 
69
  # Run the report
70
+ logger.info("Running the report...")
71
+ my_eval = report.run(current_data=eval_data, reference_data=ref_data, tags=tags)
72
 
73
  # Save the evaluation to the workspace
74
+ logger.info("Saving the evaluation to the workspace...")
75
+ snapshot_id = self.workspace.add_run(self.project.id, my_eval, include_data=False)
76
+
77
+ logger.info(f"Evaluation saved with snapshot ID: {snapshot_id}")
78
+
79
+ return snapshot_id
80
+
81
+ def check_model_data(
82
+ model_name: str,
83
+ checker: DataChecker,
84
+ ) -> str:
85
+ """
86
+ Check the model data using Evidently.
87
+ """
88
+ # Get the newest data from the database
89
+ df = load_model_data()
90
+
91
+ # Get the training dataset
92
+ ref_df = get_training_dataset(model_name=model_name)
93
+
94
+ # Check the data
95
+ return checker.check_data(df, ref_df, tags=[model_name])