celpri committed on
Commit
4b22893
·
1 Parent(s): 4b2bde0

Notebooks DataDrift

Browse files
data_drift_analysis.ipynb DELETED
@@ -1,159 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "57a500a1",
6
- "metadata": {},
7
- "source": [
8
- "Charger le dataset initial"
9
- ]
10
- },
11
- {
12
- "cell_type": "code",
13
- "execution_count": 1,
14
- "id": "265ff33b",
15
- "metadata": {},
16
- "outputs": [],
17
- "source": [
18
- "import pandas as pd\n",
19
- "from sklearn.model_selection import train_test_split\n",
20
- "\n",
21
- "df = pd.read_csv(\"Data/features_clients.csv\")\n",
22
- "df = df.drop(columns=[\"SK_ID_CURR\"])"
23
- ]
24
- },
25
- {
26
- "cell_type": "markdown",
27
- "id": "55f5c7f9",
28
- "metadata": {},
29
- "source": [
30
- "Train/Test Split"
31
- ]
32
- },
33
- {
34
- "cell_type": "code",
35
- "execution_count": 2,
36
- "id": "33025b1c",
37
- "metadata": {},
38
- "outputs": [],
39
- "source": [
40
- "df_train, df_test = train_test_split(\n",
41
- " df,\n",
42
- " test_size=0.3,\n",
43
- " random_state=42\n",
44
- ")"
45
- ]
46
- },
47
- {
48
- "cell_type": "code",
49
- "execution_count": 8,
50
- "id": "ee84412a",
51
- "metadata": {},
52
- "outputs": [
53
- {
54
- "name": "stdout",
55
- "output_type": "stream",
56
- "text": [
57
- "0.7.20\n",
58
- "c:\\Users\\User\\Desktop\\Formation IA\\projet8\\projet8\\Lib\\site-packages\\evidently\\__init__.py\n"
59
- ]
60
- }
61
- ],
62
- "source": [
63
- "import evidently\n",
64
- "print(evidently.__version__)\n",
65
- "print(evidently.__file__)"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 14,
71
- "id": "dc5d67c4",
72
- "metadata": {},
73
- "outputs": [
74
- {
75
- "name": "stdout",
76
- "output_type": "stream",
77
- "text": [
78
- "['AbsMaxError', 'Accuracy', 'AlmostConstantColumnsCount', 'AlmostDuplicatedColumnsCount', 'CategoryCount', 'ColumnCorrelationMatrix', 'ColumnCorrelations', 'ColumnCount', 'ConstantColumnsCount', 'CorrelationMatrix', 'DatasetCorrelations', 'DatasetMissingValueCount', 'Diversity', 'DriftedColumnsCount', 'DummyAccuracy', 'DummyF1Score', 'DummyFNR', 'DummyFPR', 'DummyLogLoss', 'DummyMAE', 'DummyMAPE', 'DummyPrecision', 'DummyRMSE', 'DummyRecall', 'DummyRocAuc', 'DummyTNR', 'DummyTPR', 'DuplicatedColumnsCount', 'DuplicatedRowCount', 'EmptyColumnsCount', 'EmptyRowsCount', 'F1ByLabel', 'F1Score', 'FBetaTopK', 'FNR', 'FPR', 'GroupBy', 'HitRate', 'InListValueCount', 'InRangeValueCount', 'ItemBias', 'LogLoss', 'MAE', 'MAP', 'MAPE', 'MRR', 'MaxValue', 'MeanError', 'MeanValue', 'MedianValue', 'MinValue', 'MissingValueCount', 'NDCG', 'Novelty', 'OutListValueCount', 'OutRangeValueCount', 'Personalization', 'PopularityBiasMetric', 'Precision', 'PrecisionByLabel', 'PrecisionTopK', 'QuantileValue', 'R2Score', 'RMSE', 'RecCasesTable', 'Recall', 'RecallByLabel', 'RecallTopK', 'RocAuc', 'RocAucByLabel', 'RowCount', 'RowTestSummary', 'ScoreDistribution', 'Serendipity', 'StdValue', 'SumValue', 'TNR', 'TPR', 'UniqueValueCount', 'UserBias', 'ValueDrift', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_legacy', 'classification', 'column_statistics', 'data_quality', 'dataset_statistics', 'group_by', 'recsys', 'regression', 'row_test_summary']\n"
79
- ]
80
- }
81
- ],
82
- "source": [
83
- "import evidently.metrics\n",
84
- "print(dir(evidently.metrics))"
85
- ]
86
- },
87
- {
88
- "cell_type": "markdown",
89
- "id": "4a4bba44",
90
- "metadata": {},
91
- "source": [
92
- "Lancer Evidently"
93
- ]
94
- },
95
- {
96
- "cell_type": "code",
97
- "execution_count": null,
98
- "id": "ba976620",
99
- "metadata": {},
100
- "outputs": [
101
- {
102
- "ename": "AttributeError",
103
- "evalue": "'Snapshot' object has no attribute 'save'",
104
- "output_type": "error",
105
- "traceback": [
106
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
107
- "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
108
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[16]\u001b[39m\u001b[32m, line 21\u001b[39m\n\u001b[32m 14\u001b[39m report.run(reference_data=reference, current_data=current)\n\u001b[32m 16\u001b[39m snapshot = report.run(\n\u001b[32m 17\u001b[39m reference_data=reference,\n\u001b[32m 18\u001b[39m current_data=current\n\u001b[32m 19\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m21\u001b[39m \u001b[43msnapshot\u001b[49m\u001b[43m.\u001b[49m\u001b[43msave\u001b[49m(\u001b[33m\"\u001b[39m\u001b[33mdata_drift_report.html\u001b[39m\u001b[33m\"\u001b[39m)\n",
109
- "\u001b[31mAttributeError\u001b[39m: 'Snapshot' object has no attribute 'save'"
110
- ]
111
- }
112
- ],
113
- "source": [
114
- "from evidently import Report, Dataset\n",
115
- "from evidently.metrics import ValueDrift\n",
116
- "\n",
117
- "reference = Dataset.from_pandas(df_train)\n",
118
- "current = Dataset.from_pandas(df_test)\n",
119
- "\n",
120
- "metrics = []\n",
121
- "\n",
122
- "for col in df_train.columns:\n",
123
- " metrics.append(ValueDrift(column=col))\n",
124
- "\n",
125
- "report = Report(metrics)\n",
126
- "\n",
127
- "report.run(reference_data=reference, current_data=current)\n",
128
- "\n",
129
- "snapshot = report.run(\n",
130
- " reference_data=reference,\n",
131
- " current_data=current\n",
132
- ")\n",
133
- "\n",
134
- "snapshot.save_html(\"data_drift_report.html\")"
135
- ]
136
- }
137
- ],
138
- "metadata": {
139
- "kernelspec": {
140
- "display_name": "projet8 (3.12.10)",
141
- "language": "python",
142
- "name": "python3"
143
- },
144
- "language_info": {
145
- "codemirror_mode": {
146
- "name": "ipython",
147
- "version": 3
148
- },
149
- "file_extension": ".py",
150
- "mimetype": "text/x-python",
151
- "name": "python",
152
- "nbconvert_exporter": "python",
153
- "pygments_lexer": "ipython3",
154
- "version": "3.12.10"
155
- }
156
- },
157
- "nbformat": 4,
158
- "nbformat_minor": 5
159
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
monitoring/data_drift_report.html ADDED
The diff for this file is too large to render. See raw diff
 
monitoring/drif_analysis.ipynb ADDED
File without changes
tests/fonctionnal/test_api.py CHANGED
@@ -6,13 +6,13 @@ from contextlib import asynccontextmanager
6
  from src.api.main import app
7
 
8
 
9
- # ---- Neutralise le lifespan ----
10
  @asynccontextmanager
11
  async def empty_lifespan(app):
12
  yield
13
 
14
  app.router.lifespan_context = empty_lifespan
15
- # --------------------------------
16
 
17
 
18
  class DummyModel:
@@ -23,10 +23,7 @@ class DummyModel:
23
  def get_client():
24
  app.state.model = DummyModel()
25
  app.state.features = pd.DataFrame({
26
- "SK_ID_CURR": [100002],
27
- "feature_1": [0.5],
28
- "feature_2": [1.2],
29
- })
30
  return TestClient(app)
31
 
32
 
 
6
  from src.api.main import app
7
 
8
 
9
+ # Neutralise le lifespan
10
  @asynccontextmanager
11
  async def empty_lifespan(app):
12
  yield
13
 
14
  app.router.lifespan_context = empty_lifespan
15
+
16
 
17
 
18
  class DummyModel:
 
23
  def get_client():
24
  app.state.model = DummyModel()
25
  app.state.features = pd.DataFrame({
26
+ "SK_ID_CURR": [100002],})
 
 
 
27
  return TestClient(app)
28
 
29
 
tests/unit/test_input_validation.py CHANGED
@@ -2,7 +2,6 @@ import pytest
2
 
3
 
4
  def validate_input(age: int, income: float):
5
- # remplace par ta vraie fonction si elle existe
6
  if age < 0:
7
  raise ValueError("Age must be positive")
8
  if income <= 0:
 
2
 
3
 
4
  def validate_input(age: int, income: float):
 
5
  if age < 0:
6
  raise ValueError("Age must be positive")
7
  if income <= 0:
tests/unit/test_model_loading.py CHANGED
@@ -4,7 +4,7 @@ from unittest.mock import patch, MagicMock
4
  from src.model.model import load_model
5
 
6
 
7
- # 1️⃣ HF OK
8
  @patch("src.model.model.joblib.load")
9
  @patch("src.model.model.hf_hub_download")
10
  def test_load_model_from_hf(mock_hf, mock_joblib):
@@ -16,7 +16,7 @@ def test_load_model_from_hf(mock_hf, mock_joblib):
16
  assert model == "MODEL"
17
 
18
 
19
- # 2️⃣ HF échoue → MLflow OK
20
  @patch("src.model.model.hf_hub_download", side_effect=Exception("HF fail"))
21
  @patch("mlflow.sklearn.load_model")
22
  def test_load_model_fallback_mlflow(mock_mlflow, mock_hf):
@@ -27,7 +27,7 @@ def test_load_model_fallback_mlflow(mock_mlflow, mock_hf):
27
  assert model == "MLFLOW_MODEL"
28
 
29
 
30
- # 3️⃣ Tout échoue → FileNotFoundError
31
  @patch("src.model.model.hf_hub_download", side_effect=Exception("HF fail"))
32
  @patch("mlflow.sklearn.load_model", side_effect=Exception("MLflow fail"))
33
  def test_load_model_raises_error(mock_mlflow, mock_hf):
 
4
  from src.model.model import load_model
5
 
6
 
7
+ # HF OK
8
  @patch("src.model.model.joblib.load")
9
  @patch("src.model.model.hf_hub_download")
10
  def test_load_model_from_hf(mock_hf, mock_joblib):
 
16
  assert model == "MODEL"
17
 
18
 
19
+ # HF échoue → MLflow OK
20
  @patch("src.model.model.hf_hub_download", side_effect=Exception("HF fail"))
21
  @patch("mlflow.sklearn.load_model")
22
  def test_load_model_fallback_mlflow(mock_mlflow, mock_hf):
 
27
  assert model == "MLFLOW_MODEL"
28
 
29
 
30
+ # Tout échoue → FileNotFoundError
31
  @patch("src.model.model.hf_hub_download", side_effect=Exception("HF fail"))
32
  @patch("mlflow.sklearn.load_model", side_effect=Exception("MLflow fail"))
33
  def test_load_model_raises_error(mock_mlflow, mock_hf):
tests/unit/test_preprocessing.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
 
4
 
5
  def dummy_preprocess(df: pd.DataFrame):
6
- # remplace par ta vraie fonction si elle existe
7
  return df.fillna(0)
8
 
9
 
 
3
 
4
 
5
  def dummy_preprocess(df: pd.DataFrame):
6
+
7
  return df.fillna(0)
8
 
9