sghorbal commited on
Commit
7effb2a
·
0 Parent(s):

initial commit

Browse files
Files changed (19) hide show
  1. .env.example +16 -0
  2. .gitattributes +1 -0
  3. .gitignore +177 -0
  4. Dockerfile +26 -0
  5. Jenkinsfile +79 -0
  6. LICENSE +21 -0
  7. README.md +44 -0
  8. api.png +3 -0
  9. entrypoint.sh +4 -0
  10. requirements.txt +10 -0
  11. src/__init__.py +0 -0
  12. src/enums.py +38 -0
  13. src/main.py +222 -0
  14. src/model.py +310 -0
  15. src/sql.py +73 -0
  16. tests/__init__.py +0 -0
  17. tests/conftest.py +39 -0
  18. tests/test_enums.py +21 -0
  19. tests/test_model.py +23 -0
.env.example ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PostgreSQL database connection information
2
+ PG_USER=
3
+ PG_PASSWORD=
4
+ PG_HOST=
5
+ PG_PORT=
6
+ PG_DB=
7
+ # One among disable, allow, prefer, require, verify-ca and verify-full
8
+ PG_SSLMODE=
9
+
10
+ # If set, protects the API from unauthorized calls
11
+ FASTAPI_API_KEY=
12
+
13
+ MLFLOW_SERVER_URI=
14
+
15
+ AWS_ACCESS_KEY_ID=
16
+ AWS_SECRET_ACCESS_KEY=
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ api.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data/*
2
+ **/*.ipynb
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119
+ .pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ .DS_Store
177
+ *.bck
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM tiangolo/uvicorn-gunicorn:python3.11

COPY ./requirements.txt /tmp/requirements.txt
COPY ./entrypoint.sh /tmp/entrypoint.sh
COPY ./src /app/src
COPY ./tests /app/tests

RUN pip install --no-cache-dir -r /tmp/requirements.txt

WORKDIR /app

ENV PYTHONPATH=/app

# Port to expose
EXPOSE 7860

# Health Check (curl -f fails on HTTP 5xx, so the probe reports unhealthy
# when /check_health signals an error)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD [ "curl", "-f", "http://localhost:7860/check_health" ]

# COPY preserves the permissions of the source file, which may not be
# executable in the build context — make sure the entrypoint can run.
RUN chmod +x /tmp/entrypoint.sh

# Create a non-root user 'appuser' and switch to this user
RUN useradd --create-home appuser
USER appuser

# CMD with JSON notation
CMD ["/tmp/entrypoint.sh"]
Jenkinsfile ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
pipeline {
    agent any

    stages {
        stage('Checkout') {
            steps {
                // Checkout the code from the repository
                git branch: 'master', url: 'https://github.com/slim-git/tennis-api/'
            }
        }

        stage('Build Docker Image') {
            steps {
                script {
                    // Build the Docker image using the Dockerfile
                    sh 'docker build -t tennis_api .'
                }
            }
        }

        stage('Run Tests Inside Docker Container') {
            steps {
                withCredentials([
                    string(credentialsId: 'MLFLOW_SERVER_URI', variable: 'MLFLOW_SERVER_URI'),
                    string(credentialsId: 'AWS_ACCESS_KEY_ID', variable: 'AWS_ACCESS_KEY_ID'),
                    string(credentialsId: 'AWS_SECRET_ACCESS_KEY', variable: 'AWS_SECRET_ACCESS_KEY'),
                    string(credentialsId: 'PG_USER', variable: 'PG_USER'),
                    string(credentialsId: 'PG_PASSWORD', variable: 'PG_PASSWORD'),
                    string(credentialsId: 'PG_HOST', variable: 'PG_HOST'),
                    string(credentialsId: 'PG_PORT', variable: 'PG_PORT'),
                    string(credentialsId: 'PG_DB', variable: 'PG_DB'),
                    string(credentialsId: 'PG_SSLMODE', variable: 'PG_SSLMODE')
                ]) {
                    // FIX: the previous `writeFile` used a Groovy single-quoted string,
                    // which does NOT interpolate, so env.list contained literal
                    // '${MLFLOW_SERVER_URI}' placeholders and the container never saw
                    // the real values. Writing the file from the shell lets the shell
                    // expand the variables from the environment at runtime.
                    // KEEP SINGLE QUOTES FOR SECURITY PURPOSES
                    // (https://www.jenkins.io/doc/book/pipeline/jenkinsfile/#handling-credentials)
                    sh '''
                        cat > env.list <<EOF
MLFLOW_SERVER_URI=${MLFLOW_SERVER_URI}
AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
PG_USER=${PG_USER}
PG_PASSWORD=${PG_PASSWORD}
PG_HOST=${PG_HOST}
PG_PORT=${PG_PORT}
PG_DB=${PG_DB}
PG_SSLMODE=${PG_SSLMODE}
EOF
                    '''

                    // Run a temporary Docker container and pass env variables securely via --env-file
                    sh '''
                        docker run --rm --env-file env.list \
                            tennis_api \
                            bash -c "pytest --maxfail=1 --disable-warnings"
                    '''
                }
            }
        }
    }

    post {
        always {
            // Clean up workspace and remove dangling Docker images
            sh 'docker system prune -f'
        }
        success {
            withCredentials([
                string(credentialsId: 'HF_USERNAME', variable: 'HF_USERNAME'),
                string(credentialsId: 'HF_TOKEN', variable: 'HF_TOKEN')
            ]) {
                echo 'Pipeline completed successfully! Pushing to huggingFace'
                sh 'git push --force https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/tennis-api master:main'
            }
        }
        failure {
            echo 'Pipeline failed. Check logs for errors.'
        }
    }
}
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 slim-git
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Tennis Api
3
+ emoji: ⚡
4
+ colorFrom: purple
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: API for training and interacting with tennis-insights models
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
+
13
+ # tennis-api
14
+
15
+ ## Docker Install
16
+
17
+ To get the service for training the model and giving predictions up and running locally, simply follow the steps hereafter:
18
+
19
+ ### Build the API image
20
+
21
+ From the root of the project:
22
+ ```bash
23
+ $> docker build . -t tennis_api:latest -f Dockerfile
24
+ ```
25
+
26
+ ### Run it
27
+
28
+ From the root of the project:
29
+ ```bash
30
+ $> docker run --rm -p 7860:7860 --mount type=bind,src=./.env,target=/app/.env tennis_api:latest
31
+ ```
32
+
33
+ Then go to [http://localhost:7860/](http://localhost:7860/)
34
+
35
+ The API should be accessible:
36
+ ![exposed API methods](api.png)
37
+
38
+ ## Resources
39
+
40
+ Website: [http://www.tennis-data.co.uk/alldata.php](http://www.tennis-data.co.uk/alldata.php)
41
+
42
+ ## License
43
+
44
+ ©2025
api.png ADDED

Git LFS Details

  • SHA256: 58132cdf8349fbb8e627477aa6243b7678eb3249dc24dab210a9723dcf0691b8
  • Pointer size: 131 Bytes
  • Size of remote file: 247 kB
entrypoint.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/bin/bash
# Abort on errors, unset variables, and failures inside pipes.
set -euo pipefail

# Run the API. `exec` replaces the shell so uvicorn becomes PID 1 and
# receives SIGTERM directly when the container is stopped.
exec uvicorn src.main:app --host 0.0.0.0 --port 7860
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv
2
+ fastapi
3
+ psycopg2-binary
4
+ pandas
5
+ scikit-learn
6
+ openpyxl
7
+ xlrd >= 2.0.1
8
+ mlflow
9
+ boto3
10
+ pytest
src/__init__.py ADDED
File without changes
src/enums.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from enum import Enum
from typing import List, Literal

class Feature(Enum):
    """
    Model features. Each member carries the dataframe column it maps to
    (used as the enum value) and whether it is categorical or numerical.
    """
    _name: str
    _type: Literal['category', 'number']

    SERIES = ('Series', 'category')
    SURFACE = ('Surface', 'category')
    COURT = ('Court', 'category')
    ROUND = ('Round', 'category')
    DIFF_RANKING = ('diffRanking', 'number')
    DIFF_POINTS = ('diffPoints', 'number')

    def __new__(cls, column: str, kind: Literal['category', 'number']):
        member = object.__new__(cls)
        member._value_ = column
        member._name = column
        member._type = kind
        return member

    @property
    def name(self):
        # Deliberately shadows Enum.name: returns the column name
        # (e.g. 'diffRanking'), not the member identifier.
        return self._name

    @property
    def type(self):
        # 'category' or 'number'.
        return self._type

    @classmethod
    def get_features_by_type(cls, type: Literal['category', 'number']) -> List['Feature']:
        """Return every feature whose kind matches `type`."""
        return [member for member in cls if member.type == type]

    @classmethod
    def get_all_features(cls) -> List['Feature']:
        """Return every declared feature, in declaration order."""
        return list(cls)
src/main.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import joblib
import logging
import secrets
from typing import Literal, Optional, Annotated
from datetime import datetime
from fastapi import (
    FastAPI,
    Request,
    HTTPException,
    Query,
    Security,
    Depends
)
from fastapi.background import BackgroundTasks
from fastapi.responses import RedirectResponse
from fastapi.security.api_key import APIKeyHeader
from pydantic import BaseModel, Field
from starlette.status import HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND
from dotenv import load_dotenv
from mlflow.exceptions import RestException

from src.model import (
    run_experiment,
    train_model_from_scratch,
    predict,
    list_registered_models,
    load_model
)
from src.sql import (
    _get_connection,
    list_tournaments as _list_tournaments,
)

# ------------------------------------------------------------------------------

load_dotenv()
FASTAPI_API_KEY = os.getenv("FASTAPI_API_KEY")
# Hosts that may always call the API, even without a key.
safe_clients = ['127.0.0.1']

api_key_header = APIKeyHeader(name='Authorization', auto_error=False)

async def validate_api_key(request: Request, key: str = Security(api_key_header)):
    '''
    Reject the request unless it comes from a safe client or carries the
    configured API key in the 'Authorization' header.

    Args:
        request (Request): Incoming request, used for the client host.
        key (str): The API key to check.

    Raises:
        HTTPException: 403 if the API key is invalid.
    '''
    if request.client.host in safe_clients:
        return None
    # compare_digest gives a constant-time comparison of the key.
    if secrets.compare_digest(str(key), str(FASTAPI_API_KEY)):
        return None
    raise HTTPException(
        status_code=HTTP_403_FORBIDDEN, detail="Unauthorized - API Key is wrong"
    )

# Enforce the key on every route, but only when one is configured.
app = FastAPI(dependencies=[Depends(validate_api_key)] if FASTAPI_API_KEY else None,
              title="Tennis Insights API")
61
+
62
+
63
+ # ------------------------------------------------------------------------------
64
+ @app.get("/", include_in_schema=False)
65
+ def redirect_to_docs():
66
+ '''
67
+ Redirect to the API documentation.
68
+ '''
69
+ return RedirectResponse(url='/docs')
70
+
71
+ @app.get("/train_model", tags=["model"], deprecated=True)
72
+ async def train_model(
73
+ background_tasks: BackgroundTasks,
74
+ circuit: Literal["atp", "wta"] = 'atp',
75
+ from_date: str = "2024-01-01",
76
+ to_date: str = "2024-12-31"):
77
+ """
78
+ Train the model
79
+ """
80
+ # Check dates format
81
+ try:
82
+ datetime.strptime(from_date, "%Y-%m-%d")
83
+ datetime.strptime(to_date, "%Y-%m-%d")
84
+ except ValueError:
85
+ return {"message": "Invalid date format. Please use the format 'YYYY-MM-DD'"}
86
+
87
+ background_tasks.add_task(
88
+ func=train_model_from_scratch,
89
+ circuit=circuit,
90
+ from_date=from_date,
91
+ to_date=to_date)
92
+
93
+ return {"message": "Model training in progress"}
94
+
95
+ @app.get("/run_experiment", tags=["model"], description="Schedule a run of the ML experiment")
96
+ async def run_xp(
97
+ background_tasks: BackgroundTasks,
98
+ circuit: Literal["atp", "wta"] = 'atp',
99
+ from_date: str = "2024-01-01",
100
+ to_date: str = "2024-12-31"):
101
+ """
102
+ Train the model
103
+ """
104
+ # Check dates format
105
+ try:
106
+ datetime.strptime(from_date, "%Y-%m-%d")
107
+ datetime.strptime(to_date, "%Y-%m-%d")
108
+ except ValueError:
109
+ return {"message": "Invalid date format. Please use the format 'YYYY-MM-DD'"}
110
+
111
+ background_tasks.add_task(
112
+ func=run_experiment,
113
+ circuit=circuit,
114
+ from_date=from_date,
115
+ to_date=to_date)
116
+
117
+ return {"message": "Experiment scheduled"}
118
+
119
class ModelInput(BaseModel):
    """Query parameters describing the match to predict."""
    rank_player_1: int = Field(gt=0, default=1, description="The rank of the 1st player")
    rank_player_2: int = Field(gt=0, default=100, description="The rank of the 2nd player")
    points_player_1: int = Field(gt=0, default=4000, description="The number of points of the 1st player")
    points_player_2: int = Field(gt=0, default=500, description="The number of points of the 2nd player")
    court: Literal['Outdoor', 'Indoor'] = 'Outdoor'
    surface: Literal['Grass', 'Carpet', 'Clay', 'Hard'] = 'Clay'
    # Fix: '3rd Round' (the original Literal had the typo '3nd Round',
    # which made the real round value impossible to submit).
    round: Literal['1st Round', '2nd Round', '3rd Round', '4th Round', 'Quarterfinals', 'Semifinals', 'The Final', 'Round Robin'] = '1st Round'
    series: Literal['Grand Slam', 'Masters 1000', 'Masters', 'Masters Cup', 'ATP500', 'ATP250', 'International Gold', 'International'] = 'Grand Slam'
    # Registry model to use; empty/None falls back to the local pickle.
    model: Optional[str] = 'LogisticRegression'
    version: Optional[str] = 'latest'

class ModelOutput(BaseModel):
    """Prediction returned to the caller."""
    result: int = Field(description="The prediction result. 1 if player 1 is expected to win, 0 otherwise.", example=1)
    prob: list[float] = Field(description="Probability of [defeat, victory] of player 1.", example=[0.15, 0.85])
134
+
135
+ @app.get("/predict",
136
+ tags=["model"],
137
+ description="Predict the outcome of a tennis match",
138
+ response_model=ModelOutput)
139
+ async def make_prediction(params: Annotated[ModelInput, Query()]):
140
+ """
141
+ Predict the matches
142
+ """
143
+ if not params.model:
144
+ # check the presence of 'model.pkl' file in data/
145
+ if not os.path.exists("/data/model.pkl"):
146
+ return {"message": "Model not trained. Please train the model first."}
147
+
148
+ # Load the model
149
+ pipeline = joblib.load("/data/model.pkl")
150
+ else:
151
+ # Get the model info
152
+ try:
153
+ pipeline = load_model(params.model, params.version)
154
+ except RestException as e:
155
+ logging.error(e)
156
+
157
+ # Return HTTP error 404
158
+ return HTTPException(
159
+ status=HTTP_404_NOT_FOUND,
160
+ detail=f"Model {params.model} not found"
161
+ )
162
+
163
+ # Make the prediction
164
+ prediction = predict(
165
+ pipeline=pipeline,
166
+ rank_player_1=params.rank_player_1,
167
+ rank_player_2=params.rank_player_2,
168
+ points_player_1=params.points_player_1,
169
+ points_player_2=params.points_player_2,
170
+ court=params.court,
171
+ surface=params.surface,
172
+ round_stage=params.round,
173
+ series=params.series
174
+ )
175
+
176
+ logging.info(prediction)
177
+
178
+ return prediction
179
+
180
+ @app.get("/list_available_models", tags=["model"], description="List the available models")
181
+ async def list_available_models():
182
+ """
183
+ List the available models
184
+ """
185
+ return list_registered_models()
186
+
187
+
188
class Tournament(BaseModel):
    """A distinct tournament row from the reference data."""
    name: str = Field(description="The tournament's name.", example='Wimbledon')
    series: Literal['ATP250', 'ATP500', 'Grand Slam', 'Masters 1000', 'Masters', 'Masters Cup', 'International Gold', 'International'] = 'Grand Slam'
    court: Literal['Outdoor', 'Indoor'] = 'Outdoor'
    surface: Literal['Grass', 'Carpet', 'Clay', 'Hard'] = 'Grass'

@app.get("/{circuit}/tournaments", tags=["reference"], description="List the tournaments of the circuit", response_model=list[Tournament])
async def list_tournaments(circuit: Literal["atp", "wta"]):
    """
    Return every distinct tournament of the given circuit.
    """
    tournaments = _list_tournaments(circuit)
    return tournaments
200
+
201
+ @app.get("/check_health", tags=["general"], description="Check the health of the API")
202
+ async def check_health():
203
+ """
204
+ Check all the services in the infrastructure are working
205
+ """
206
+ healthy = 0
207
+ unhealthy = 1
208
+
209
+ # DB check
210
+ db_status = False
211
+ try:
212
+ with _get_connection() as conn:
213
+ with conn.cursor() as cursor:
214
+ cursor.execute("SELECT 1")
215
+ db_status = True
216
+ except Exception:
217
+ pass
218
+
219
+ if db_status:
220
+ return healthy
221
+ else:
222
+ return unhealthy
src/model.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import joblib
4
+ import logging
5
+ import pandas as pd
6
+ from dotenv import load_dotenv
7
+ from typing import Literal, Any, Tuple, Dict, List
8
+ import mlflow
9
+ from mlflow.models import infer_signature
10
+ from mlflow.tracking import MlflowClient
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
14
+ from sklearn.compose import ColumnTransformer
15
+ from sklearn.linear_model import LogisticRegression
16
+ from sklearn.pipeline import Pipeline
17
+ from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
18
+
19
+ from src.sql import load_matches_from_postgres
20
+ from src.enums import Feature
21
+
22
+ load_dotenv()
23
+
24
+ models = {}
25
+
26
def create_pairwise_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a balanced dataset with pairwise comparisons: every match yields
    one winner-first record (target=1) and one mirrored record (target=0).
    """
    records = []
    for _, match in df.iterrows():
        # Winner in position 1, loser in position 2.
        base = {
            Feature.SERIES.name: match['series'],
            Feature.SURFACE.name: match['surface'],
            Feature.COURT.name: match['court'],
            Feature.ROUND.name: match['round'],
            Feature.DIFF_RANKING.name: match['w_rank'] - match['l_rank'],
            Feature.DIFF_POINTS.name: match['w_points'] - match['l_points'],
            'target': 1,
        }

        # Same match with the players swapped: differences change sign,
        # and the first player now lost.
        mirrored = dict(base)
        mirrored[Feature.DIFF_RANKING.name] = -base[Feature.DIFF_RANKING.name]
        mirrored[Feature.DIFF_POINTS.name] = -base[Feature.DIFF_POINTS.name]
        mirrored['target'] = 0

        records.extend((base, mirrored))

    return pd.DataFrame(records)
53
+
54
def create_pipeline() -> Pipeline:
    """
    Build the model pipeline: impute + scale numeric features, one-hot
    encode categorical ones, then a LogisticRegression classifier.

    Returns:
        Pipeline: An unfitted scikit-learn pipeline.
    """
    categorical = [f.name for f in Feature.get_features_by_type('category')]
    numerical = [f.name for f in Feature.get_features_by_type('number')]

    # Numerical columns: mean-impute missing values, then standardize.
    numeric_steps = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_steps, numerical),
            # handle_unknown='ignore' keeps prediction working when an
            # unseen category shows up at inference time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ]
    )

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ])
89
+
90
def train_model_from_scratch(
        circuit: Literal['atp', 'wta'],
        from_date: str,
        to_date: str,
        output_path: str = '/data/model.pkl') -> Pipeline:
    """
    Load matches from Postgres, train a fresh pipeline and persist it.

    Args:
        circuit: 'atp' or 'wta' — selects the source table.
        from_date: Training window start, 'YYYY-MM-DD'.
        to_date: Training window end, 'YYYY-MM-DD'.
        output_path: Where the fitted pipeline is dumped with joblib.

    Returns:
        Pipeline: The fitted pipeline.
    """
    matches = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)

    fitted = create_and_train_model(matches)

    # Persist for the local (non-registry) prediction path.
    joblib.dump(fitted, output_path)

    return fitted
111
+
112
def create_and_train_model(data: pd.DataFrame) -> Pipeline:
    """
    Build a pipeline and fit it on the training split of `data`.
    """
    # Only the training split is used here; the test split is discarded.
    X_train, _, y_train, _ = preprocess_data(data)
    return train_model(create_pipeline(), X_train, y_train)
124
+
125
def train_model(
    pipeline: Pipeline,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame) -> Pipeline:
    """
    Fit `pipeline` on the training data and return it
    (sklearn's fit returns the estimator itself).
    """
    return pipeline.fit(X_train, y_train)
134
+
135
def preprocess_data(df: pd.DataFrame, random_state: int = None) -> Tuple:
    """
    Split the dataframe into X (features) and y (target).

    Args:
        df (pd.DataFrame): Input dataframe of raw matches.
        random_state (int): Optional seed for a reproducible split.
            None (the default) keeps the original non-deterministic
            behaviour.

    Returns:
        Tuple: Split data (X_train, X_test, y_train, y_test).
    """
    # Format data for the model: one winner-first and one mirrored record
    # per match, so the target classes are balanced.
    df_model = create_pairwise_data(df)

    features = [f.name for f in Feature.get_all_features()]
    X = df_model[features]
    y = df_model['target']

    # Split the data
    return train_test_split(X, y, test_size=0.2, random_state=random_state)
154
+
155
def evaluate_model(pipeline: Pipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """
    Evaluate the fitted pipeline on a held-out set.

    Returns:
        Dict: accuracy, ROC AUC and the confusion matrix.
    """
    predictions = pipeline.predict(X_test)
    # ROC AUC needs the probability of the positive class, not the labels.
    positive_scores = pipeline.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, predictions),
        "roc_auc": roc_auc_score(y_test, positive_scores),
        "confusion_matrix": confusion_matrix(y_test, predictions),
    }
169
+
170
def predict(
    pipeline: Pipeline,
    series: str,
    surface: str,
    court: str,
    round_stage: str,
    rank_player_1: int,
    rank_player_2: int,
    points_player_1: int,
    points_player_2: int
) -> Dict[str, Any]:
    """
    Predict the outcome of a single match with a fitted pipeline.

    Args:
        pipeline: Fitted pipeline built by create_pipeline().
        series/surface/court/round_stage: Categorical match context.
        rank_player_*/points_player_*: Player 1 and 2 ranking and points.

    Returns:
        Dict: {"result": 0|1, "prob": [p_defeat, p_victory]} for player 1.
    """
    diffRanking = rank_player_1 - rank_player_2
    diffPoints = points_player_1 - points_player_2

    # Build a one-row DataFrame with the new match
    new_match = pd.DataFrame([{
        Feature.SERIES.name: series,
        Feature.SURFACE.name: surface,
        Feature.COURT.name: court,
        Feature.ROUND.name: round_stage,
        Feature.DIFF_RANKING.name: diffRanking,
        Feature.DIFF_POINTS.name: diffPoints
    }])

    # Use the pipeline to make a prediction
    prediction = pipeline.predict(new_match)[0]
    proba = pipeline.predict_proba(new_match)[0]

    # Log the result. Fix: lazy %-style arguments instead of f-strings,
    # so the messages are only formatted when the level is enabled.
    logging.info("\n--- 📊 Result ---")
    logging.info("🏆 Win probability : %.2f", proba[1])
    logging.info("❌ Lose probability : %.2f", proba[0])
    logging.info("🎾 Prediction : %s", 'Victory' if prediction == 1 else 'Loss')

    # .item() converts numpy scalars to plain Python types for JSON.
    return {"result": prediction.item(), "prob": [p.item() for p in proba]}
205
+
206
def run_experiment(
    circuit: Literal['atp', 'wta'],
    from_date: str,
    to_date: str,
    artifact_path: str = None,
    registered_model_name: str = 'LogisticRegression',
    experiment_name: str = 'Logistic Tennis Prediction',
):
    """
    Run the entire ML experiment pipeline and log it to MLflow.

    Args:
        circuit (str): 'atp' or 'wta' — selects the source table.
        from_date (str): Training window start, 'YYYY-MM-DD'.
        to_date (str): Training window end, 'YYYY-MM-DD'.
        artifact_path (str): Path to store the model artifact
            (defaults to '<circuit>_model').
        registered_model_name (str): Name to register the model under in MLflow.
        experiment_name (str): Name of the MLflow experiment.
    """
    if not artifact_path:
        artifact_path = f'{circuit}_model'

    # Point MLflow at the configured tracking server.
    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])

    start_time = time.time()

    # Load and split the data.
    df = load_matches_from_postgres(
        table_name=f"{circuit}_data",
        from_date=from_date,
        to_date=to_date)
    X_train, X_test, y_train, y_test = preprocess_data(df)

    pipe = create_pipeline()

    # Select (and create if needed) the experiment, then fetch its id.
    mlflow.set_experiment(experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

    # Let MLflow capture parameters and metrics automatically.
    mlflow.sklearn.autolog()

    with mlflow.start_run(experiment_id=experiment.experiment_id):
        train_model(pipe, X_train, y_train)

        # Hold-out accuracy for the log.
        accuracy = pipe.score(X_test, y_test)

        logging.info("LogisticRegression model")
        logging.info("Accuracy: {}".format(accuracy))

        # Record the input/output schema alongside the model.
        signature = infer_signature(X_test, pipe.predict(X_test))

        mlflow.sklearn.log_model(
            sk_model=pipe,
            artifact_path=artifact_path,
            registered_model_name=registered_model_name,
            signature=signature
        )

    logging.info(f"...Training Done! --- Total training time: {time.time() - start_time} seconds")
+
274
def list_registered_models() -> List[Dict]:
    """
    List all the models registered in MLflow.

    Returns:
        List[Dict]: One entry per latest model version, with
            'name', 'run_id' and 'version'.
    """
    # Point MLflow at the configured tracking server.
    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])

    results = mlflow.search_registered_models()

    return [
        {"name": mv.name, "run_id": mv.run_id, "version": mv.version}
        for res in results
        for mv in res.latest_versions
    ]
+
291
def load_model(name: str, version: str = 'latest') -> Pipeline:
    """
    Load a registered model from MLflow, with an in-process cache.

    Args:
        name: Registered model name.
        version: Registry version number as a string, or 'latest'.

    Returns:
        Pipeline: The fitted scikit-learn pipeline.
    """
    # Fix: cache per (name, version) and actually honour `version`;
    # the original ignored it and always served latest_versions[0],
    # and its name-only cache could return the wrong version.
    cache_key = (name, version)
    if cache_key in models:
        return models[cache_key]

    mlflow.set_tracking_uri(os.environ["MLFLOW_SERVER_URI"])

    # The 'models:/<name>/<version>' URI also accepts the 'latest' alias.
    pipeline = mlflow.sklearn.load_model(model_uri=f"models:/{name}/{version}")

    logging.info('Model %s (version %s) loaded', name, version)

    models[cache_key] = pipeline

    return pipeline
src/sql.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import psycopg2
from typing import Literal
from datetime import datetime
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file (no-op if absent).
load_dotenv()

# PostgreSQL connection settings; each may be None when the variable is unset.
PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB = os.getenv("PG_DB")
# One of: disable, allow, prefer, require, verify-ca, verify-full.
PG_SSLMODE = os.getenv("PG_SSLMODE")
16
+
17
def _get_connection() -> psycopg2.extensions.connection:
    """
    Open and return a new connection to the configured Postgres database.
    """
    # Connection parameters come from the module-level PG_* environment values.
    return psycopg2.connect(
        dbname=PG_DB,
        user=PG_USER,
        password=PG_PASSWORD,
        host=PG_HOST,
        port=PG_PORT,
        sslmode=PG_SSLMODE,
    )
30
+
31
def load_matches_from_postgres(
        table_name: Literal['atp_data', 'wta_data'],
        from_date: str = None,
        to_date: str = None) -> pd.DataFrame:
    """
    Load match rows from Postgres, restricted to [from_date, to_date].

    Args:
        table_name: circuit table to read ('atp_data' or 'wta_data').
        from_date: inclusive lower bound (YYYY-MM-DD); defaults to 1900-01-01.
        to_date: inclusive upper bound (YYYY-MM-DD); defaults to today.

    Returns:
        A DataFrame with one column per table column.
    """
    if not to_date:
        to_date = datetime.now().strftime("%Y-%m-%d")

    if not from_date:
        from_date = "1900-01-01"

    # table_name is constrained by the Literal annotation; the dates go
    # through query parameters, so no SQL injection is possible here.
    query = f"SELECT * FROM {table_name} WHERE date BETWEEN %s AND %s"
    params = [from_date, to_date]  # renamed from `vars`, which shadowed the builtin

    conn = _get_connection()
    try:
        # In psycopg2, `with conn:` only manages the transaction — it does
        # NOT close the connection. Close it explicitly to avoid leaking
        # a connection per call.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall()
                # Capture column names while the cursor is still open.
                columns = [desc[0] for desc in cursor.description]
    finally:
        conn.close()

    return pd.DataFrame(rows, columns=columns)
55
+
56
def list_tournaments(circuit: Literal["atp", "wta"]):
    """
    List the distinct tournaments of a circuit.

    Args:
        circuit: 'atp' or 'wta'; selects the ``<circuit>_data`` table.

    Returns:
        A list of dicts with keys ``name``, ``series``, ``court``, ``surface``.
    """
    # circuit is constrained by the Literal annotation, so the f-string
    # cannot inject arbitrary SQL.
    query = f"""
    SELECT DISTINCT
        tournament as name,
        series,
        court,
        surface
    FROM {circuit}_data;
    """
    conn = _get_connection()
    try:
        # In psycopg2, `with conn:` only commits/rolls back the transaction;
        # close the connection explicitly to avoid leaking it.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
                tournaments = [
                    {'name': row[0], 'series': row[1], 'court': row[2], 'surface': row[3]}
                    for row in cursor.fetchall()
                ]
    finally:
        conn.close()

    return tournaments
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import pandas as pd
3
+
4
@pytest.fixture
def simple_match():
    """A single raw match row: winner/loser ranking and points plus context."""
    row = {
        'series': 'ATP250',
        'surface': 'Clay',
        'court': 'Indoor',
        'round': 'Round Robin',
        'w_rank': 5,
        'l_rank': 300,
        'w_points': 2000,
        'l_points': 40,
    }
    return pd.DataFrame([row])
16
+
17
@pytest.fixture
def simple_match_pairwise_data(simple_match: pd.DataFrame):
    """Expected pairwise expansion of `simple_match`: one row per player with mirrored diffs."""
    shared = {'Series': 'ATP250', 'Surface': 'Clay', 'Court': 'Indoor', 'Round': 'Round Robin'}
    rows = [
        {**shared, 'diffRanking': -295, 'diffPoints': 1960, 'target': 1},
        {**shared, 'diffRanking': 295, 'diffPoints': -1960, 'target': 0},
    ]
    return pd.DataFrame(rows)
28
+
29
@pytest.fixture
def simple_match_empty():
    """An empty pairwise dataframe with the expected column layout."""
    columns = ['Series', 'Surface', 'Court', 'Round', 'diffRanking', 'diffPoints', 'target']
    return pd.DataFrame({col: [] for col in columns})
tests/test_enums.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.enums import Feature
2
+
3
def test_get_features_by_type():
    """
    Test the method Feature.get_features_by_type for both feature types.
    """
    # Same assertions as before, parametrized over (type, expected count).
    for feature_type, expected_count in (('category', 4), ('number', 2)):
        features = Feature.get_features_by_type(feature_type)
        assert len(features) == expected_count
        assert all(feature.type == feature_type for feature in features)
14
+
15
def test_get_all_features():
    """
    Test that Feature.get_all_features returns all six features.
    """
    assert len(Feature.get_all_features()) == 6
21
+
tests/test_model.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.pipeline import Pipeline
3
+
4
+ from src.model import create_pairwise_data, create_pipeline
5
+
6
def test_create_pairwise_data(simple_match: pd.DataFrame, simple_match_pairwise_data: pd.DataFrame):
    """One match expands into the expected mirrored pairwise rows."""
    expected = simple_match_pairwise_data
    result = create_pairwise_data(simple_match)

    assert set(expected.columns) == set(result.columns), "Columns are different"
    assert expected.equals(result), "Dataframes are different"
11
+
12
def test_create_pairwise_data_empty(simple_match_empty: pd.DataFrame):
    """An empty input yields an empty pairwise dataframe."""
    assert create_pairwise_data(simple_match_empty).empty, "Dataframe is not empty"
16
+
17
def test_create_pipeline():
    """create_pipeline builds a two-step sklearn Pipeline (preprocessor + classifier)."""
    pipeline = create_pipeline()

    assert pipeline is not None, "Pipeline is None"
    assert isinstance(pipeline, Pipeline), "Pipeline is not a Pipeline"

    steps = pipeline.named_steps
    assert len(steps) == 2, "Pipeline has wrong number of steps"
    assert 'preprocessor' in steps, "Preprocessor is missing"
    assert 'classifier' in steps, "Classifier is missing"