Spaces:

mg643
/

brewmatch

Sleeping

App Files Files Community

sam-packer committed on Apr 12

Commit

ea06981

1 Parent(s): 56785a6

Initial commit

Browse files

Files changed (27) hide show

.gitignore +73 -0
README.md +586 -1
pyproject.toml +49 -0
src/brewmatch/__init__.py +3 -0
src/brewmatch/api/__init__.py +26 -0
src/brewmatch/api/app.py +343 -0
src/brewmatch/api/schemas.py +197 -0
src/brewmatch/config.py +73 -0
src/brewmatch/data/__init__.py +13 -0
src/brewmatch/data/dataset.py +204 -0
src/brewmatch/data/download.py +88 -0
src/brewmatch/data/preprocess.py +311 -0
src/brewmatch/device.py +74 -0
src/brewmatch/evaluate.py +222 -0
src/brewmatch/evaluation/__init__.py +33 -0
src/brewmatch/evaluation/error_analysis.py +586 -0
src/brewmatch/evaluation/metrics.py +401 -0
src/brewmatch/experiment.py +492 -0
src/brewmatch/models/__init__.py +13 -0
src/brewmatch/models/base.py +142 -0
src/brewmatch/models/baseline.py +167 -0
src/brewmatch/models/classical.py +212 -0
src/brewmatch/models/neural.py +431 -0
src/brewmatch/train.py +479 -0
src/brewmatch/tuning.py +550 -0
src/brewmatch/utils.py +181 -0
uv.lock +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,73 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environments
+.venv/
+venv/
+ENV/
+# uv
+.python-version
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+# Jupyter
+.ipynb_checkpoints/
+# Data (downloaded and processed)
+data/raw/
+data/processed/
+# Model checkpoints
+models/checkpoints/
+# Experiment outputs
+experiments/
+# OS
+.DS_Store
+Thumbs.db
+# Logs
+*.log
+logs/
+# Environment variables / secrets
+.env
+.env.*
+*.pem
+kaggle.json
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+.nox/
+# mypy
+.mypy_cache/

README.md CHANGED Viewed

	@@ -1 +1,586 @@
1	- # BrewMatch

+# BrewMatch
+A machine learning-powered coffee recommendation system that matches users with coffee beans based on their taste
+preferences. Built for the Computer Vision module project, this system implements three distinct modeling approaches and
+includes a production-ready Flask API.
+## Table of Contents
+- [Overview](#overview)
+- [Installation](#installation)
+- [Quick Start](#quick-start)
+- [Project Structure](#project-structure)
+- [Data Pipeline](#data-pipeline)
+- [Models](#models)
+- [Evaluation](#evaluation)
+- [Experiment: Sensitivity Analysis](#experiment-sensitivity-analysis)
+- [API Reference](#api-reference)
+- [Deployment](#deployment)
+## Overview
+BrewMatch recommends coffee beans by learning taste profile similarities from the Coffee Quality Institute (CQI) dataset.
+Given a user's preferred taste characteristics (aroma, flavor, acidity, body, etc.), the system finds coffees with
+matching profiles.
+### Key Features
+- **Three modeling approaches**: Naive baseline, classical ML (KNN), and deep learning (neural embeddings)
+- **Comprehensive evaluation**: Precision@K, Recall@K, NDCG@K, MSE, MAE
+- **Error analysis**: Identifies mispredictions, patterns, and mitigation strategies
+- **Sensitivity analysis experiment**: Measures performance vs. training set size
+- **Production-ready API**: Flask REST API with validation and error handling
+### Taste Profile Features
+The system uses 9 sensory evaluation scores (0-10 scale):
+| Feature    | Description                                            |
+|------------|--------------------------------------------------------|
+| Aroma      | Scent/fragrance of the coffee                          |
+| Flavor     | Overall taste including sweetness, bitterness, acidity |
+| Aftertaste | Lingering taste after swallowing                       |
+| Acidity    | Brightness and liveliness of taste                     |
+| Body       | Thickness/viscosity of the coffee                      |
+| Balance    | How well flavor components work together               |
+| Uniformity | Consistency from cup to cup                            |
+| Clean Cup  | Absence of off-flavors or defects                      |
+| Sweetness  | Caramel-like, fruity, or floral notes                  |
+## Installation
+### Prerequisites
+- Python 3.13+
+- [uv](https://docs.astral.sh/uv/) package manager
+- GPU (optional): NVIDIA CUDA or Apple Silicon MPS for faster training
+- Kaggle account (for dataset download)
+### Setup
+1. **Clone the repository**
+   ```bash
+   git clone https://github.com/MrinalGoel643/BrewMatch.git
+   cd BrewMatch
+   ```
+2. **Install dependencies**
+   ```bash
+   # CPU-only or Apple Silicon (MPS)
+   uv sync
+   # With NVIDIA CUDA support
+   uv sync --extra cuda
+   ```
+3. **Configure Kaggle credentials**
+   Create `~/.kaggle/kaggle.json` with your API credentials:
+   ```json
+   {"username": "your_username", "key": "your_api_key"}
+   ```
+   Get your API key from [Kaggle Account Settings](https://www.kaggle.com/settings/account).
+## Quick Start
+```bash
+# 1. Download the CQI coffee dataset
+uv run download
+# 2. Preprocess the data
+uv run preprocess
+# 3. Train all models (with default hyperparameters)
+uv run train
+# 4. OR: Tune hyperparameters first, then train (recommended)
+uv run train --tune
+# 5. Evaluate model performance
+uv run evaluate --error-analysis
+# 6. Start the API server
+uv run serve
+```
+## Project Structure
+```
+brewmatch/
+├── pyproject.toml                 # Project config and dependencies
+├── README.md                      # This file
+├── data/
+│   ├── raw/                       # Downloaded CSV files
+│   └── processed/                 # Train/test parquet + scaler
+├── models/
+│   └── checkpoints/               # Saved model files
+├── experiments/                   # Experiment results and plots
+└── src/brewmatch/
+    ├── __init__.py
+    ├── config.py                  # Configuration settings
+    ├── device.py                  # Device detection (CUDA/MPS/CPU)
+    ├── utils.py                   # Utility functions
+    ├── train.py                   # Training script (includes Optuna tuning)
+    ├── evaluate.py                # Evaluation script
+    ├── experiment.py              # Sensitivity analysis
+    ├── data/
+    │   ├── __init__.py
+    │   ├── download.py            # Kaggle dataset downloader
+    │   ├── preprocess.py          # Data cleaning and splitting
+    │   └── dataset.py             # PyTorch Dataset classes
+    ├── models/
+    │   ├── __init__.py
+    │   ├── base.py                # Abstract base class
+    │   ├── baseline.py            # Naive baseline recommender
+    │   ├── classical.py           # KNN/cosine similarity
+    │   └── neural.py              # Neural embedding model
+    ├── evaluation/
+    │   ├── __init__.py
+    │   ├── metrics.py             # Ranking and regression metrics
+    │   └── error_analysis.py      # Error pattern detection
+    └── api/
+        ├── __init__.py
+        ├── app.py                 # Flask application
+        └── schemas.py             # Request validation
+```
+## Data Pipeline
+### Download Dataset
+```bash
+uv run download [--force]
+```
+Downloads the [CQI Coffee Quality Database](https://www.kaggle.com/datasets/volpatto/coffee-quality-database-from-cqi) from Kaggle
+to `data/raw/`. This dataset contains ~1,340 coffee samples (Arabica + Robusta) with sensory evaluations.
+| Option    | Description                     |
+|-----------|---------------------------------|
+| `--force` | Re-download even if data exists |
+### Preprocess Data
+```bash
+uv run preprocess [--test-size 0.2] [--seed 42]
+```
+Processes raw data and creates train/test splits:
+1. Loads CSV files from `data/raw/`
+2. Selects taste features and metadata columns
+3. Drops rows with missing quality scores
+4. Normalizes features using StandardScaler (fit on train only)
+5. Splits data 80/20 train/test
+6. Saves to `data/processed/`:
+    - `train.parquet` - Training data
+    - `test.parquet` - Test data
+    - `scaler.pkl` - Fitted scaler for inference
+| Option        | Description                                   |
+|---------------|-----------------------------------------------|
+| `--test-size` | Fraction for test set (default: 0.2)          |
+| `--seed`      | Random seed for reproducibility (default: 42) |
+## Models
+### 1. Naive Baseline (`NaiveBaselineRecommender`)
+Establishes a performance floor using simple heuristics.
+**Strategies:**
+- `mean`: Recommends coffees closest to the global mean taste profile (ignores user preferences)
+- `weighted_random`: Random sampling weighted by Total Cup Points
+**When to use:** Sanity check; any useful model should beat this.
+### 2. Classical ML (`ClassicalMLRecommender`)
+Uses traditional similarity-based methods.
+**Methods:**
+- `knn`: K-Nearest Neighbors with Euclidean distance (sklearn NearestNeighbors)
+- `cosine`: Cosine similarity ranking
+**Features:**
+- Optional feature normalization via StandardScaler
+- Configurable number of neighbors
+**When to use:** Fast inference, interpretable results, works well with small datasets.
+### 3. Neural Network (`NeuralRecommender`)
+Learns taste embeddings via contrastive learning.
+**Architecture:**
+- MLP encoder with residual connections
+- Maps 9 taste features to 32-dimensional embedding space
+- L2-normalized embeddings for cosine similarity
+**Training:**
+- Triplet loss with margin
+- AdamW optimizer with cosine annealing
+- Automatic positive/negative mining based on taste distance
+**When to use:** Best performance with sufficient data; captures non-linear relationships.
+### Training Models
+```bash
+uv run train [--models baseline classical neural] [--device cuda]
+```
+| Option     | Description                                                  |
+|------------|--------------------------------------------------------------|
+| `--models` | Models to train: `baseline`, `classical`, `neural`, or `all` |
+| `--device` | PyTorch device: `cuda`, `mps`, or `cpu` (auto-detected)      |
+Models are saved to `models/checkpoints/`:
+- `baseline.pkl` - Pickled baseline model
+- `classical.pkl` - Pickled KNN model
+- `neural.pt` - PyTorch neural model
+## Hyperparameter Tuning
+BrewMatch includes automated hyperparameter optimization using [Optuna](https://optuna.org/), a Bayesian optimization framework with tree-structured Parzen estimators (TPE). Tuning is integrated into the training script.
+### Training Workflow
+```bash
+# First run: uses default hyperparameters
+uv run train
+# Run with Optuna tuning (saves best params for future runs)
+uv run train --tune
+# Subsequent runs: automatically uses previously tuned hyperparameters
+uv run train
+# Re-tune anytime with --tune flag
+uv run train --tune --neural-trials 100
+```
+| Option              | Description                                                |
+|---------------------|------------------------------------------------------------|
+| `--tune`            | Run Optuna tuning before training                          |
+| `--models`          | Models to train/tune: `baseline`, `classical`, `neural`, `all` |
+| `--neural-trials`   | Number of Optuna trials for neural network (default: 50)   |
+| `--classical-trials`| Number of Optuna trials for classical ML (default: 30)     |
+| `--cv-folds`        | Cross-validation folds for tuning (default: 3)             |
+| `--device`          | PyTorch device: `cuda`, `mps`, or `cpu` (auto-detected)    |
+### Tuned Hyperparameters
+**Neural Network:**
+- `embedding_dim`: Embedding space dimension (16-128)
+- `hidden_dim`: Hidden layer size (32-256)
+- `learning_rate`: AdamW learning rate (1e-4 to 1e-2, log scale)
+- `margin`: Triplet loss margin (0.1-1.0)
+- `batch_size`: Training batch size (16, 32, 64, 128)
+**Classical ML:**
+- `method`: Similarity method (`knn` or `cosine`)
+- `n_neighbors`: Number of neighbors for KNN (5-100)
+- `normalize`: Feature normalization (True/False)
+### Outputs
+Tuned hyperparameters are saved to `models/checkpoints/hyperparameters.json` and automatically loaded on subsequent training runs.
+## Evaluation
+### Metrics
+| Metric          | Description                                                          |
+|-----------------|----------------------------------------------------------------------|
+| **Precision@K** | Proportion of top-K recommendations that are relevant                |
+| **Recall@K**    | Proportion of relevant items found in top-K                          |
+| **NDCG@K**      | Normalized Discounted Cumulative Gain (rewards early relevant items) |
+| **MSE**         | Mean Squared Error of taste profile predictions                      |
+| **MAE**         | Mean Absolute Error of taste profile predictions                     |
+**Relevance definition:** A coffee is relevant if it shares the same country AND processing method as the query, OR has
+cosine similarity >= 0.95.
+### Running Evaluation
+```bash
+uv run evaluate [--models all] [--error-analysis] [--output results.json]
+```
+| Option             | Description                                                     |
+|--------------------|-----------------------------------------------------------------|
+| `--models`         | Models to evaluate: `baseline`, `classical`, `neural`, or `all` |
+| `--error-analysis` | Generate detailed error analysis                                |
+| `--output`         | Save results to JSON file                                       |
+### Error Analysis
+The error analysis module identifies:
+1. **5 Worst Mispredictions** with root cause analysis:
+    - Origin mismatch
+    - Processing method mismatch
+    - Large taste profile deviations
+2. **Common Error Patterns**:
+    - Failures by country of origin
+    - Failures by processing method
+    - Cross-origin confusion (e.g., confusing Ethiopia with Kenya)
+    - Taste profile edge cases (high acidity, low body)
+3. **Mitigation Strategies**:
+    - Origin-aware embeddings
+    - Processing method features
+    - Contrastive learning for confused origins
+    - Re-ranking stages
+## Experiment: Sensitivity Analysis
+Investigates how model performance varies with training set size.
+### Hypothesis
+Deep learning models benefit more from additional data, while classical models plateau earlier.
+### Running the Experiment
+```bash
+uv run experiment [--fractions 0.1 0.2 ... 1.0] [--trials 3] [--device cuda]
+```
+| Option         | Description                                              |
+|----------------|----------------------------------------------------------|
+| `--fractions`  | Training set fractions to test (default: 0.1 to 1.0)     |
+| `--trials`     | Trials per fraction for variance estimation (default: 3) |
+| `--device`     | PyTorch device                                           |
+| `--output-dir` | Directory for results (default: `experiments/`)          |
+### Outputs
+- `raw_results.json` - Per-trial metrics
+- `aggregated_results.csv` - Mean and std per model/fraction
+- `sensitivity_analysis.png` - Performance vs. training size plot
+- `sensitivity_analysis_multi.png` - Multi-metric comparison
+- `experiment_report.txt` - Text summary with findings
+## API Reference
+### Starting the Server
+```bash
+uv run serve
+```
+Or with environment variables:
+```bash
+FLASK_HOST=0.0.0.0 FLASK_PORT=8000 FLASK_DEBUG=true uv run serve
+```
+### Endpoints
+#### Health Check
+```http
+GET /health
+```
+**Response:**
+```json
+{
+  "status": "healthy",
+  "models_loaded": 3,
+  "available_models": [
+    "baseline",
+    "classical",
+    "neural"
+  ]
+}
+```
+#### List Models
+```http
+GET /api/models
+```
+**Response:**
+```json
+{
+  "models": [
+    {
+      "name": "baseline",
+      "available": true,
+      "is_fitted": true
+    },
+    {
+      "name": "classical",
+      "available": true,
+      "is_fitted": true
+    },
+    {
+      "name": "neural",
+      "available": true,
+      "is_fitted": true
+    }
+  ]
+}
+```
+#### Get Recommendations
+```http
+POST /api/recommend
+Content-Type: application/json
+{
+  "preferences": {
+    "aroma": 8.0,
+    "flavor": 7.5,
+    "aftertaste": 7.0,
+    "acidity": 7.5,
+    "body": 8.0,
+    "balance": 7.5,
+    "uniformity": 10.0,
+    "clean_cup": 10.0,
+    "sweetness": 10.0
+  },
+  "model": "neural",
+  "k": 5
+}
+```
+**Response:**
+```json
+{
+  "recommendations": [
+    {
+      "id": 42,
+      "similarity": 0.95,
+      "scores": {
+        "aroma": 7.92,
+        "flavor": 7.58
+      },
+      "country": "Ethiopia",
+      "metadata": {}
+    }
+  ],
+  "model_used": "neural",
+  "k": 5
+}
+```
+| Field         | Type    | Description                                                        |
+|---------------|---------|--------------------------------------------------------------------|
+| `preferences` | object  | Required. All 9 taste features (0-10 scale)                        |
+| `model`       | string  | Optional. `baseline`, `classical`, or `neural` (default: `neural`) |
+| `k`           | integer | Optional. Number of recommendations (1-100, default: 5)            |
+#### Get Coffee Details
+```http
+GET /api/coffee/{id}
+```
+**Response:**
+```json
+{
+  "id": 42,
+  "metadata": {
+    "Country.of.Origin": "Ethiopia",
+    "Processing.Method": "Washed / Wet"
+  },
+  "taste_profile": {
+    "aroma": 7.92
+  }
+}
+```
+#### Get Statistics
+```http
+GET /api/stats
+```
+**Response:**
+```json
+{
+  "total_coffees": 1200,
+  "models": {
+    "baseline": {
+      "is_fitted": true,
+      "training_samples": 960
+    },
+    "classical": {
+      "is_fitted": true,
+      "training_samples": 960
+    },
+    "neural": {
+      "is_fitted": true,
+      "training_samples": 960
+    }
+  }
+}
+```
+### Error Responses
+| Status | Description                               |
+|--------|-------------------------------------------|
+| 400    | Validation error (missing/invalid fields) |
+| 404    | Resource not found                        |
+| 503    | No models loaded                          |
+| 500    | Internal server error                     |
+## Deployment
+### Production with Gunicorn
+```bash
+uv run gunicorn "brewmatch.api.app:create_app()" \
+  --bind 0.0.0.0:8000 \
+  --workers 4 \
+  --timeout 120
+```
+### Docker
+```dockerfile
+FROM python:3.13-slim
+WORKDIR /app
+COPY . .
+RUN pip install uv && uv sync --frozen
+# Download and preprocess data, train models
+RUN uv run download && uv run preprocess && uv run train
+EXPOSE 8000
+CMD ["uv", "run", "gunicorn", "brewmatch.api.app:create_app()", "--bind", "0.0.0.0:8000"]
+```
+### Environment Variables
+| Variable      | Description         | Default     |
+|---------------|---------------------|-------------|
+| `FLASK_HOST`  | Server bind address | `127.0.0.1` |
+| `FLASK_PORT`  | Server port         | `5000`      |
+| `FLASK_DEBUG` | Enable debug mode   | `false`     |
+---
+**Dataset:** [Coffee Quality Database (CQI)](https://www.kaggle.com/datasets/volpatto/coffee-quality-database-from-cqi) by Diego Volpatto

pyproject.toml ADDED Viewed

	@@ -0,0 +1,49 @@

+[project]
+name = "brewmatch"
+version = "0.1.0"
+description = "Coffee recommendation system using ML - recommends coffee beans based on taste preferences"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "flask>=3.1.3",
+    "flask-cors>=6.0.2",
+    "gunicorn>=25.3.0",
+    "kagglehub>=1.0.0",
+    "matplotlib>=3.10.8",
+    "numpy>=2.4.3",
+    "optuna>=4.8.0",
+    "pandas>=3.0.2",
+    "pyarrow>=23.0.1",
+    "scikit-learn>=1.8.0",
+    "seaborn>=0.13.2",
+    "tabulate>=0.10.0",
+    "torch>=2.11.0",
+    "tqdm>=4.66.5",
+]
+[project.optional-dependencies]
+cuda = ["torch>=2.11.0"]
+[project.scripts]
+download = "brewmatch.data.download:main"
+preprocess = "brewmatch.data.preprocess:main"
+train = "brewmatch.train:main"
+evaluate = "brewmatch.evaluate:main"
+experiment = "brewmatch.experiment:main"
+serve = "brewmatch.api.app:main"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/brewmatch"]
+[[tool.uv.index]]
+name = "pytorch-cu130"
+url = "https://download.pytorch.org/whl/cu130"
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cu130", extra = "cuda" },
+]

src/brewmatch/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """BrewMatch - Coffee Recommendation System using Machine Learning."""
2	+
3	+ __version__ = "0.1.0"

src/brewmatch/api/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+"""Flask API for coffee recommendations."""
+from .app import create_app, main
+from .schemas import (
+    TASTE_FEATURES,
+    VALID_MODELS,
+    ValidationError,
+    validate_preferences,
+    validate_model_name,
+    validate_k,
+    validate_coffee_id,
+    validate_recommend_request,
+)
+__all__ = [
+    "create_app",
+    "main",
+    "TASTE_FEATURES",
+    "VALID_MODELS",
+    "ValidationError",
+    "validate_preferences",
+    "validate_model_name",
+    "validate_k",
+    "validate_coffee_id",
+    "validate_recommend_request",
+]

src/brewmatch/api/app.py ADDED Viewed

	@@ -0,0 +1,343 @@

+"""Flask API for BrewMatch coffee recommendations."""
+import logging
+import os
+from pathlib import Path
+from typing import Any
+import numpy as np
+from flask import Flask, jsonify, request
+from flask_cors import CORS
+from brewmatch.models import (
+    ClassicalMLRecommender,
+    NaiveBaselineRecommender,
+    NeuralRecommender,
+)
+from brewmatch.models.base import BaseRecommender
+from .schemas import (
+    TASTE_FEATURES,
+    VALID_MODELS,
+    ValidationError,
+    validate_coffee_id,
+    validate_recommend_request,
+)
+# Configure logging at import time: INFO level, with timestamp, logger name
+# and level prepended to every message.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+# Module-level logger shared by the functions in this module.
+logger = logging.getLogger(__name__)
+# Model type mapping: model name -> recommender class. load_models() calls
+# ``load(path)`` on each class to restore a saved checkpoint.
+MODEL_CLASSES: dict[str, type[BaseRecommender]] = {
+    "baseline": NaiveBaselineRecommender,
+    "classical": ClassicalMLRecommender,
+    "neural": NeuralRecommender,
+}
+# File extensions for each model type. load_models() looks for
+# ``<model name><extension>`` inside the checkpoint directory, so the keys
+# here must match the keys of MODEL_CLASSES.
+MODEL_EXTENSIONS: dict[str, str] = {
+    "baseline": ".pkl",
+    "classical": ".pkl",
+    "neural": ".pt",
+}
+def load_models(checkpoint_dir: Path) -> dict[str, BaseRecommender]:
+    """Load all available models from the checkpoint directory.
+    Args:
+        checkpoint_dir: Path to the directory containing model checkpoints.
+    Returns:
+        Dictionary mapping model names to loaded model instances.
+    """
+    models: dict[str, BaseRecommender] = {}
+    if not checkpoint_dir.exists():
+        logger.warning(f"Checkpoint directory does not exist: {checkpoint_dir}")
+        return models
+    for model_name, model_class in MODEL_CLASSES.items():
+        extension = MODEL_EXTENSIONS[model_name]
+        model_path = checkpoint_dir / f"{model_name}{extension}"
+        if model_path.exists():
+            try:
+                logger.info(f"Loading {model_name} model from {model_path}")
+                models[model_name] = model_class.load(model_path)
+                logger.info(f"Successfully loaded {model_name} model")
+            except Exception as e:
+                logger.error(f"Failed to load {model_name} model: {e}")
+        else:
+            logger.info(f"No checkpoint found for {model_name} at {model_path}")
+    return models
+def create_app(config: dict[str, Any] | None = None) -> Flask:
+    """Create and configure the Flask application.
+    Args:
+        config: Optional configuration dictionary. Supported keys:
+            - CHECKPOINT_DIR: Path to model checkpoints directory.
+            - TESTING: Enable testing mode.
+            - DEBUG: Enable debug mode.
+    Returns:
+        Configured Flask application instance.
+    """
+    app = Flask(__name__)
+    # Apply configuration
+    if config:
+        app.config.update(config)
+    # Enable CORS for all routes
+    CORS(app)
+    # Determine checkpoint directory
+    checkpoint_dir = app.config.get("CHECKPOINT_DIR")
+    if checkpoint_dir:
+        checkpoint_dir = Path(checkpoint_dir)
+    else:
+        # Default to models/checkpoints relative to project root
+        # Path: app.py -> api -> brewmatch -> src -> project_root
+        checkpoint_dir = Path(__file__).parent.parent.parent.parent / "models" / "checkpoints"
+    # Load models on startup
+    app.models: dict[str, BaseRecommender] = load_models(checkpoint_dir)
+    # Store coffee data reference (populated when first model is loaded)
+    app.coffee_data: dict[int, dict[str, Any]] = {}
+    # Build coffee data index from loaded models
+    if app.models:
+        first_model = next(iter(app.models.values()))
+        if hasattr(first_model, "_metadata") and first_model._metadata is not None:
+            for idx in range(len(first_model._metadata)):
+                row = first_model._metadata.iloc[idx]
+                app.coffee_data[idx] = {
+                    "id": idx,
+                    "metadata": row.to_dict(),
+                }
+                # Add taste profile if available
+                if hasattr(first_model, "_X") and first_model._X is not None:
+                    taste_profile = first_model._X[idx]
+                    app.coffee_data[idx]["taste_profile"] = {
+                        feature.lower().replace(" ", "_"): float(taste_profile[i])
+                        for i, feature in enumerate(BaseRecommender.TASTE_FEATURES)
+                    }
+    @app.errorhandler(ValidationError)
+    def handle_validation_error(error: ValidationError):
+        """Handle validation errors with proper JSON response."""
+        response = {"error": error.message}
+        if error.field:
+            response["field"] = error.field
+        return jsonify(response), 400
+    @app.errorhandler(404)
+    def handle_not_found(error):
+        """Handle 404 errors."""
+        return jsonify({"error": "Resource not found"}), 404
+    @app.errorhandler(500)
+    def handle_internal_error(error):
+        """Handle internal server errors."""
+        logger.exception("Internal server error")
+        return jsonify({"error": "Internal server error"}), 500
+    @app.route("/health", methods=["GET"])
+    def health_check():
+        """Health check endpoint.
+        Returns:
+            JSON response with status and loaded models count.
+        """
+        return jsonify({
+            "status": "healthy",
+            "models_loaded": len(app.models),
+            "available_models": list(app.models.keys()),
+        })
+    @app.route("/api/models", methods=["GET"])
+    def list_models():
+        """List available recommendation models.
+        Returns:
+            JSON response with list of available models and their status.
+        """
+        models_info = []
+        for model_name in VALID_MODELS:
+            model_info = {
+                "name": model_name,
+                "available": model_name in app.models,
+            }
+            if model_name in app.models:
+                model = app.models[model_name]
+                model_info["is_fitted"] = model.is_fitted
+            models_info.append(model_info)
+        return jsonify({"models": models_info})
+    @app.route("/api/recommend", methods=["POST"])
+    def get_recommendations():
+        """Get coffee recommendations based on taste preferences.
+        Request body:
+            {
+                "preferences": {
+                    "aroma": 8.0,
+                    "flavor": 7.5,
+                    "aftertaste": 7.0,
+                    "acidity": 7.5,
+                    "body": 8.0,
+                    "balance": 7.5,
+                    "uniformity": 10.0,
+                    "clean_cup": 10.0,
+                    "sweetness": 10.0
+                },
+                "model": "neural",
+                "k": 5
+            }
+        Returns:
+            JSON response with list of recommended coffees.
+        """
+        data = request.get_json(silent=True)
+        validated = validate_recommend_request(data)
+        model_name = validated["model"]
+        preferences = validated["preferences"]
+        k = validated["k"]
+        # Check if requested model is available
+        if model_name not in app.models:
+            available = list(app.models.keys())
+            if not available:
+                return jsonify({
+                    "error": "No models are currently loaded",
+                }), 503
+            return jsonify({
+                "error": f"Model '{model_name}' is not available",
+                "available_models": available,
+            }), 400
+        model = app.models[model_name]
+        # Convert preferences dict to numpy array in correct order
+        # Map API field names to model feature names
+        feature_mapping = {
+            "aroma": "Aroma",
+            "flavor": "Flavor",
+            "aftertaste": "Aftertaste",
+            "acidity": "Acidity",
+            "body": "Body",
+            "balance": "Balance",
+            "uniformity": "Uniformity",
+            "clean_cup": "Clean Cup",
+            "sweetness": "Sweetness",
+        }
+        preferences_array = np.array([
+            preferences[feature.lower().replace(" ", "_")]
+            for feature in BaseRecommender.TASTE_FEATURES
+        ], dtype=np.float32)
+        try:
+            recommendations = model.recommend(preferences_array, k=k)
+        except Exception as e:
+            logger.exception("Error generating recommendations")
+            return jsonify({"error": f"Failed to generate recommendations: {str(e)}"}), 500
+        # Format response
+        formatted_recommendations = []
+        for rec in recommendations:
+            formatted_rec = {
+                "id": rec["index"],
+                "similarity": rec["score"],
+                "scores": {
+                    key.lower().replace(" ", "_"): value
+                    for key, value in rec["taste_profile"].items()
+                },
+            }
+            # Add metadata fields at top level for convenience
+            if rec.get("metadata"):
+                formatted_rec["country"] = rec["metadata"].get("Country of Origin", "Unknown")
+                formatted_rec["metadata"] = rec["metadata"]
+            formatted_recommendations.append(formatted_rec)
+        return jsonify({
+            "recommendations": formatted_recommendations,
+            "model_used": model_name,
+            "k": k,
+        })
+    @app.route("/api/coffee/<int:coffee_id>", methods=["GET"])
+    def get_coffee(coffee_id: int):
+        """Get details for a specific coffee by ID.
+        Args:
+            coffee_id: The ID of the coffee to retrieve.
+        Returns:
+            JSON response with coffee details.
+        """
+        validated_id = validate_coffee_id(coffee_id)
+        if validated_id not in app.coffee_data:
+            return jsonify({"error": f"Coffee with id {validated_id} not found"}), 404
+        coffee = app.coffee_data[validated_id]
+        return jsonify(coffee)
+    @app.route("/api/stats", methods=["GET"])
+    def get_stats():
+        """Get model performance statistics.
+        Returns:
+            JSON response with statistics about loaded models and data.
+        """
+        stats: dict[str, Any] = {
+            "total_coffees": len(app.coffee_data),
+            "models": {},
+        }
+        for model_name, model in app.models.items():
+            model_stats: dict[str, Any] = {
+                "is_fitted": model.is_fitted,
+            }
+            if hasattr(model, "_X") and model._X is not None:
+                model_stats["training_samples"] = len(model._X)
+            if hasattr(model, "_metadata") and model._metadata is not None:
+                model_stats["metadata_columns"] = list(model._metadata.columns)
+            stats["models"][model_name] = model_stats
+        return jsonify(stats)
+    return app
+def main() -> None:
+    """Launch the Flask development server for BrewMatch.
+    Invoked by `uv run serve`. The bind address, port and debug flag come
+    from the FLASK_HOST, FLASK_PORT and FLASK_DEBUG environment variables.
+    For production deployments use a WSGI server such as gunicorn instead.
+    """
+    env = os.environ
+    host = env.get("FLASK_HOST", "127.0.0.1")
+    port = int(env.get("FLASK_PORT", "5000"))
+    # Only the literal string "true" (any letter case) switches debug on.
+    debug_flag = env.get("FLASK_DEBUG", "false")
+    logger.info(f"Starting BrewMatch API server on {host}:{port}")
+    create_app().run(host=host, port=port, debug=debug_flag.lower() == "true")
+if __name__ == "__main__":
+    main()

src/brewmatch/api/schemas.py ADDED Viewed

	@@ -0,0 +1,197 @@

+"""Request/response validation schemas for the BrewMatch API."""
+from typing import Any
+# The 9 taste preference features
+# validate_preferences() walks this list in order: every name must be a key
+# of the request's `preferences` object, and absent names are reported together.
+TASTE_FEATURES = [
+    "aroma",
+    "flavor",
+    "aftertaste",
+    "acidity",
+    "body",
+    "balance",
+    "uniformity",
+    "clean_cup",
+    "sweetness",
+]
+# Valid model names
+# validate_model_name() lower-cases and strips the requested name before
+# checking membership here, so entries must stay lower-case.
+VALID_MODELS = ["baseline", "classical", "neural"]
+class ValidationError(Exception):
+    """Signal that an API request failed validation.
+    Attributes:
+        message: Human-readable description of the problem.
+        field: Name of the offending request field, or None when the
+            error is not tied to a single field.
+    """
+    def __init__(self, message: str, field: str | None = None) -> None:
+        super().__init__(message)
+        self.field = field
+        self.message = message
+def validate_preferences(preferences: dict[str, Any] | None) -> dict[str, float]:
+    """Validate taste preferences from API request.
+    Every feature in TASTE_FEATURES must be present with a finite numeric
+    score in the inclusive range 0-10. Keys that are not listed in
+    TASTE_FEATURES are ignored. Features are checked in TASTE_FEATURES order,
+    so an invalid value is reported before any missing-field summary.
+    Args:
+        preferences: Dictionary of taste preferences with feature names as keys
+            and scores as values. All 9 features must be present.
+    Returns:
+        Validated preferences dictionary with float values.
+    Raises:
+        ValidationError: If preferences is missing or not a dict, if a feature
+            is absent, null, boolean, non-numeric, NaN, or outside 0-10.
+    """
+    if preferences is None:
+        raise ValidationError("preferences is required", field="preferences")
+    if not isinstance(preferences, dict):
+        raise ValidationError(
+            "preferences must be an object", field="preferences"
+        )
+    validated: dict[str, float] = {}
+    missing_fields = []
+    for feature in TASTE_FEATURES:
+        if feature not in preferences:
+            missing_fields.append(feature)
+            continue
+        value = preferences[feature]
+        field_path = f"preferences.{feature}"
+        if value is None:
+            raise ValidationError(f"{feature} cannot be null", field=field_path)
+        # bool is an int subclass, so float(True) would silently become 1.0.
+        if isinstance(value, bool):
+            raise ValidationError(f"{feature} must be a number", field=field_path)
+        try:
+            float_value = float(value)
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError: an integer too large to be represented as a float.
+            raise ValidationError(
+                f"{feature} must be a number", field=field_path
+            ) from None
+        # Written as a positive range test on purpose: every comparison with
+        # NaN is False, so NaN fails this check instead of slipping through.
+        if not (0.0 <= float_value <= 10.0):
+            raise ValidationError(
+                f"{feature} must be between 0 and 10", field=field_path
+            )
+        validated[feature] = float_value
+    if missing_fields:
+        raise ValidationError(
+            f"Missing required fields: {', '.join(missing_fields)}",
+            field="preferences",
+        )
+    return validated
+def validate_model_name(model: str | None) -> str:
+    """Normalise and check the model name supplied in an API request.
+    Args:
+        model: Requested model name. None selects the default, 'neural'.
+            Otherwise it must match an entry of VALID_MODELS after
+            lower-casing and stripping surrounding whitespace.
+    Returns:
+        The normalised model name.
+    Raises:
+        ValidationError: If model is not a string or names an unknown model.
+    """
+    if model is None:
+        return "neural"
+    if isinstance(model, str):
+        normalized = model.lower().strip()
+        if normalized in VALID_MODELS:
+            return normalized
+        raise ValidationError(
+            f"model must be one of: {', '.join(VALID_MODELS)}", field="model"
+        )
+    raise ValidationError("model must be a string", field="model")
+def validate_k(k: Any | None) -> int:
+    """Validate k (number of recommendations) from API request.
+    Args:
+        k: Number of recommendations to return. Must be a positive integer
+            between 1 and 100. Defaults to 5 if not provided. Integral
+            floats (5.0) and numeric strings ("5") are accepted.
+    Returns:
+        Validated k value.
+    Raises:
+        ValidationError: If k is a bool, a non-integral or non-finite float,
+            cannot be converted to an integer, or lies outside 1-100.
+    """
+    if k is None:
+        return 5
+    # bool is an int subclass; int(True) == 1 would otherwise pass silently.
+    if isinstance(k, bool):
+        raise ValidationError("k must be an integer", field="k")
+    # int() truncates floats (2.7 -> 2); inf and NaN also fail is_integer().
+    if isinstance(k, float) and not k.is_integer():
+        raise ValidationError("k must be an integer", field="k")
+    try:
+        k_int = int(k)
+    except (TypeError, ValueError, OverflowError):
+        raise ValidationError("k must be an integer", field="k") from None
+    if k_int < 1:
+        raise ValidationError("k must be at least 1", field="k")
+    if k_int > 100:
+        raise ValidationError("k must be at most 100", field="k")
+    return k_int
+def validate_coffee_id(coffee_id: Any) -> int:
+    """Validate coffee ID from API request.
+    Args:
+        coffee_id: ID of the coffee to retrieve. Integers, integral floats
+            and strings that int() can parse are accepted.
+    Returns:
+        Validated coffee ID as a non-negative integer.
+    Raises:
+        ValidationError: If the ID is missing, is a bool, is a non-integral
+            or non-finite float, cannot be converted to an integer, or is
+            negative.
+    """
+    if coffee_id is None:
+        raise ValidationError("coffee id is required", field="id")
+    # bool is an int subclass; reject it rather than reading True as ID 1.
+    if isinstance(coffee_id, bool):
+        raise ValidationError("coffee id must be an integer", field="id")
+    # int() truncates floats; inf and NaN also fail is_integer().
+    if isinstance(coffee_id, float) and not coffee_id.is_integer():
+        raise ValidationError("coffee id must be an integer", field="id")
+    try:
+        id_int = int(coffee_id)
+    except (TypeError, ValueError, OverflowError):
+        raise ValidationError("coffee id must be an integer", field="id") from None
+    if id_int < 0:
+        raise ValidationError("coffee id must be non-negative", field="id")
+    return id_int
+def validate_recommend_request(data: dict[str, Any] | None) -> dict[str, Any]:
+    """Validate a complete recommendation request body.
+    Args:
+        data: Parsed request body; may carry "preferences", "model" and "k".
+    Returns:
+        Dictionary with the validated "preferences", "model" and "k" values,
+        with defaults filled in by the individual validators.
+    Raises:
+        ValidationError: If the body is missing, is not a dict, or any field
+            fails its own validation.
+    """
+    if data is None:
+        raise ValidationError("Request body is required")
+    if not isinstance(data, dict):
+        raise ValidationError("Request body must be a JSON object")
+    # Fields are validated in this order, so the first failure wins.
+    preferences = validate_preferences(data.get("preferences"))
+    model_name = validate_model_name(data.get("model"))
+    top_k = validate_k(data.get("k"))
+    return {"preferences": preferences, "model": model_name, "k": top_k}

src/brewmatch/config.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""Configuration settings for BrewMatch."""
+from pathlib import Path
+# Project paths
+# PROJECT_ROOT is three directory levels above this file.
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+DATA_DIR = PROJECT_ROOT / "data"
+RAW_DATA_DIR = DATA_DIR / "raw"
+PROCESSED_DATA_DIR = DATA_DIR / "processed"
+MODELS_DIR = PROJECT_ROOT / "models"
+CHECKPOINTS_DIR = MODELS_DIR / "checkpoints"
+# Ensure directories exist
+# NOTE: this runs at import time, so importing this module creates the
+# directories (including any missing parents) as a side effect.
+for dir_path in [RAW_DATA_DIR, PROCESSED_DATA_DIR, CHECKPOINTS_DIR]:
+    dir_path.mkdir(parents=True, exist_ok=True)
+# Dataset settings
+# NOTE(review): data/download.py passes a different dataset slug
+# ("volpatto/coffee-quality-database-from-cqi") to kagglehub - confirm which
+# one is intended and whether this constant is still used.
+KAGGLE_DATASET = "fatihb/coffee-quality-data-cqi"
+RANDOM_SEED = 42
+TEST_SIZE = 0.2
+VAL_SIZE = 0.1
+# Feature columns (taste profile) - using actual CSV column names
+# The same nine names are also declared in data/preprocess.py; keep both in sync.
+TASTE_FEATURES = [
+    "Aroma",
+    "Flavor",
+    "Aftertaste",
+    "Acidity",
+    "Body",
+    "Balance",
+    "Uniformity",
+    "Clean Cup",
+    "Sweetness",
+]
+# Metadata columns to preserve
+# NOTE(review): data/preprocess.py keeps a shorter list (no Region, Color or
+# Total Cup Points) - verify which list the pipeline actually relies on.
+METADATA_COLUMNS = [
+    "Country of Origin",
+    "Region",
+    "Processing Method",
+    "Variety",
+    "Color",
+    "Total Cup Points",
+]
+# Model hyperparameters
+BASELINE_CONFIG = {
+    "strategy": "mean_similarity",  # or "quality_weighted_random"
+}
+CLASSICAL_CONFIG = {
+    "n_neighbors": 10,
+    "metric": "cosine",
+    "algorithm": "brute",
+}
+NEURAL_CONFIG = {
+    "embedding_dim": 32,
+    "hidden_dims": [64, 32],
+    "learning_rate": 0.001,
+    "batch_size": 32,
+    "epochs": 100,
+    "margin": 0.5,  # for triplet loss
+    "patience": 10,  # early stopping
+}
+# Evaluation settings
+# Cut-offs handed to evaluate_model() as `k_values` by the evaluation script.
+K_VALUES = [1, 3, 5, 10]
+# API settings
+# NOTE(review): api/app.py main() takes host, port and debug from the
+# FLASK_HOST / FLASK_PORT / FLASK_DEBUG environment variables (defaulting to
+# 127.0.0.1:5000); whether these constants are read anywhere is not visible here.
+API_HOST = "0.0.0.0"
+API_PORT = 5000
+DEBUG = False

src/brewmatch/data/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""Data loading and preprocessing modules."""
+from .download import download_data
+from .preprocess import preprocess_data, load_processed_data
+from .dataset import CoffeeDataset, create_dataloaders
+# Public API of the data package: the names re-exported from the
+# download, preprocess and dataset submodules imported above.
+__all__ = [
+    "download_data",
+    "preprocess_data",
+    "load_processed_data",
+    "CoffeeDataset",
+    "create_dataloaders",
+]

src/brewmatch/data/dataset.py ADDED Viewed

	@@ -0,0 +1,204 @@

+"""PyTorch dataset and dataloaders for coffee quality data."""
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader, Dataset
+from .preprocess import TASTE_FEATURES, TARGET_COLUMN, load_processed_data
+class CoffeeDataset(Dataset):
+    """
+    Torch Dataset pairing coffee feature rows with a scalar target.
+    Indexing yields a (features, target) tuple for one sample.
+    Attributes:
+        features: Tensor of input features, one row per sample.
+        targets: Tensor of target values, one entry per sample.
+        metadata: Optional DataFrame of extra columns, or None.
+    """
+    def __init__(
+        self,
+        features: np.ndarray | torch.Tensor,
+        targets: np.ndarray | torch.Tensor,
+        metadata: pd.DataFrame | None = None,
+    ) -> None:
+        """
+        Store the samples, converting NumPy inputs to float32 tensors.
+        Args:
+            features: Array of shape (n_samples, n_features) with input features.
+            targets: Array of shape (n_samples,) with target values.
+            metadata: Optional DataFrame with metadata (not used in training).
+        """
+        feature_tensor = (
+            torch.from_numpy(features).float()
+            if isinstance(features, np.ndarray)
+            else features
+        )
+        target_tensor = (
+            torch.from_numpy(targets).float()
+            if isinstance(targets, np.ndarray)
+            else targets
+        )
+        self.features = feature_tensor
+        self.targets = target_tensor
+        self.metadata = metadata
+    def __len__(self) -> int:
+        """Return how many samples the dataset holds."""
+        return len(self.features)
+    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Fetch one sample.
+        Args:
+            idx: Sample index.
+        Returns:
+            Tuple of (features, target) tensors for that index.
+        """
+        sample = self.features[idx]
+        label = self.targets[idx]
+        return sample, label
+    @classmethod
+    def from_dataframe(
+        cls,
+        df: pd.DataFrame,
+        feature_cols: list[str] | None = None,
+        target_col: str | None = None,
+    ) -> "CoffeeDataset":
+        """
+        Build a dataset from the columns of a pandas DataFrame.
+        Args:
+            df: DataFrame with features and target.
+            feature_cols: List of feature column names (default: TASTE_FEATURES).
+            target_col: Target column name (default: TARGET_COLUMN).
+        Returns:
+            CoffeeDataset whose metadata holds every remaining column, or
+            None when no other columns exist.
+        """
+        feature_names = TASTE_FEATURES if feature_cols is None else feature_cols
+        target_name = TARGET_COLUMN if target_col is None else target_col
+        feature_values = df[feature_names].values
+        target_values = df[target_name].values
+        # Whatever is neither a feature nor the target is kept as metadata.
+        leftover = [
+            name
+            for name in df.columns
+            if name not in feature_names and name != target_name
+        ]
+        extra = df[leftover] if leftover else None
+        return cls(feature_values, target_values, extra)
+def create_dataloaders(
+    batch_size: int = 32,
+    val_split: float = 0.1,
+    num_workers: int = 0,
+    random_state: int = 42,
+) -> dict[str, Any]:
+    """
+    Build train, validation and test DataLoaders from the processed data.
+    The processed training frame is shuffled with a seeded generator and its
+    first ``int(n * val_split)`` shuffled rows become the validation set; the
+    processed test frame is used as-is. Only the training loader shuffles.
+    Args:
+        batch_size: Batch size for all loaders (default: 32).
+        val_split: Fraction of training data for validation (default: 0.1).
+        num_workers: Number of workers for data loading (default: 0).
+        random_state: Random seed for train/val split (default: 42).
+    Returns:
+        Dictionary containing:
+        - train_loader / val_loader / test_loader: the three DataLoaders
+        - train_dataset / val_dataset / test_dataset: the backing CoffeeDatasets
+        - n_features: Number of feature columns
+        - scaler: The scaler object returned by load_processed_data()
+    Raises:
+        FileNotFoundError: If processed data doesn't exist.
+    """
+    processed = load_processed_data()
+    full_train_df = processed["train_df"]
+    held_out_df = processed["test_df"]
+    scaler = processed["scaler"]
+    feature_cols = processed["taste_features"]
+    target_col = processed["target_column"]
+    # Seeded permutation: the leading slice is validation, the rest is training.
+    row_count = len(full_train_df)
+    val_count = int(row_count * val_split)
+    shuffled = np.random.default_rng(random_state).permutation(row_count)
+    frames = {
+        "train": full_train_df.iloc[shuffled[val_count:]].reset_index(drop=True),
+        "val": full_train_df.iloc[shuffled[:val_count]].reset_index(drop=True),
+        "test": held_out_df,
+    }
+    datasets = {
+        split: CoffeeDataset.from_dataframe(frame, feature_cols, target_col)
+        for split, frame in frames.items()
+    }
+    loaders = {
+        split: DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=(split == "train"),
+            num_workers=num_workers,
+            pin_memory=torch.cuda.is_available(),
+        )
+        for split, dataset in datasets.items()
+    }
+    print("Created dataloaders:")
+    for label, split in (("Train", "train"), ("Val", "val"), ("Test", "test")):
+        print(f"  {label}: {len(datasets[split])} samples, {len(loaders[split])} batches")
+    print(f"  Batch size: {batch_size}")
+    print(f"  Features: {len(feature_cols)}")
+    return {
+        "train_loader": loaders["train"],
+        "val_loader": loaders["val"],
+        "test_loader": loaders["test"],
+        "train_dataset": datasets["train"],
+        "val_dataset": datasets["val"],
+        "test_dataset": datasets["test"],
+        "n_features": len(feature_cols),
+        "scaler": scaler,
+    }

src/brewmatch/data/download.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""Download the CQI coffee quality dataset from Kaggle."""
+import shutil
+from pathlib import Path
+import kagglehub
+def get_project_root() -> Path:
+    """Return the nearest ancestor directory of this file that contains pyproject.toml.
+    Raises:
+        RuntimeError: If no ancestor directory holds a pyproject.toml.
+    """
+    here = Path(__file__).resolve()
+    root = next(
+        (ancestor for ancestor in here.parents if (ancestor / "pyproject.toml").exists()),
+        None,
+    )
+    if root is None:
+        raise RuntimeError("Could not find project root (no pyproject.toml found)")
+    return root
+def download_data(force: bool = False) -> Path:
+    """
+    Fetch the CQI coffee quality dataset from Kaggle into data/raw/.
+    The dataset is downloaded through kagglehub and every CSV found in the
+    downloaded directory tree is copied, by file name, into data/raw/.
+    Args:
+        force: If True, re-download even if data already exists.
+    Returns:
+        Path to the raw data directory containing the downloaded files.
+    Raises:
+        RuntimeError: If the project root cannot be located or the download
+            contains no CSV files.
+    """
+    raw_dir = get_project_root() / "data" / "raw"
+    raw_dir.mkdir(parents=True, exist_ok=True)
+    # Skip the download when CSVs are already present, unless forced.
+    present = list(raw_dir.glob("*.csv"))
+    if present and not force:
+        print(f"Data already exists in {raw_dir} ({len(present)} CSV files)")
+        print("Use force=True to re-download")
+        return raw_dir
+    print("Downloading CQI coffee quality dataset from Kaggle...")
+    print("Dataset: volpatto/coffee-quality-database-from-cqi")
+    # kagglehub returns the location of its own cache copy of the dataset.
+    cache_path = Path(
+        kagglehub.dataset_download("volpatto/coffee-quality-database-from-cqi")
+    )
+    print(f"Downloaded to cache: {cache_path}")
+    downloaded = list(cache_path.glob("**/*.csv"))
+    if not downloaded:
+        raise RuntimeError(f"No CSV files found in downloaded data at {cache_path}")
+    print(f"Copying {len(downloaded)} CSV file(s) to {raw_dir}")
+    for source in downloaded:
+        shutil.copy2(source, raw_dir / source.name)
+        print(f"  - {source.name}")
+    print(f"Data saved to {raw_dir}")
+    return raw_dir
+def main() -> None:
+    """Entry point for `uv run download`: parse CLI flags and fetch the data."""
+    import argparse
+    cli = argparse.ArgumentParser(
+        description="Download the CQI coffee quality dataset from Kaggle"
+    )
+    cli.add_argument(
+        "--force",
+        help="Re-download even if data already exists",
+        action="store_true",
+    )
+    options = cli.parse_args()
+    download_data(force=options.force)
+if __name__ == "__main__":
+    main()

src/brewmatch/data/preprocess.py ADDED Viewed

	@@ -0,0 +1,311 @@

+"""Preprocess the CQI coffee quality dataset."""
+import pickle
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+# Taste profile features (9 total) - using actual CSV column names
+# preprocess_data() requires every one of these columns, scales them with a
+# StandardScaler, and writes them as the feature columns of the output files.
+TASTE_FEATURES = [
+    "Aroma",
+    "Flavor",
+    "Aftertaste",
+    "Acidity",
+    "Body",
+    "Balance",
+    "Uniformity",
+    "Clean Cup",
+    "Sweetness",
+]
+# Target column
+# Required alongside the taste features; stored unscaled.
+TARGET_COLUMN = "Total Cup Points"
+# Metadata columns to preserve
+# Optional: a column is carried through only when the raw data contains it.
+METADATA_COLUMNS = [
+    "Country of Origin",
+    "Processing Method",
+    "Variety",
+]
+def get_project_root() -> Path:
+    """Walk up from this file to the first directory that holds pyproject.toml.
+    Raises:
+        RuntimeError: If no ancestor directory contains a pyproject.toml.
+    """
+    marker = "pyproject.toml"
+    for candidate in Path(__file__).resolve().parents:
+        if candidate.joinpath(marker).exists():
+            break
+    else:
+        raise RuntimeError("Could not find project root (no pyproject.toml found)")
+    return candidate
+def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """Rename known dot-separated CSV headers to their space-separated form.
+    Only the headers listed below are renamed; every other column keeps its
+    name. The input frame is not modified.
+    """
+    dotted_headers = (
+        "Country.of.Origin",
+        "Processing.Method",
+        "Clean.Cup",
+        "Total.Cup.Points",
+        "Cupper.Points",
+        "Category.One.Defects",
+        "Category.Two.Defects",
+    )
+    renames = {header: header.replace(".", " ") for header in dotted_headers}
+    return df.rename(columns=renames)
+def load_raw_data() -> pd.DataFrame:
+    """
+    Load raw CSV data from data/raw/.
+    Prefers merged_data_cleaned.csv if available (has most samples).
+    Falls back to combining all CSV files, read in sorted path order so the
+    combined row order - and with it any seeded split made downstream - does
+    not depend on the order in which the filesystem lists the files.
+    Returns:
+        Combined DataFrame from raw CSV files, with column names normalized.
+    Raises:
+        FileNotFoundError: If no CSV files found in raw directory.
+    """
+    project_root = get_project_root()
+    raw_dir = project_root / "data" / "raw"
+    # glob() yields entries in directory-listing order; sort for determinism.
+    csv_files = sorted(raw_dir.glob("*.csv"))
+    if not csv_files:
+        raise FileNotFoundError(
+            f"No CSV files found in {raw_dir}. Run `uv run download` first."
+        )
+    # Prefer the merged dataset if available (has most samples)
+    merged_file = raw_dir / "merged_data_cleaned.csv"
+    if merged_file.exists():
+        print(f"Loading merged dataset: {merged_file.name}")
+        return normalize_column_names(pd.read_csv(merged_file))
+    print(f"Found {len(csv_files)} CSV file(s) in {raw_dir}")
+    dfs = []
+    for csv_file in csv_files:
+        print(f"  Loading {csv_file.name}...")
+        dfs.append(normalize_column_names(pd.read_csv(csv_file)))
+    if len(dfs) == 1:
+        return dfs[0]
+    # Combine multiple CSVs
+    combined = pd.concat(dfs, ignore_index=True)
+    print(f"Combined {len(dfs)} files into {len(combined)} rows")
+    return combined
+def preprocess_data(
+    test_size: float = 0.2,
+    random_state: int = 42,
+) -> dict[str, Any]:
+    """
+    Preprocess the coffee quality dataset.
+    Steps:
+    1. Load raw data
+    2. Select relevant columns
+    3. Drop rows with missing quality scores
+    4. Normalize numeric features
+    5. Split into train/test sets
+    6. Save processed data and scaler
+    Args:
+        test_size: Fraction of data for test set (default 0.2).
+        random_state: Random seed for reproducibility.
+    Returns:
+        Dictionary with paths to saved files:
+        - train_path: Path to training parquet
+        - test_path: Path to test parquet
+        - scaler_path: Path to scaler pickle
+    """
+    project_root = get_project_root()
+    processed_dir = project_root / "data" / "processed"
+    processed_dir.mkdir(parents=True, exist_ok=True)
+    # Load raw data
+    print("Loading raw data...")
+    df = load_raw_data()
+    print(f"Loaded {len(df)} rows")
+    # Check for required columns
+    # All nine taste features and the target must exist; otherwise ValueError.
+    required_cols = TASTE_FEATURES + [TARGET_COLUMN]
+    missing_cols = [col for col in required_cols if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Missing required columns: {missing_cols}")
+    # Select columns to keep
+    # Metadata columns are optional and kept only when the raw data has them.
+    cols_to_keep = TASTE_FEATURES + [TARGET_COLUMN]
+    for col in METADATA_COLUMNS:
+        if col in df.columns:
+            cols_to_keep.append(col)
+    df = df[cols_to_keep].copy()
+    print(f"Selected {len(cols_to_keep)} columns")
+    # Report missing values before dropping
+    quality_cols = TASTE_FEATURES + [TARGET_COLUMN]
+    missing_before = df[quality_cols].isna().sum()
+    rows_with_missing = df[quality_cols].isna().any(axis=1).sum()
+    print(f"Rows with missing quality scores: {rows_with_missing}")
+    if missing_before.sum() > 0:
+        print("Missing values per column:")
+        for col, count in missing_before[missing_before > 0].items():
+            print(f"  {col}: {count}")
+    # Drop rows with missing quality scores
+    # Missing metadata values do not cause a row to be dropped.
+    df_clean = df.dropna(subset=quality_cols)
+    dropped = len(df) - len(df_clean)
+    print(f"Dropped {dropped} rows with missing quality scores")
+    print(f"Remaining: {len(df_clean)} rows")
+    if len(df_clean) == 0:
+        raise ValueError("No data remaining after dropping missing values")
+    # Split into features and target
+    X = df_clean[TASTE_FEATURES].values
+    y = df_clean[TARGET_COLUMN].values
+    metadata = df_clean[[c for c in METADATA_COLUMNS if c in df_clean.columns]]
+    # Train/test split
+    # The original index labels are split alongside X and y so the metadata
+    # rows can be matched back to each sample after the shuffle.
+    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
+        X,
+        y,
+        df_clean.index.values,
+        test_size=test_size,
+        random_state=random_state,
+    )
+    print(f"Train set: {len(X_train)} samples")
+    print(f"Test set: {len(X_test)} samples")
+    # Fit scaler on training data only
+    # The test rows are transformed with the statistics learned from training.
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+    print(f"Features normalized (mean={X_train_scaled.mean():.4f}, std={X_train_scaled.std():.4f})")
+    # Create DataFrames with scaled features
+    # The target column is stored unscaled.
+    train_df = pd.DataFrame(X_train_scaled, columns=TASTE_FEATURES)
+    train_df[TARGET_COLUMN] = y_train
+    # Add metadata using original indices
+    # .values drops the index so the column is assigned positionally.
+    for col in metadata.columns:
+        train_df[col] = metadata.loc[idx_train, col].values
+    test_df = pd.DataFrame(X_test_scaled, columns=TASTE_FEATURES)
+    test_df[TARGET_COLUMN] = y_test
+    for col in metadata.columns:
+        test_df[col] = metadata.loc[idx_test, col].values
+    # Save processed data
+    train_path = processed_dir / "train.parquet"
+    test_path = processed_dir / "test.parquet"
+    scaler_path = processed_dir / "scaler.pkl"
+    train_df.to_parquet(train_path, index=False)
+    test_df.to_parquet(test_path, index=False)
+    # The fitted scaler is pickled next to the parquet files.
+    with open(scaler_path, "wb") as f:
+        pickle.dump(scaler, f)
+    print(f"\nSaved processed data:")
+    print(f"  Train: {train_path}")
+    print(f"  Test: {test_path}")
+    print(f"  Scaler: {scaler_path}")
+    return {
+        "train_path": train_path,
+        "test_path": test_path,
+        "scaler_path": scaler_path,
+    }
+def load_processed_data() -> dict[str, Any]:
+    """
+    Read the train/test parquet files and the pickled scaler from data/processed/.
+    Returns:
+        Dictionary containing:
+        - train_df: Training DataFrame
+        - test_df: Test DataFrame
+        - scaler: The unpickled scaler object
+        - taste_features: List of taste feature column names
+        - target_column: Name of target column
+    Raises:
+        FileNotFoundError: If any of the three processed files is missing.
+    """
+    processed_dir = get_project_root() / "data" / "processed"
+    train_path = processed_dir / "train.parquet"
+    test_path = processed_dir / "test.parquet"
+    scaler_path = processed_dir / "scaler.pkl"
+    # Verify every artefact up front so a partial directory fails early.
+    for required in (train_path, test_path, scaler_path):
+        if not required.exists():
+            raise FileNotFoundError(
+                f"Processed data not found: {required}. Run `uv run preprocess` first."
+            )
+    loaded: dict[str, Any] = {
+        "train_df": pd.read_parquet(train_path),
+        "test_df": pd.read_parquet(test_path),
+    }
+    # The pickle is the one written by preprocess_data(); only load trusted files.
+    with scaler_path.open("rb") as handle:
+        loaded["scaler"] = pickle.load(handle)
+    loaded["taste_features"] = TASTE_FEATURES
+    loaded["target_column"] = TARGET_COLUMN
+    return loaded
+def main() -> None:
+    """Entry point for `uv run preprocess`: parse CLI flags and run preprocessing."""
+    import argparse
+    cli = argparse.ArgumentParser(
+        description="Preprocess the CQI coffee quality dataset"
+    )
+    cli.add_argument(
+        "--test-size",
+        default=0.2,
+        type=float,
+        help="Fraction of data for test set (default: 0.2)",
+    )
+    cli.add_argument(
+        "--seed",
+        default=42,
+        type=int,
+        help="Random seed for reproducibility (default: 42)",
+    )
+    options = cli.parse_args()
+    preprocess_data(test_size=options.test_size, random_state=options.seed)
+if __name__ == "__main__":
+    main()

src/brewmatch/device.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""Device detection and selection utilities.
+Provides automatic device selection with fallback:
+CUDA (if available) > MPS (Apple Silicon) > CPU
+"""
+import torch
+def get_device(preferred: str | None = None) -> torch.device:
+    """
+    Pick the torch device to run on.
+    Without a usable preference the order is CUDA, then MPS, then CPU.
+    Args:
+        preferred: Optional preferred device ("cuda", "mps", "cpu"),
+                   matched case-insensitively. It is honoured when that
+                   backend is available; an unavailable or unrecognised
+                   value falls back to automatic selection.
+    Returns:
+        torch.device for the selected device.
+    """
+    if preferred:
+        wanted = preferred.lower()
+        if wanted == "cpu":
+            return torch.device("cpu")
+        if wanted == "cuda" and torch.cuda.is_available():
+            return torch.device("cuda")
+        if wanted == "mps" and torch.backends.mps.is_available():
+            return torch.device("mps")
+    # No usable preference: take the best backend that is present.
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")
+def get_device_info() -> dict[str, bool | str | int]:
+    """
+    Get information about available devices.
+    Returns:
+        Dictionary with the boolean flags "cuda_available" and
+        "mps_available" and "selected", the string form of get_device().
+        When CUDA is available it also contains "cuda_device_name" (name of
+        device 0) and the integer "cuda_device_count".
+    """
+    cuda_available = torch.cuda.is_available()
+    # The annotation includes int because cuda_device_count is stored as one.
+    info: dict[str, bool | str | int] = {
+        "cuda_available": cuda_available,
+        "mps_available": torch.backends.mps.is_available(),
+        "selected": str(get_device()),
+    }
+    if cuda_available:
+        info["cuda_device_name"] = torch.cuda.get_device_name(0)
+        info["cuda_device_count"] = torch.cuda.device_count()
+    return info
+def print_device_info() -> None:
+    """Write a short summary of the selected device to stdout."""
+    details = get_device_info()
+    print(f"Device: {details['selected']}")
+    if details["cuda_available"]:
+        gpu_name = details.get("cuda_device_name", "Unknown")
+        gpu_count = details.get("cuda_device_count", 1)
+        print(f"  CUDA: {gpu_name} (x{gpu_count})")
+        return
+    if details["mps_available"]:
+        print("  MPS: Apple Silicon GPU")
+        return
+    print("  CPU: No GPU acceleration available")

src/brewmatch/evaluate.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""Evaluation script for BrewMatch models."""
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+from brewmatch.config import (
+    CHECKPOINTS_DIR,
+    K_VALUES,
+    TASTE_FEATURES,
+)
+from brewmatch.data import load_processed_data
+from brewmatch.models import (
+    NaiveBaselineRecommender,
+    ClassicalMLRecommender,
+    NeuralRecommender,
+)
+from brewmatch.evaluation import evaluate_model, generate_error_report
+def load_models() -> dict[str, Any]:
+    """Load every model whose checkpoint file exists under CHECKPOINTS_DIR.
+    Returns:
+        Mapping from model name ("baseline", "classical", "neural") to the
+        loaded model; names without a checkpoint file are left out.
+    """
+    checkpoint_specs = (
+        ("baseline", "baseline.pkl", NaiveBaselineRecommender),
+        ("classical", "classical.pkl", ClassicalMLRecommender),
+        ("neural", "neural.pt", NeuralRecommender),
+    )
+    models = {}
+    for label, filename, recommender_cls in checkpoint_specs:
+        checkpoint = CHECKPOINTS_DIR / filename
+        if not checkpoint.exists():
+            continue
+        models[label] = recommender_cls.load(checkpoint)
+        print(f"Loaded {label} model from {checkpoint}")
+    return models
+def compare_models(results: dict[str, dict[str, Any]]) -> None:
+    """Print comparison table of all models.
+    Nested metric dictionaries (for example ``{"precision@k": {5: 0.4}}``)
+    are flattened to ``precision@5``-style names; top-level int/float values
+    are kept as-is and every other value type is ignored. After the table,
+    the best model for precision@5, ndcg@5 and recall@5 is listed for
+    whichever of those metrics are present.
+    Args:
+        results: Mapping from model name to that model's metrics dictionary.
+    """
+    print("\n" + "=" * 60)
+    print("MODEL COMPARISON")
+    print("=" * 60)
+    # Flatten nested dicts (precision@k, recall@k, etc.)
+    flat_results: dict[str, dict[str, Any]] = {}
+    for model_name, metrics in results.items():
+        flat_metrics: dict[str, Any] = {}
+        for key, value in metrics.items():
+            if isinstance(value, dict):
+                base = key.replace("@k", "")
+                for k, v in value.items():
+                    flat_metrics[f"{base}@{k}"] = v
+            elif isinstance(value, (int, float)) and not isinstance(value, bool):
+                flat_metrics[key] = value
+        flat_results[model_name] = flat_metrics
+    if not flat_results:
+        print("No results to compare.")
+        return
+    all_keys = sorted({key for metrics in flat_results.values() for key in metrics})
+    # Names such as "precision@10" are already 12 characters long, so a fixed
+    # 12-character column glues neighbouring headers together. Size the
+    # columns from the longest name, keeping 12 as the minimum width.
+    col_width = max([12] + [len(key) + 2 for key in all_keys])
+    name_width = max([12] + [len(name) + 2 for name in flat_results])
+    header = f"{'Model':<{name_width}}" + "".join(
+        f"{key:>{col_width}}" for key in all_keys
+    )
+    print(header)
+    print("-" * len(header))
+    for model_name, metrics in flat_results.items():
+        row = f"{model_name:<{name_width}}"
+        for key in all_keys:
+            # A metric missing for this model is shown as nan.
+            val = metrics.get(key, float("nan"))
+            if isinstance(val, float):
+                row += f"{val:>{col_width}.4f}"
+            else:
+                row += f"{val!s:>{col_width}}"
+        print(row)
+    # Find best model for primary metrics
+    print("\nBest model per metric:")
+    for key in ["precision@5", "ndcg@5", "recall@5"]:
+        if key in all_keys:
+            best_model = max(
+                flat_results,
+                key=lambda m: flat_results[m].get(key, 0),
+            )
+            best_value = flat_results[best_model].get(key, 0)
+            print(f"  - {key}: {best_model} ({best_value:.4f})")
def main() -> None:
    """Main evaluation entry point.

    Parses command-line options, loads the processed test split and the
    available models, evaluates each selected model, optionally prints an
    error analysis per model, prints a comparison table when more than one
    model was evaluated, and optionally writes all metrics to a JSON file.
    """
    parser = argparse.ArgumentParser(description="Evaluate BrewMatch models")
    parser.add_argument(
        "--models",
        nargs="+",
        choices=["baseline", "classical", "neural", "all"],
        default=["all"],
        help="Which models to evaluate",
    )
    parser.add_argument(
        "--error-analysis",
        action="store_true",
        help="Generate detailed error analysis",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Save results to JSON file",
    )
    args = parser.parse_args()

    # Load data
    print("Loading test data...")
    data = load_processed_data()
    test_df = data["test_df"]
    print(f"Test samples: {len(test_df)}")
    print()

    # Load models
    print("Loading models...")
    all_models = load_models()
    if "all" in args.models:
        models_to_eval = all_models
    else:
        models_to_eval = {k: v for k, v in all_models.items() if k in args.models}
        # Tell the user when an explicitly requested model was not loaded
        # instead of dropping it silently.
        missing = [name for name in args.models if name not in all_models]
        if missing:
            print(f"Requested models not available: {missing}")
    if not models_to_eval:
        print("No models found to evaluate!")
        return
    print(f"\nEvaluating: {list(models_to_eval.keys())}")
    print()

    # Prepare test data dict for evaluation
    test_data = {
        "X": test_df[TASTE_FEATURES].values,
        "metadata": test_df,
    }

    # Evaluate each model
    results = {}
    for name, model in models_to_eval.items():
        print(f"\n{'=' * 40}")
        print(f"Evaluating: {name.upper()}")
        print("=" * 40)
        metrics = evaluate_model(
            model=model,
            test_data=test_data,
            k_values=K_VALUES,
        )
        results[name] = metrics
        print(f"\nResults for {name}:")
        for metric, value in metrics.items():
            if isinstance(value, dict):
                # Drop a trailing "@k" so the label reads "precision@5"
                # rather than "precision@k@5", matching compare_models().
                label = metric.replace("@k", "")
                for k, v in value.items():
                    print(f"  {label}@{k}: {v:.4f}")
            elif isinstance(value, float):
                print(f"  {metric}: {value:.4f}")
            else:
                print(f"  {metric}: {value}")

        # Error analysis
        if args.error_analysis:
            print(f"\nError Analysis for {name}:")
            report = generate_error_report(
                model=model,
                test_data=test_data,
            )
            print(f"  Error rate: {report.error_rate:.1%}")
            print(f"  Total errors: {report.total_errors}/{report.total_queries}")
            print("\n  Worst errors:")
            for i, err in enumerate(report.worst_errors[:5], 1):
                print(f"    {i}. Query {err.query_idx}: magnitude={err.error_magnitude:.3f}")
                if "_root_cause" in err.query_metadata:
                    print(f"       Root cause: {err.query_metadata['_root_cause']}")
            print("\n  Patterns:")
            for pattern in report.patterns[:3]:
                print(f"    - {pattern.description} (freq: {pattern.frequency})")
            print("\n  Mitigations:")
            for mitigation in report.mitigations[:3]:
                # Only add an ellipsis when the text was actually cut.
                if len(mitigation) > 80:
                    mitigation = f"{mitigation[:80]}..."
                print(f"    - {mitigation}")

    # Compare models
    if len(results) > 1:
        compare_models(results)

    # Save results
    if args.output:
        output_path = Path(args.output)
        # Create the target directory so a fresh output location does not
        # fail after the whole evaluation has already run.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Convert results to JSON-serializable format (JSON object keys must
        # be strings, so the integer k values are stringified).
        json_results = {}
        for model_name, metrics in results.items():
            json_results[model_name] = {}
            for key, value in metrics.items():
                if isinstance(value, dict):
                    json_results[model_name][key] = {str(k): v for k, v in value.items()}
                else:
                    json_results[model_name][key] = value
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_results, f, indent=2)
        print(f"\nResults saved to {output_path}")
# Run the evaluation CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

src/brewmatch/evaluation/__init__.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""Evaluation metrics and analysis modules."""
+from .metrics import (
+    precision_at_k,
+    recall_at_k,
+    ndcg_at_k,
+    mean_squared_error,
+    mean_absolute_error,
+    evaluate_model,
+)
+from .error_analysis import (
+    analyze_errors,
+    identify_error_patterns,
+    generate_error_report,
+    PredictionError,
+    ErrorPattern,
+    ErrorReport,
+)
# Public API of the evaluation package: the metric functions imported from
# .metrics, followed by the error-analysis functions and result types
# imported from .error_analysis.
__all__ = [
    "precision_at_k",
    "recall_at_k",
    "ndcg_at_k",
    "mean_squared_error",
    "mean_absolute_error",
    "evaluate_model",
    "analyze_errors",
    "identify_error_patterns",
    "generate_error_report",
    "PredictionError",
    "ErrorPattern",
    "ErrorReport",
]

src/brewmatch/evaluation/error_analysis.py ADDED Viewed

	@@ -0,0 +1,586 @@

+"""Error analysis module for the coffee recommendation system.
+This module provides tools for analyzing model errors:
+- Finding worst predictions
+- Identifying error patterns by origin, processing method, etc.
+- Generating comprehensive error reports with mitigation strategies
+"""
+from collections import Counter
+from dataclasses import dataclass, field
+from typing import Any, Protocol, runtime_checkable
+import numpy as np
@runtime_checkable
class Recommender(Protocol):
    """Protocol for recommender models used in error analysis.

    Any object exposing a compatible ``recommend`` method satisfies this
    protocol; no inheritance is required. The analysis code in this module
    reads the ``"index"`` key of each returned dict and, for the top result,
    the ``"taste_profile"`` mapping keyed by taste feature name.
    """
    def recommend(self, preferences: np.ndarray, k: int = 5) -> list[dict[str, Any]]:
        """Recommend coffees matching user taste preferences.

        Args:
            preferences: Taste profile to match against.
            k: Number of recommendations requested.

        Returns:
            Recommendation dicts; the callers in this module treat the first
            element as the top recommendation.
        """
        ...
@dataclass
class PredictionError:
    """Represents a single misprediction for analysis.

    Instances are created by ``analyze_errors`` for queries whose top
    recommendation is not in the query's relevant set.

    Attributes:
        query_idx: Index of the query coffee in the test set.
        query_preferences: The query taste profile used.
        query_metadata: Metadata of the query coffee.
        recommended_idx: Index of the top recommended coffee.
        recommended_metadata: Metadata of the recommended coffee.
        recommended_profile: Taste profile of the recommended coffee.
        expected_indices: Set of indices that would have been correct.
        error_magnitude: Quantified error. ``analyze_errors`` stores the
            Euclidean distance between the query profile and the recommended
            profile here.
        rank_of_first_relevant: 1-indexed position of the first relevant item
            in the recommendation list, or None when no relevant item was
            among the recommendations examined.
    """
    query_idx: int
    query_preferences: np.ndarray
    query_metadata: dict[str, Any]
    recommended_idx: int
    recommended_metadata: dict[str, Any]
    recommended_profile: np.ndarray
    expected_indices: set[int]
    error_magnitude: float
    rank_of_first_relevant: int | None = None
@dataclass
class ErrorPattern:
    """Represents a common error pattern identified in the analysis.

    Attributes:
        pattern_type: Category of the pattern. ``identify_error_patterns``
            emits 'origin', 'processing', 'cross_origin_confusion',
            'taste_profile' and 'ranking'.
        description: Human-readable description of the pattern.
        frequency: Number of errors exhibiting this pattern.
        affected_queries: List of query indices affected by this pattern.
        severity: Average error magnitude for this pattern.
    """
    pattern_type: str
    description: str
    frequency: int
    # default_factory gives every instance its own list.
    affected_queries: list[int] = field(default_factory=list)
    severity: float = 0.0
@dataclass
class ErrorReport:
    """Comprehensive error analysis report.

    Built by ``generate_error_report``.

    Attributes:
        total_queries: Number of queries evaluated. ``generate_error_report``
            counts only queries that have at least one relevant item.
        total_errors: Number of queries where top recommendation was incorrect.
        error_rate: Proportion of queries with errors
            (``total_errors / total_queries``, or 0.0 when there are no queries).
        worst_errors: List of the worst prediction errors.
        patterns: Identified error patterns.
        mitigations: Suggested mitigation strategies.
    """
    total_queries: int
    total_errors: int
    error_rate: float
    worst_errors: list[PredictionError]
    patterns: list[ErrorPattern]
    mitigations: list[str]
def _compute_euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return the straight-line (L2) distance between arrays ``a`` and ``b``."""
    delta = a - b
    squared_total = np.sum(np.square(delta))
    return float(np.sqrt(squared_total))
def _compute_similarity(profile_a: np.ndarray, profile_b: np.ndarray) -> float:
    """Return the cosine similarity of two taste profiles.

    A zero-length vector has no direction, so the similarity is defined as
    0.0 whenever either profile has zero norm.
    """
    lengths = (np.linalg.norm(profile_a), np.linalg.norm(profile_b))
    if not all(lengths):
        return 0.0
    cosine = np.dot(profile_a, profile_b) / (lengths[0] * lengths[1])
    return float(cosine)
def _find_relevant_items(
    query_metadata: dict[str, Any],
    query_profile: np.ndarray,
    all_metadata: list[dict[str, Any]],
    all_profiles: np.ndarray,
    query_idx: int,
    similarity_threshold: float = 0.95,
) -> set[int]:
    """Collect the indices of coffees considered relevant to a query coffee.

    A candidate counts as relevant when either:
    - it has the same (non-empty) country AND processing method as the query, or
    - its taste profile's cosine similarity to the query is >= the threshold.

    Args:
        query_metadata: Metadata dict for the query coffee.
        query_profile: Taste profile array for the query coffee.
        all_metadata: Metadata dicts for all coffees.
        all_profiles: Taste profiles for all coffees, indexed like all_metadata.
        query_idx: Index of the query itself; never included in the result.
        similarity_threshold: Minimum cosine similarity for profile-based relevance.

    Returns:
        Set of indices of relevant items.
    """
    country = query_metadata.get("Country of Origin", "")
    processing = query_metadata.get("Processing Method", "")
    matches = set()
    for idx, candidate in enumerate(all_metadata):
        if idx == query_idx:
            continue
        country_match = candidate.get("Country of Origin", "") == country
        processing_match = candidate.get("Processing Method", "") == processing
        # Metadata only counts when the query actually has both fields set.
        if country_match and processing_match and country and processing:
            matches.add(idx)
        elif _compute_similarity(query_profile, all_profiles[idx]) >= similarity_threshold:
            matches.add(idx)
    return matches
def analyze_errors(
    model: Recommender,
    test_data: dict[str, Any],
    n_errors: int = 5,
) -> list[PredictionError]:
    """Return the model's worst top-1 mispredictions on the test data.

    Every test coffee is used as a query. A query is an error when its top
    recommendation is not in the query's relevant set; the error magnitude is
    the Euclidean distance between the query profile and the recommended
    coffee's taste profile. Queries without any relevant item, or for which
    the model returns nothing, are skipped.

    Args:
        model: A fitted recommender model with a recommend() method.
        test_data: Dictionary containing:
            - 'X': Feature matrix of taste profiles, one row per coffee.
            - 'metadata': List of metadata dicts or DataFrame.
        n_errors: Number of worst errors to return.

    Returns:
        Up to n_errors PredictionError objects, largest magnitude first.

    Example:
        >>> errors = analyze_errors(model, test_data, n_errors=5)
        >>> for err in errors:
        ...     print(f"Query {err.query_idx}: magnitude={err.error_magnitude:.3f}")
    """
    profiles = np.asarray(test_data["X"], dtype=np.float32)
    raw_metadata = test_data["metadata"]
    # A DataFrame is converted to one dict per row; anything else is listed.
    if hasattr(raw_metadata, "to_dict"):
        records = raw_metadata.to_dict("records")
    else:
        records = list(raw_metadata)

    feature_order = [
        "Aroma", "Flavor", "Aftertaste", "Acidity", "Body",
        "Balance", "Uniformity", "Clean Cup", "Sweetness"
    ]
    mistakes: list[PredictionError] = []

    for query_idx in range(len(profiles)):
        query_profile = profiles[query_idx]
        query_metadata = records[query_idx]
        relevant = _find_relevant_items(
            query_metadata=query_metadata,
            query_profile=query_profile,
            all_metadata=records,
            all_profiles=profiles,
            query_idx=query_idx,
        )
        if not relevant:
            continue

        recommendations = model.recommend(query_profile, k=max(10, n_errors))
        ranked_indices = [rec["index"] for rec in recommendations]
        if not recommendations:
            continue

        best = recommendations[0]
        best_idx = best["index"]
        if best_idx in relevant:
            # Top recommendation is correct: not an error.
            continue

        # Rebuild the recommended profile in the canonical feature order.
        best_tastes = best["taste_profile"]
        rec_profile = np.array(
            [best_tastes[name] for name in feature_order],
            dtype=np.float32
        )

        # 1-indexed rank of the first relevant recommendation, if any.
        first_hit = next(
            (
                position
                for position, idx in enumerate(ranked_indices, start=1)
                if idx in relevant
            ),
            None,
        )

        mistakes.append(
            PredictionError(
                query_idx=query_idx,
                query_preferences=query_profile.copy(),
                query_metadata=query_metadata,
                recommended_idx=best_idx,
                recommended_metadata=records[best_idx],
                recommended_profile=rec_profile,
                expected_indices=relevant,
                error_magnitude=_compute_euclidean_distance(query_profile, rec_profile),
                rank_of_first_relevant=first_hit,
            )
        )

    # Worst (largest magnitude) first, then keep the requested number.
    ordered = sorted(mistakes, key=lambda e: e.error_magnitude, reverse=True)
    return ordered[:n_errors]
def _group_errors_by(
    errors: list[PredictionError],
    key_fn: Any,
) -> dict[Any, list[PredictionError]]:
    """Bucket errors by ``key_fn(error)``, preserving first-seen key order."""
    groups: dict[Any, list[PredictionError]] = {}
    for err in errors:
        groups.setdefault(key_fn(err), []).append(err)
    return groups


def _patterns_from_groups(
    groups: dict[Any, list[PredictionError]],
    pattern_type: str,
    describe: Any,
    limit: int | None = None,
) -> list[ErrorPattern]:
    """Turn each group with at least two errors into an ErrorPattern.

    Groups are ranked by size (largest first, ties in first-seen order) and
    optionally cut to the ``limit`` largest before the size filter is applied.
    ``describe`` maps a group key to the pattern description.
    """
    ranked = sorted(groups.items(), key=lambda item: len(item[1]), reverse=True)
    if limit is not None:
        ranked = ranked[:limit]
    return [
        ErrorPattern(
            pattern_type=pattern_type,
            description=describe(key),
            frequency=len(members),
            affected_queries=[err.query_idx for err in members],
            severity=float(np.mean([err.error_magnitude for err in members])),
        )
        for key, members in ranked
        if len(members) >= 2  # Only report patterns with multiple occurrences
    ]


def _pattern_from_subset(
    subset: list[PredictionError],
    pattern_type: str,
    description: str,
) -> list[ErrorPattern]:
    """Return a one-element pattern list for ``subset``, or [] if it has < 2 errors."""
    if len(subset) < 2:
        return []
    return [
        ErrorPattern(
            pattern_type=pattern_type,
            description=description,
            frequency=len(subset),
            affected_queries=[err.query_idx for err in subset],
            severity=float(np.mean([err.error_magnitude for err in subset])),
        )
    ]


def identify_error_patterns(errors: list[PredictionError]) -> list[ErrorPattern]:
    """Analyze a list of errors to identify common failure patterns.

    Looks for patterns such as:
    - Failures on specific origins (countries)
    - Failures on specific processing methods
    - Confusion between specific pairs of origins
    - Failures on certain taste profile characteristics
    - Relevant items appearing only far down the ranking

    A pattern is reported only when at least two errors exhibit it.

    Args:
        errors: List of PredictionError objects to analyze.

    Returns:
        List of ErrorPattern objects describing common failure modes,
        sorted by frequency (descending).

    Example:
        >>> errors = analyze_errors(model, test_data, n_errors=20)
        >>> patterns = identify_error_patterns(errors)
        >>> for p in patterns:
        ...     print(f"{p.pattern_type}: {p.description} ({p.frequency} occurrences)")
    """
    if not errors:
        return []

    def query_origin(err: PredictionError) -> Any:
        return err.query_metadata.get("Country of Origin", "Unknown")

    def recommended_origin(err: PredictionError) -> Any:
        return err.recommended_metadata.get("Country of Origin", "Unknown")

    patterns: list[ErrorPattern] = []

    # Pattern 1: Failures by query origin
    patterns.extend(_patterns_from_groups(
        _group_errors_by(errors, query_origin),
        pattern_type="origin",
        describe=lambda origin: f"Model fails frequently on coffees from {origin}",
    ))

    # Pattern 2: Failures by query processing method
    patterns.extend(_patterns_from_groups(
        _group_errors_by(
            errors,
            lambda err: err.query_metadata.get("Processing Method", "Unknown"),
        ),
        pattern_type="processing",
        describe=lambda method: f"Model fails frequently on {method} processed coffees",
    ))

    # Pattern 3: Cross-origin confusion (only the five most frequent pairs
    # are considered)
    cross_origin = [err for err in errors if query_origin(err) != recommended_origin(err)]
    patterns.extend(_patterns_from_groups(
        _group_errors_by(
            cross_origin,
            lambda err: (query_origin(err), recommended_origin(err)),
        ),
        pattern_type="cross_origin_confusion",
        describe=lambda pair: f"Model confuses {pair[0]} with {pair[1]}",
        limit=5,
    ))

    # Pattern 4: High acidity/low body confusion (taste profile patterns)
    acidity_idx = 3  # Index of Acidity in taste features
    body_idx = 4  # Index of Body in taste features
    # High acidity: above 7.5 on typical 6-10 scale
    high_acidity = [err for err in errors if err.query_preferences[acidity_idx] > 7.5]
    # Low body: below 7.0
    low_body = [err for err in errors if err.query_preferences[body_idx] < 7.0]
    patterns.extend(_pattern_from_subset(
        high_acidity,
        pattern_type="taste_profile",
        description="Model struggles with high-acidity coffee recommendations",
    ))
    patterns.extend(_pattern_from_subset(
        low_body,
        pattern_type="taste_profile",
        description="Model struggles with low-body coffee recommendations",
    ))

    # Pattern 5: Rank degradation (first relevant item is far down)
    severe_rank = [
        err for err in errors
        if err.rank_of_first_relevant is not None and err.rank_of_first_relevant > 5
    ]
    if len(severe_rank) >= 2:
        avg_rank = np.mean([err.rank_of_first_relevant for err in severe_rank])
        patterns.extend(_pattern_from_subset(
            severe_rank,
            pattern_type="ranking",
            description=f"First relevant item ranked very low (avg rank: {avg_rank:.1f})",
        ))

    # Sort patterns by frequency; the sort is stable, so equally frequent
    # patterns keep the order in which they were detected above.
    patterns.sort(key=lambda p: p.frequency, reverse=True)
    return patterns
def _generate_root_cause(error: PredictionError) -> str:
    """Build a short, human-readable explanation for one misprediction.

    Reports origin and processing-method mismatches between the query and the
    recommended coffee, plus up to three taste features whose absolute
    difference exceeds 0.5. Falls back to a generic message when none apply.
    """
    query_meta = error.query_metadata
    rec_meta = error.recommended_metadata
    findings = []

    # Metadata mismatches, origin first and then processing method.
    for label, field_name in (
        ("Origin", "Country of Origin"),
        ("Processing", "Processing Method"),
    ):
        queried = query_meta.get(field_name, "Unknown")
        recommended = rec_meta.get(field_name, "Unknown")
        if queried != recommended:
            findings.append(
                f"{label} mismatch: queried {queried}, recommended {recommended}"
            )

    # Taste features that differ by more than 0.5 points.
    feature_names = [
        "Aroma", "Flavor", "Aftertaste", "Acidity", "Body",
        "Balance", "Uniformity", "Clean Cup", "Sweetness"
    ]
    flagged = [
        f"{name} (diff: {gap:.2f})"
        for position, name in enumerate(feature_names)
        if (
            gap := abs(
                error.query_preferences[position] - error.recommended_profile[position]
            )
        ) > 0.5
    ]
    if flagged:
        findings.append(f"Large taste deviations: {', '.join(flagged[:3])}")

    return "; ".join(findings or ["Minor deviations across multiple dimensions"])
def _generate_mitigations(patterns: list[ErrorPattern]) -> list[str]:
    """Map the detected pattern types to suggested mitigation strategies.

    One suggestion is emitted per pattern type present, in a fixed order,
    followed by a general suggestion when the highest pattern severity
    exceeds 2.0. If nothing applies, a single fallback suggestion is returned.
    """
    # (pattern type, advice) pairs in the order the advice is reported.
    advice_by_type = (
        (
            "origin",
            "Consider adding origin-aware features or embeddings to better "
            "capture regional flavor characteristics.",
        ),
        (
            "processing",
            "Include processing method as an explicit feature or learn "
            "processing-specific taste profile transformations.",
        ),
        (
            "cross_origin_confusion",
            "Add contrastive learning or negative sampling to better distinguish "
            "coffees from commonly confused origins.",
        ),
        (
            "taste_profile",
            "Review feature scaling and consider non-linear transformations "
            "for extreme taste profile values (high acidity, low body).",
        ),
        (
            "ranking",
            "Incorporate a re-ranking stage or listwise learning objective "
            "to improve early-rank precision.",
        ),
    )
    seen_types = {p.pattern_type for p in patterns}
    suggestions = [advice for kind, advice in advice_by_type if kind in seen_types]

    # General mitigations based on error severity
    peak_severity = max((p.severity for p in patterns), default=None)
    if peak_severity is not None and peak_severity > 2.0:
        suggestions.append(
            "High severity errors suggest the model may benefit from ensemble "
            "methods or calibration techniques."
        )

    if suggestions:
        return suggestions
    return [
        "No strong error patterns detected. Consider increasing training "
        "data or fine-tuning hyperparameters."
    ]
def generate_error_report(
    model: Recommender,
    test_data: dict[str, Any],
) -> ErrorReport:
    """Generate a comprehensive error analysis report.

    Analyzes the model's predictions on test data to identify:
    - The 5 worst mispredictions with root cause analysis
    - Common failure patterns across errors
    - Proposed mitigation strategies

    The root cause of each worst error is stored under the ``"_root_cause"``
    key of that error's ``query_metadata``. The key is written to a copy of
    the metadata dict, so the metadata passed in by the caller is not modified.

    Args:
        model: A fitted recommender model with a recommend() method.
        test_data: Dictionary containing:
            - 'X': Feature matrix of taste profiles, one row per coffee.
            - 'metadata': List of metadata dicts or DataFrame.

    Returns:
        ErrorReport containing detailed analysis and recommendations.

    Example:
        >>> report = generate_error_report(model, test_data)
        >>> print(f"Error rate: {report.error_rate:.1%}")
        >>> for pattern in report.patterns:
        ...     print(f"- {pattern.description}")
        >>> for mitigation in report.mitigations:
        ...     print(f"* {mitigation}")
    """
    X = np.asarray(test_data["X"], dtype=np.float32)
    metadata_raw = test_data["metadata"]
    if hasattr(metadata_raw, "to_dict"):
        all_metadata = metadata_raw.to_dict("records")
    else:
        all_metadata = list(metadata_raw)

    # Get worst errors (more than 5 for pattern analysis)
    all_errors = analyze_errors(model, test_data, n_errors=50)
    worst_5_errors = all_errors[:5]

    # Count total errors (queries where top recommendation is not relevant).
    # Queries with no relevant items cannot be judged and are left out of
    # both the numerator and the denominator.
    total_errors = 0
    total_valid_queries = 0
    for query_idx, query_profile in enumerate(X):
        relevant = _find_relevant_items(
            query_metadata=all_metadata[query_idx],
            query_profile=query_profile,
            all_metadata=all_metadata,
            all_profiles=X,
            query_idx=query_idx,
        )
        if not relevant:
            continue
        total_valid_queries += 1
        recommendations = model.recommend(query_profile, k=1)
        if recommendations and recommendations[0]["index"] not in relevant:
            total_errors += 1

    # Identify patterns from all errors
    patterns = identify_error_patterns(all_errors)

    # Generate mitigations
    mitigations = _generate_mitigations(patterns)

    # Add root cause to worst errors (stored in metadata for reporting).
    # When metadata is supplied as a list of dicts, err.query_metadata is the
    # caller's own dict, so annotate a copy rather than mutating it in place.
    for err in worst_5_errors:
        err.query_metadata = {
            **err.query_metadata,
            "_root_cause": _generate_root_cause(err),
        }

    error_rate = total_errors / total_valid_queries if total_valid_queries > 0 else 0.0
    return ErrorReport(
        total_queries=total_valid_queries,
        total_errors=total_errors,
        error_rate=error_rate,
        worst_errors=worst_5_errors,
        patterns=patterns,
        mitigations=mitigations,
    )

src/brewmatch/evaluation/metrics.py ADDED Viewed

	@@ -0,0 +1,401 @@

+"""Evaluation metrics for the coffee recommendation system.
+This module provides metrics for evaluating recommendation quality:
+- Ranking metrics: Precision@K, Recall@K, NDCG@K
+- Regression metrics: MSE, MAE for quality prediction
+- Comprehensive evaluation combining all metrics
+"""
+from typing import Any, Protocol, runtime_checkable
+import numpy as np
+from numpy.typing import ArrayLike
@runtime_checkable
class Recommender(Protocol):
    """Protocol for recommender models used in evaluation.

    Any object exposing a compatible ``recommend`` method satisfies this
    protocol; no inheritance is required.
    """
    def recommend(self, preferences: np.ndarray, k: int = 5) -> list[dict[str, Any]]:
        """Recommend coffees matching user taste preferences.

        Args:
            preferences: Taste profile to match against.
            k: Number of recommendations requested.

        Returns:
            A list of recommendation dicts.
            NOTE(review): presumably ordered best-first with an "index" key
            per entry - confirm against the concrete models.
        """
        ...
def precision_at_k(
    recommended: list[int],
    relevant: set[int],
    k: int,
) -> float:
    """Compute Precision@K for a ranked recommendation list.

    Precision@K is the number of relevant items among the first K
    recommendations, divided by K (even when fewer than K items were
    recommended).

    Args:
        recommended: Recommended item indices, best first.
        relevant: Indices of the relevant items.
        k: How many top recommendations to inspect.

    Returns:
        Precision@K in the range [0, 1]; 0.0 for an empty recommendation list.

    Raises:
        ValueError: If k is not positive.

    Example:
        >>> precision_at_k([1, 2, 3, 4, 5], {1, 3, 5, 7, 9}, k=5)
        0.6
    """
    if k <= 0:
        raise ValueError(f"k must be positive, got {k}")
    if recommended:
        matched = sum(item in relevant for item in recommended[:k])
        return matched / k
    return 0.0
def recall_at_k(
    recommended: list[int],
    relevant: set[int],
    k: int,
) -> float:
    """Calculate Recall@K for a recommendation list.

    Recall@K measures the proportion of relevant items that appear
    in the top-K recommendations. Each relevant item is counted at most
    once, so a list that repeats an item cannot push the score above 1.

    Args:
        recommended: List of recommended item indices, ordered by rank.
        relevant: Set of relevant item indices.
        k: Number of top recommendations to consider.

    Returns:
        Recall@K score in range [0, 1]. Returns 0.0 if there are no relevant
        items or no recommendations.

    Raises:
        ValueError: If k is not positive.

    Example:
        >>> recommended = [1, 2, 3, 4, 5]
        >>> relevant = {1, 3, 5, 7, 9}
        >>> recall_at_k(recommended, relevant, k=5)
        0.6
    """
    if k <= 0:
        raise ValueError(f"k must be positive, got {k}")
    if not relevant or not recommended:
        return 0.0
    # Deduplicate hits: recall is about how many distinct relevant items
    # were retrieved, not how often they were repeated.
    found = {item for item in recommended[:k] if item in relevant}
    return len(found) / len(relevant)
def ndcg_at_k(
    recommended: list[int],
    relevant: set[int],
    k: int,
) -> float:
    """Calculate Normalized Discounted Cumulative Gain at K.

    NDCG@K measures ranking quality by giving higher scores when
    relevant items appear earlier in the recommendation list.
    Uses binary relevance (1 if relevant, 0 otherwise). Each relevant item
    earns gain only the first time it appears, so a list that repeats an
    item cannot score above 1.

    Args:
        recommended: List of recommended item indices, ordered by rank.
        relevant: Set of relevant item indices.
        k: Number of top recommendations to consider.

    Returns:
        NDCG@K score in range [0, 1] as a plain float. Returns 0.0 if there
        are no relevant items or no recommendations.

    Raises:
        ValueError: If k is not positive.

    Example:
        >>> recommended = [1, 2, 3, 4, 5]
        >>> relevant = {1, 3, 5}
        >>> ndcg_at_k(recommended, relevant, k=5)  # relevant items at ranks 1, 3, 5
        0.885...
    """
    if k <= 0:
        raise ValueError(f"k must be positive, got {k}")
    if not relevant or not recommended:
        return 0.0

    # DCG: sum of relevance / log2(rank + 1) with 1-indexed ranks, i.e.
    # log2(position + 2) for the 0-indexed position used here.
    dcg = 0.0
    credited = set()
    for position, item in enumerate(recommended[:k]):
        if item in relevant and item not in credited:
            credited.add(item)
            dcg += 1.0 / np.log2(position + 2)

    # Ideal DCG: all relevant items ranked first. relevant is non-empty and
    # k >= 1 here, so the ideal list has at least one hit and idcg >= 1.
    ideal_hits = min(k, len(relevant))
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(ideal_hits))
    return float(dcg / idcg)
def mean_squared_error(
    predicted: ArrayLike,
    actual: ArrayLike,
) -> float:
    """Return the mean of the squared differences between two arrays.

    Both inputs are converted to float64 arrays before comparison.

    Args:
        predicted: Predicted values.
        actual: Actual/ground truth values.

    Returns:
        Mean squared error (non-negative).

    Raises:
        ValueError: If the two arrays do not have the same shape.

    Example:
        >>> mean_squared_error([3.0, 4.0, 5.0], [3.5, 4.0, 4.5])
        0.166...
    """
    pred_arr = np.asarray(predicted, dtype=np.float64)
    true_arr = np.asarray(actual, dtype=np.float64)
    if pred_arr.shape == true_arr.shape:
        residual = pred_arr - true_arr
        return float(np.mean(np.square(residual)))
    raise ValueError(
        f"Shape mismatch: predicted {pred_arr.shape} vs actual {true_arr.shape}"
    )
def mean_absolute_error(
    predicted: ArrayLike,
    actual: ArrayLike,
) -> float:
    """Return the mean of the absolute differences between two arrays.

    Both inputs are converted to float64 arrays before comparison.

    Args:
        predicted: Predicted values.
        actual: Actual/ground truth values.

    Returns:
        Mean absolute error (non-negative).

    Raises:
        ValueError: If the two arrays do not have the same shape.

    Example:
        >>> mean_absolute_error([3.0, 4.0, 5.0], [3.5, 4.0, 4.5])
        0.333...
    """
    pred_arr = np.asarray(predicted, dtype=np.float64)
    true_arr = np.asarray(actual, dtype=np.float64)
    if pred_arr.shape == true_arr.shape:
        return float(np.mean(np.abs(pred_arr - true_arr)))
    raise ValueError(
        f"Shape mismatch: predicted {pred_arr.shape} vs actual {true_arr.shape}"
    )
+def _compute_similarity(profile_a: np.ndarray, profile_b: np.ndarray) -> float:
+    """Compute cosine similarity between two taste profiles."""
+    norm_a = np.linalg.norm(profile_a)
+    norm_b = np.linalg.norm(profile_b)
+    if norm_a == 0 or norm_b == 0:
+        return 0.0
+    return float(np.dot(profile_a, profile_b) / (norm_a * norm_b))
+def _find_relevant_items(
+    query_metadata: dict[str, Any],
+    query_profile: np.ndarray,
+    all_metadata: list[dict[str, Any]],
+    all_profiles: np.ndarray,
+    query_idx: int,
+    similarity_threshold: float = 0.95,
+) -> set[int]:
+    """Identify relevant items for a query coffee.
+    An item is considered relevant if:
+    - It shares the same country AND processing method, OR
+    - It has high taste profile similarity (>= threshold)
+    Args:
+        query_metadata: Metadata dict for the query coffee.
+        query_profile: Taste profile array for the query coffee.
+        all_metadata: List of metadata dicts for all coffees.
+        all_profiles: Array of all taste profiles, shape (n_samples, n_features).
+        query_idx: Index of the query item (excluded from relevant set).
+        similarity_threshold: Cosine similarity threshold for profile-based relevance.
+    Returns:
+        Set of indices of relevant items.
+    """
+    relevant = set()
+    query_country = query_metadata.get("Country of Origin", "")
+    query_processing = query_metadata.get("Processing Method", "")
+    for i, meta in enumerate(all_metadata):
+        if i == query_idx:
+            continue
+        # Check metadata-based relevance
+        same_country = meta.get("Country of Origin", "") == query_country
+        same_processing = meta.get("Processing Method", "") == query_processing
+        if same_country and same_processing and query_country and query_processing:
+            relevant.add(i)
+            continue
+        # Check similarity-based relevance
+        similarity = _compute_similarity(query_profile, all_profiles[i])
+        if similarity >= similarity_threshold:
+            relevant.add(i)
+    return relevant
def evaluate_model(
    model: Recommender,
    test_data: dict[str, Any],
    k_values: list[int] | None = None,
) -> dict[str, Any]:
    """Comprehensive evaluation of a recommendation model.

    Each test coffee is used as a query. The items considered relevant for a
    query are the other test coffees with the same country/processing method
    or a highly similar taste profile (see ``_find_relevant_items``). Queries
    without any relevant item are skipped.

    Args:
        model: A fitted recommender model with a recommend() method.
        test_data: Dictionary containing:
            - 'X': Feature matrix of shape (n_samples, 9) with taste profiles.
            - 'metadata': List of metadata dicts or DataFrame.
        k_values: List of K values for ranking metrics. Defaults to [1, 3, 5, 10].

    Returns:
        Dictionary containing:
            - 'precision@k': Dict mapping k to average Precision@K.
            - 'recall@k': Dict mapping k to average Recall@K.
            - 'ndcg@k': Dict mapping k to average NDCG@K.
            - 'mse': Mean squared error between each query profile and the
              taste profile of its top recommendation (NaN if none).
            - 'mae': Mean absolute error over the same pairs (NaN if none).
            - 'n_queries': Number of test queries evaluated.
            - 'avg_relevant_items': Average number of relevant items per query.

    Raises:
        ValueError: If ``k_values`` is empty or 'X' and 'metadata' differ in length.

    Example:
        >>> results = evaluate_model(model, test_data, k_values=[1, 5, 10])
        >>> print(f"Precision@5: {results['precision@k'][5]:.3f}")
        >>> print(f"NDCG@10: {results['ndcg@k'][10]:.3f}")
    """
    if k_values is None:
        k_values = [1, 3, 5, 10]
    if len(k_values) == 0:
        raise ValueError("k_values must contain at least one value")

    # Order in which taste scores are read back from a recommendation's
    # 'taste_profile' dict; built once rather than for every query.
    feature_order = (
        "Aroma", "Flavor", "Aftertaste", "Acidity", "Body",
        "Balance", "Uniformity", "Clean Cup", "Sweetness",
    )

    X = np.asarray(test_data["X"], dtype=np.float32)
    metadata_raw = test_data["metadata"]
    # Convert metadata to list of dicts if it's a DataFrame
    if hasattr(metadata_raw, "to_dict"):
        all_metadata = metadata_raw.to_dict("records")
    else:
        all_metadata = list(metadata_raw)

    n_samples = len(X)
    if len(all_metadata) != n_samples:
        raise ValueError(
            f"'X' has {n_samples} rows but 'metadata' has {len(all_metadata)} entries"
        )
    max_k = max(k_values)

    # Initialize accumulators
    precision_sums = {k: 0.0 for k in k_values}
    recall_sums = {k: 0.0 for k in k_values}
    ndcg_sums = {k: 0.0 for k in k_values}
    total_relevant = 0
    valid_queries = 0
    # For MSE/MAE: collect predicted vs actual taste profiles
    all_predicted_profiles = []
    all_actual_profiles = []

    for query_idx in range(n_samples):
        query_profile = X[query_idx]
        # Find relevant items for this query
        relevant = _find_relevant_items(
            query_metadata=all_metadata[query_idx],
            query_profile=query_profile,
            all_metadata=all_metadata,
            all_profiles=X,
            query_idx=query_idx,
        )
        # Skip queries with no relevant items
        if not relevant:
            continue
        valid_queries += 1
        total_relevant += len(relevant)

        # NOTE(review): `relevant` holds row positions in test_data, while
        # rec["index"] is whatever index space model.recommend() reports.
        # The ranking metrics are only meaningful if both refer to the same
        # items - confirm how callers fit `model` relative to `test_data`.
        recommendations = model.recommend(query_profile, k=max_k)
        recommended_indices = [rec["index"] for rec in recommendations]

        # Calculate ranking metrics for each k
        for k in k_values:
            precision_sums[k] += precision_at_k(recommended_indices, relevant, k)
            recall_sums[k] += recall_at_k(recommended_indices, relevant, k)
            ndcg_sums[k] += ndcg_at_k(recommended_indices, relevant, k)

        # For MSE/MAE: compare top recommendation's profile to query profile
        if recommendations:
            top_profile = recommendations[0]["taste_profile"]
            predicted_profile = np.array(
                [top_profile[f] for f in feature_order],
                dtype=np.float32,
            )
            all_predicted_profiles.append(predicted_profile)
            all_actual_profiles.append(query_profile)

    # Compute averages
    results: dict[str, Any] = {
        "precision@k": {},
        "recall@k": {},
        "ndcg@k": {},
        "n_queries": valid_queries,
        "avg_relevant_items": total_relevant / valid_queries if valid_queries > 0 else 0.0,
    }
    if valid_queries > 0:
        for k in k_values:
            results["precision@k"][k] = precision_sums[k] / valid_queries
            results["recall@k"][k] = recall_sums[k] / valid_queries
            results["ndcg@k"][k] = ndcg_sums[k] / valid_queries
    else:
        for k in k_values:
            results["precision@k"][k] = 0.0
            results["recall@k"][k] = 0.0
            results["ndcg@k"][k] = 0.0

    # Compute MSE and MAE
    if all_predicted_profiles:
        predicted = np.array(all_predicted_profiles)
        actual = np.array(all_actual_profiles)
        results["mse"] = mean_squared_error(predicted.flatten(), actual.flatten())
        results["mae"] = mean_absolute_error(predicted.flatten(), actual.flatten())
    else:
        results["mse"] = float("nan")
        results["mae"] = float("nan")
    return results

src/brewmatch/experiment.py ADDED Viewed

	@@ -0,0 +1,492 @@

+"""
+Focused Experiment: Training Set Size Sensitivity Analysis
+This experiment investigates how model performance varies with training set size.
+We train all three models (baseline, classical, neural) on progressively larger
+subsets of the training data and measure their performance on a held-out test set.
+Hypothesis: Deep learning model will show greater improvement with more data,
+while classical models may plateau earlier.
+"""
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import torch
+from tqdm import tqdm
+from brewmatch.config import (
+    K_VALUES,
+    NEURAL_CONFIG,
+    PROJECT_ROOT,
+    RANDOM_SEED,
+    TASTE_FEATURES,
+)
+from brewmatch.data import load_processed_data
+from brewmatch.models import (
+    NaiveBaselineRecommender,
+    ClassicalMLRecommender,
+    NeuralRecommender,
+)
+from brewmatch.evaluation import evaluate_model
# Experiment configuration
# Default fractions of the training set to subsample (overridable via --fractions).
TRAIN_FRACTIONS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
N_TRIALS = 3  # Number of trials per fraction for variance estimation
# Default output directory (overridable via --output-dir).
RESULTS_DIR = PROJECT_ROOT / "experiments"
def subsample_data(
    df: pd.DataFrame,
    fraction: float,
    seed: int,
) -> pd.DataFrame:
    """Return a random subset of ``df`` rows, drawn without replacement.

    The subset contains ``int(len(df) * fraction)`` rows and gets a fresh
    0..n-1 index. Note that this seeds NumPy's global random state with
    ``seed`` as a side effect.
    """
    np.random.seed(seed)
    total_rows = len(df)
    chosen = np.random.choice(total_rows, int(total_rows * fraction), replace=False)
    subset = df.iloc[chosen]
    return subset.reset_index(drop=True)
def train_and_evaluate_baseline(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> dict[str, Any]:
    """Fit the naive "mean" baseline on ``train_df`` and score it on ``test_df``.

    The taste columns (``TASTE_FEATURES``) are the features; the full frames
    are passed along as metadata. Returns the metrics dict produced by
    ``evaluate_model`` for ``K_VALUES``.
    """
    train_features = train_df[TASTE_FEATURES].values
    recommender = NaiveBaselineRecommender(strategy="mean")
    recommender.fit(train_features, train_df)
    held_out = {"X": test_df[TASTE_FEATURES].values, "metadata": test_df}
    return evaluate_model(model=recommender, test_data=held_out, k_values=K_VALUES)
def train_and_evaluate_classical(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
) -> dict[str, Any]:
    """Fit the KNN classical recommender on ``train_df`` and score it on ``test_df``.

    The model is configured with 50 neighbours and normalisation enabled.
    Returns the metrics dict produced by ``evaluate_model`` for ``K_VALUES``.
    """
    train_features = train_df[TASTE_FEATURES].values
    knn_settings = {"method": "knn", "n_neighbors": 50, "normalize": True}
    recommender = ClassicalMLRecommender(**knn_settings)
    recommender.fit(train_features, train_df)
    held_out = {"X": test_df[TASTE_FEATURES].values, "metadata": test_df}
    return evaluate_model(model=recommender, test_data=held_out, k_values=K_VALUES)
def train_and_evaluate_neural(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    device: str,
    epochs: int = 30,
) -> dict[str, Any]:
    """Train and evaluate neural network model.

    Args:
        train_df: Training rows; the ``TASTE_FEATURES`` columns are the model
            input and the whole frame is passed as metadata.
        test_df: Held-out rows used for evaluation.
        device: Device string forwarded to ``NeuralRecommender``.
        epochs: Number of training epochs. Defaults to 30, a reduced count
            chosen for experiment speed.

    Returns:
        The metrics dict produced by ``evaluate_model`` for ``K_VALUES``.
    """
    X_train = train_df[TASTE_FEATURES].values
    model = NeuralRecommender(
        embedding_dim=NEURAL_CONFIG["embedding_dim"],
        # hidden_dim is optional in the config; fall back to 64 when absent.
        hidden_dim=NEURAL_CONFIG.get("hidden_dim", 64),
        learning_rate=NEURAL_CONFIG["learning_rate"],
        margin=NEURAL_CONFIG["margin"],
        device=device,
    )
    model.fit(
        X=X_train,
        metadata=train_df,
        epochs=epochs,
        batch_size=NEURAL_CONFIG["batch_size"],
        verbose=False,
    )
    test_data = {
        "X": test_df[TASTE_FEATURES].values,
        "metadata": test_df,
    }
    return evaluate_model(
        model=model,
        test_data=test_data,
        k_values=K_VALUES,
    )
def run_experiment(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    device: str,
    fractions: list[float] = TRAIN_FRACTIONS,
    n_trials: int = N_TRIALS,
) -> dict[str, dict[str, list[dict[str, Any]]]]:
    """
    Run the full sensitivity analysis experiment.

    For every fraction and trial the training data is subsampled once and all
    three models are trained and evaluated on that same subsample. The seed
    is ``RANDOM_SEED + trial``, so a given trial number uses the same seed
    for every fraction. A model that raises is reported on stdout and simply
    contributes no entry for that trial (best-effort by design).

    Returns nested dict: {model_name: {fraction: [trial_results]}}
    where ``fraction`` is the string form of the float.
    """
    # One runner per model, all sharing the (train_df, test_df) call shape.
    runners = {
        "baseline": train_and_evaluate_baseline,
        "classical": train_and_evaluate_classical,
        "neural": lambda tr, te: train_and_evaluate_neural(tr, te, device),
    }
    results: dict[str, dict[str, list[dict[str, Any]]]] = {
        name: {str(f): [] for f in fractions} for name in runners
    }
    total_runs = len(fractions) * n_trials * len(runners)
    # The context manager closes the progress bar even if an exception
    # escapes the loop (e.g. from subsample_data or a KeyboardInterrupt).
    with tqdm(total=total_runs, desc="Running experiment") as pbar:
        for fraction in fractions:
            for trial in range(n_trials):
                seed = RANDOM_SEED + trial
                # Subsample training data
                sub_train_df = subsample_data(train_df, fraction, seed)
                for name, runner in runners.items():
                    try:
                        metrics = runner(sub_train_df, test_df)
                    except Exception as e:
                        print(
                            f"{name.capitalize()} failed at fraction {fraction}, "
                            f"trial {trial}: {e}"
                        )
                    else:
                        results[name][str(fraction)].append(metrics)
                    pbar.update(1)
    return results
def aggregate_results(
    results: dict[str, dict[str, list[dict[str, Any]]]]
) -> pd.DataFrame:
    """Aggregate results into a DataFrame with mean and std.

    Args:
        results: Nested dict ``{model_name: {fraction: [trial_metrics]}}``.
            Each trial's metrics may contain numeric scalars and one level of
            nested dicts (e.g. ``"precision@k": {5: 0.4}``), which is
            flattened to ``"precision@5"``.

    Returns:
        One row per (model, fraction) that has at least one trial, with
        columns ``model``, ``fraction`` and ``<metric>_mean`` /
        ``<metric>_std`` (population std, NumPy default) for every metric.
        When no trial succeeded at all, an empty frame that still has the
        ``model`` and ``fraction`` columns is returned so that downstream
        column lookups do not fail.
    """
    # NumPy scalars such as np.float32 are not Python int/float subclasses;
    # accept them explicitly so those metrics are not silently dropped.
    numeric_types = (int, float, np.integer, np.floating)

    rows = []
    for model_name, fraction_results in results.items():
        for fraction, trials in fraction_results.items():
            if not trials:
                continue
            # Flatten nested dicts and collect each metric across trials
            flat_metrics: dict[str, list[float]] = {}
            for trial in trials:
                for key, value in trial.items():
                    if isinstance(value, dict):
                        # Handle nested metrics like precision@k
                        prefix = key.replace("@k", "")
                        for k, v in value.items():
                            flat_metrics.setdefault(f"{prefix}@{k}", []).append(v)
                    elif isinstance(value, numeric_types) and not isinstance(value, bool):
                        flat_metrics.setdefault(key, []).append(value)
            row: dict[str, Any] = {"model": model_name, "fraction": float(fraction)}
            for metric, values in flat_metrics.items():
                row[f"{metric}_mean"] = np.mean(values)
                row[f"{metric}_std"] = np.std(values)
            rows.append(row)
    if not rows:
        return pd.DataFrame(columns=["model", "fraction"])
    return pd.DataFrame(rows)
def _plot_model_curves(
    ax: Any,
    df: pd.DataFrame,
    mean_col: str,
    std_col: str,
    **errorbar_kwargs: Any,
) -> None:
    """Draw one error-bar curve per model for ``mean_col`` onto ``ax``."""
    colors = {"baseline": "#e74c3c", "classical": "#3498db", "neural": "#2ecc71"}
    for model, color in colors.items():
        model_df = df[df["model"] == model].sort_values("fraction")
        if model_df.empty:
            continue
        ax.errorbar(
            model_df["fraction"] * 100,  # Convert to percentage
            model_df[mean_col],
            yerr=model_df.get(std_col),  # None when the std column is missing
            label=model.capitalize(),
            color=color,
            marker="o",
            linewidth=2,
            **errorbar_kwargs,
        )


def plot_results(df: pd.DataFrame, output_dir: Path) -> None:
    """Generate visualization of experiment results.

    Writes ``sensitivity_analysis.png`` (Precision@5, or the first available
    ``*_mean`` metric when Precision@5 is absent) and, when more than one
    ``@k`` metric exists, ``sensitivity_analysis_multi.png`` with up to four
    of them on a 2x2 grid.

    Args:
        df: Aggregated results with ``model``, ``fraction`` and
            ``<metric>_mean`` / ``<metric>_std`` columns.
        output_dir: Directory the PNG files are written to (created if needed).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    all_mean_cols = [c for c in df.columns if c.endswith("_mean")]
    if df.empty or not all_mean_cols:
        # Nothing to draw; avoid a KeyError on the missing metric columns.
        print("No aggregated metrics available; skipping plots")
        return

    # Set style
    sns.set_style("whitegrid")
    plt.rcParams["figure.figsize"] = (12, 8)

    # Main metric is Precision@5, falling back to the first available metric
    mean_col = "precision@5_mean" if "precision@5_mean" in df.columns else all_mean_cols[0]
    metric = mean_col.removesuffix("_mean")
    std_col = f"{metric}_std"

    fig, ax = plt.subplots()
    _plot_model_curves(ax, df, mean_col, std_col, markersize=8, capsize=3)
    ax.set_xlabel("Training Data Size (%)", fontsize=12)
    ax.set_ylabel(f"{metric.replace('@', ' @ ').title()}", fontsize=12)
    ax.set_title("Model Performance vs Training Set Size", fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(output_dir / "sensitivity_analysis.png", dpi=150)
    plt.close(fig)

    # Also create a multi-metric plot
    ranking_cols = [c for c in all_mean_cols if "@" in c]
    if len(ranking_cols) > 1:
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        axes = axes.flatten()
        shown_cols = ranking_cols[:4]
        for ax, col in zip(axes, shown_cols):
            name = col.removesuffix("_mean")
            _plot_model_curves(ax, df, col, f"{name}_std", capsize=2)
            pretty = name.replace("@", " @ ").title()
            ax.set_xlabel("Training Data (%)")
            ax.set_ylabel(pretty)
            ax.set_title(pretty)
            ax.legend(fontsize=9)
            ax.grid(True, alpha=0.3)
        # Hide grid cells that received no metric instead of leaving empty axes.
        for ax in axes[len(shown_cols):]:
            ax.set_visible(False)
        fig.suptitle("Training Set Size Sensitivity Analysis", fontsize=14)
        fig.tight_layout()
        fig.savefig(output_dir / "sensitivity_analysis_multi.png", dpi=150)
        plt.close(fig)
    print(f"Plots saved to {output_dir}")
def generate_report(df: pd.DataFrame, output_dir: Path) -> str:
    """Generate a text report of the experiment results.

    The report is built from the first ``precision*_mean`` column found in
    ``df`` and is also written to ``output_dir / "experiment_report.txt"``.

    Args:
        df: Aggregated results with ``model``, ``fraction`` and metric columns.
        output_dir: Directory for the report file (created if needed).

    Returns:
        The report text.
    """
    has_results = not df.empty
    # .tolist() yields plain Python values so the lists print as
    # "[0.1, 0.2]" regardless of how NumPy reprs its scalars.
    fractions = sorted(df["fraction"].unique().tolist()) if has_results else []
    models = sorted(df["model"].unique().tolist()) if has_results else []

    report = [
        "=" * 60,
        "SENSITIVITY ANALYSIS: TRAINING SET SIZE VS PERFORMANCE",
        "=" * 60,
        "",
        # Summary statistics
        "EXPERIMENT SUMMARY",
        "-" * 40,
        f"Training fractions tested: {fractions}",
        f"Models compared: {models}",
        "",
        # Best performance per model
        "BEST PERFORMANCE PER MODEL",
        "-" * 40,
    ]

    precision_cols = [c for c in df.columns if "precision" in c and "_mean" in c]
    metric_col = precision_cols[0] if precision_cols and has_results else None

    if metric_col is not None:
        for model in ["baseline", "classical", "neural"]:
            model_df = df[df["model"] == model]
            if model_df.empty:
                continue
            best_row = model_df.loc[model_df[metric_col].idxmax()]
            report.append(
                f"{model.capitalize()}: {best_row[metric_col]:.4f} "
                f"at {best_row['fraction']*100:.0f}% training data"
            )
    report.append("")

    # Key findings
    report.append("KEY FINDINGS")
    report.append("-" * 40)
    if metric_col is not None:
        # Check if neural improves more with data
        neural_df = df[df["model"] == "neural"].sort_values("fraction")
        if len(neural_df) >= 2:
            first, last = neural_df.iloc[0], neural_df.iloc[-1]
            # Report the fractions actually tested rather than assuming 10%/100%.
            span = f"{first['fraction'] * 100:.0f}% to {last['fraction'] * 100:.0f}%"
            start_perf = first[metric_col]
            end_perf = last[metric_col]
            if start_perf == 0:
                # Relative improvement is undefined for a zero starting score.
                report.append(
                    f"1. Neural model improvement from {span} data: "
                    "n/a (starting score is 0)"
                )
            else:
                improvement = (end_perf - start_perf) / start_perf * 100
                report.append(
                    f"1. Neural model improvement from {span} data: {improvement:.1f}%"
                )
        # Compare final performance
        final_perfs = df[df["fraction"] == 1.0].set_index("model")[metric_col]
        if len(final_perfs) > 0:
            best_model = final_perfs.idxmax()
            report.append(f"2. Best model at full data: {best_model}")
    # Check for diminishing returns
    report.append("3. Diminishing returns analysis: See sensitivity_analysis.png")
    report.append("")

    # NOTE: the recommendations below are fixed text, not derived from `df`.
    report.append("RECOMMENDATIONS")
    report.append("-" * 40)
    report.append("- If data collection is expensive, 50-70% of data may suffice")
    report.append("- Neural model benefits most from additional data")
    report.append("- Baseline provides a strong floor with minimal data")
    report_text = "\n".join(report)

    # Save report
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "experiment_report.txt", "w", encoding="utf-8") as f:
        f.write(report_text)
    return report_text
def main():
    """Main experiment entry point.

    Parses the command line, runs the sensitivity experiment and writes
    ``raw_results.json``, ``aggregated_results.csv``, the plots and the text
    report into the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Run sensitivity analysis experiment"
    )
    parser.add_argument(
        "--fractions",
        nargs="+",
        type=float,
        default=TRAIN_FRACTIONS,
        help="Training set fractions to test",
    )
    parser.add_argument(
        "--trials",
        type=int,
        default=N_TRIALS,
        help="Number of trials per fraction",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device for neural network training",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=str(RESULTS_DIR),
        help="Directory to save results",
    )
    args = parser.parse_args()

    # Reject unusable values up front: a fraction above 1 cannot be sampled
    # without replacement and a non-positive one yields an empty training set.
    if any(not 0.0 < fraction <= 1.0 for fraction in args.fractions):
        parser.error("--fractions values must be in the interval (0, 1]")
    if args.trials < 1:
        parser.error("--trials must be at least 1")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    print("SENSITIVITY ANALYSIS EXPERIMENT")
    print("=" * 40)
    print(f"Training fractions: {args.fractions}")
    print(f"Trials per fraction: {args.trials}")
    print(f"Device: {args.device}")
    print(f"Output directory: {output_dir}")
    print()

    # Load data
    print("Loading data...")
    data = load_processed_data()
    train_df = data["train_df"]
    test_df = data["test_df"]
    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")
    print()

    # Run experiment
    print("Running experiment...")
    results = run_experiment(
        train_df=train_df,
        test_df=test_df,
        device=args.device,
        fractions=args.fractions,
        n_trials=args.trials,
    )

    # Save raw results
    with open(output_dir / "raw_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nRaw results saved to {output_dir / 'raw_results.json'}")

    # Aggregate and analyze
    df = aggregate_results(results)
    df.to_csv(output_dir / "aggregated_results.csv", index=False)
    print(f"Aggregated results saved to {output_dir / 'aggregated_results.csv'}")

    # Generate visualizations
    plot_results(df, output_dir)

    # Generate report
    report = generate_report(df, output_dir)
    print("\n" + report)
    print("\nExperiment complete!")


if __name__ == "__main__":
    main()

src/brewmatch/models/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""ML models for coffee recommendation."""
+from .base import BaseRecommender
+from .baseline import NaiveBaselineRecommender
+from .classical import ClassicalMLRecommender
+from .neural import NeuralRecommender
# Names re-exported by the package: exactly the four classes imported above.
__all__ = [
    "BaseRecommender",
    "NaiveBaselineRecommender",
    "ClassicalMLRecommender",
    "NeuralRecommender",
]

src/brewmatch/models/base.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""Abstract base class for all recommender models."""
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+class BaseRecommender(ABC):
+    """Abstract base class for coffee recommender models.
+    All recommender implementations must inherit from this class and
+    implement the required methods.
+    Attributes:
+        TASTE_FEATURES: The 9 taste feature columns used for recommendations.
+        is_fitted: Whether the model has been fitted to data.
+    """
+    TASTE_FEATURES = [
+        "Aroma",
+        "Flavor",
+        "Aftertaste",
+        "Acidity",
+        "Body",
+        "Balance",
+        "Uniformity",
+        "Clean Cup",
+        "Sweetness",
+    ]
+    def __init__(self) -> None:
+        self.is_fitted = False
+        self._metadata: pd.DataFrame | None = None
+    @abstractmethod
+    def fit(self, X: np.ndarray, metadata: pd.DataFrame) -> "BaseRecommender":
+        """Fit the recommender to coffee taste profiles.
+        Args:
+            X: Feature matrix of shape (n_samples, 9) containing taste scores.
+               Columns correspond to TASTE_FEATURES in order.
+            metadata: DataFrame containing coffee metadata (country, processing
+                method, variety, etc.). Must have same number of rows as X.
+        Returns:
+            self: The fitted recommender instance.
+        """
+        pass
+    @abstractmethod
+    def recommend(
+        self, preferences: np.ndarray, k: int = 5
+    ) -> list[dict[str, Any]]:
+        """Recommend coffees matching user taste preferences.
+        Args:
+            preferences: Array of shape (9,) containing desired taste scores.
+                Values correspond to TASTE_FEATURES in order.
+            k: Number of recommendations to return.
+        Returns:
+            List of k recommendation dictionaries, each containing:
+                - 'index': Original index in the training data
+                - 'score': Similarity/relevance score (higher is better)
+                - 'metadata': Dict of coffee metadata
+                - 'taste_profile': Dict of the coffee's taste scores
+        """
+        pass
+    @abstractmethod
+    def save(self, path: str | Path) -> None:
+        """Save the fitted model to disk.
+        Args:
+            path: File path to save the model to.
+        """
+        pass
+    @classmethod
+    @abstractmethod
+    def load(cls, path: str | Path) -> "BaseRecommender":
+        """Load a fitted model from disk.
+        Args:
+            path: File path to load the model from.
+        Returns:
+            The loaded recommender instance.
+        """
+        pass
+    def _validate_fitted(self) -> None:
+        """Raise error if model is not fitted."""
+        if not self.is_fitted:
+            raise RuntimeError(
+                f"{self.__class__.__name__} must be fitted before calling this method. "
+                "Call fit() first."
+            )
+    def _validate_preferences(self, preferences: np.ndarray) -> np.ndarray:
+        """Validate and reshape preference array."""
+        preferences = np.asarray(preferences, dtype=np.float32)
+        if preferences.ndim == 1:
+            if preferences.shape[0] != 9:
+                raise ValueError(
+                    f"Expected 9 taste features, got {preferences.shape[0]}"
+                )
+        elif preferences.ndim == 2:
+            if preferences.shape[1] != 9:
+                raise ValueError(
+                    f"Expected 9 taste features, got {preferences.shape[1]}"
+                )
+            preferences = preferences.squeeze()
+        else:
+            raise ValueError(
+                f"Preferences must be 1D or 2D array, got {preferences.ndim}D"
+            )
+        return preferences
+    def _format_recommendation(
+        self, idx: int, score: float, taste_profile: np.ndarray
+    ) -> dict[str, Any]:
+        """Format a single recommendation as a dictionary."""
+        metadata_dict = {}
+        if self._metadata is not None:
+            row = self._metadata.iloc[idx]
+            metadata_dict = row.to_dict()
+        taste_dict = {
+            feature: float(taste_profile[i])
+            for i, feature in enumerate(self.TASTE_FEATURES)
+        }
+        return {
+            "index": idx,
+            "score": float(score),
+            "metadata": metadata_dict,
+            "taste_profile": taste_dict,
+        }

src/brewmatch/models/baseline.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""Naive baseline recommender for establishing performance floor."""
+import pickle
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+from .base import BaseRecommender
class NaiveBaselineRecommender(BaseRecommender):
    """Naive baseline recommender using global mean or weighted random.

    This establishes a performance floor for comparison with more
    sophisticated approaches. It supports two strategies:
    - 'mean': Recommends coffees closest to the global mean profile,
      ignoring user preferences entirely.
    - 'weighted_random': Randomly samples coffees weighted by their
      Total Cup Points score.

    Attributes:
        strategy: The recommendation strategy ('mean' or 'weighted_random').
    """

    def __init__(self, strategy: str = "mean", seed: int | None = None) -> None:
        """Initialize the baseline recommender.

        Args:
            strategy: Recommendation strategy. One of:
                - 'mean': Recommend coffees closest to global mean profile
                - 'weighted_random': Random sampling weighted by Total Cup Points
            seed: Optional seed for the random generator used by
                'weighted_random'. ``None`` (the default) keeps the sampling
                non-deterministic.

        Raises:
            ValueError: If ``strategy`` is not one of the supported values.
        """
        super().__init__()
        if strategy not in ("mean", "weighted_random"):
            raise ValueError(f"Unknown strategy: {strategy}")
        self.strategy = strategy
        self._X: np.ndarray | None = None
        self._global_mean: np.ndarray | None = None
        self._weights: np.ndarray | None = None
        self._rng = np.random.default_rng(seed)

    def fit(self, X: np.ndarray, metadata: pd.DataFrame) -> "NaiveBaselineRecommender":
        """Fit the baseline recommender.

        Stores the profiles and their global mean. For the 'weighted_random'
        strategy, Total Cup Points are additionally turned into sampling
        weights. Distances to the mean are computed in ``recommend``.

        Args:
            X: Feature matrix of shape (n_samples, 9).
            metadata: DataFrame with coffee metadata. For 'weighted_random',
                must contain 'Total Cup Points' column.

        Returns:
            self: The fitted recommender.

        Raises:
            ValueError: If ``X`` is not 2-D with 9 columns, if ``metadata``
                has a different number of rows, or if 'Total Cup Points' is
                missing for the 'weighted_random' strategy.
        """
        X = np.asarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError(f"Expected a 2D feature matrix, got {X.ndim}D")
        if X.shape[1] != 9:
            raise ValueError(f"Expected 9 features, got {X.shape[1]}")
        if len(metadata) != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but metadata has {len(metadata)}"
            )
        self._X = X
        self._metadata = metadata.copy()
        self._global_mean = X.mean(axis=0)
        if self.strategy == "weighted_random":
            if "Total Cup Points" not in metadata.columns:
                raise ValueError(
                    "metadata must contain 'Total Cup Points' for weighted_random strategy"
                )
            scores = metadata["Total Cup Points"].values.astype(np.float32)
            # Shift so the lowest score maps to 1.0 (all weights positive),
            # then normalize to a probability distribution.
            scores = scores - scores.min() + 1.0
            self._weights = scores / scores.sum()
        self.is_fitted = True
        return self

    def recommend(
        self, preferences: np.ndarray, k: int = 5
    ) -> list[dict[str, Any]]:
        """Generate recommendations.

        For 'mean' strategy, returns coffees closest to the global mean,
        ignoring the provided preferences entirely.
        For 'weighted_random' strategy, returns random coffees sampled
        (without replacement) proportionally to their weights.

        Args:
            preferences: User taste preferences. Validated for shape but
                otherwise ignored by the baseline.
            k: Number of recommendations; capped at the number of fitted coffees.

        Returns:
            List of at most k recommendation dictionaries.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``preferences`` is malformed or ``k`` is negative.
        """
        self._validate_fitted()
        # Only the validation matters here; the values themselves are unused.
        self._validate_preferences(preferences)
        if k < 0:
            # A negative k would otherwise slice from the end of the ranking.
            raise ValueError(f"k must be non-negative, got {k}")
        n_samples = self._X.shape[0]
        k = min(k, n_samples)
        if self.strategy == "mean":
            # Find coffees closest to global mean (ignoring user preferences)
            distances = np.linalg.norm(self._X - self._global_mean, axis=1)
            indices = np.argsort(distances)[:k]
            # Convert distance to similarity score (higher is better)
            scores = 1.0 / (1.0 + distances[indices])
        else:
            # Weighted random sampling
            indices = self._rng.choice(
                n_samples, size=k, replace=False, p=self._weights
            )
            # Use weight as score
            scores = self._weights[indices]
        return [
            # int(idx) keeps NumPy integer types out of the returned dicts.
            self._format_recommendation(int(idx), score, self._X[idx])
            for idx, score in zip(indices, scores)
        ]

    def save(self, path: str | Path) -> None:
        """Save the fitted model to disk using pickle.

        Args:
            path: File path to save the model to.
        """
        self._validate_fitted()
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        state = {
            "strategy": self.strategy,
            "X": self._X,
            "metadata": self._metadata,
            "global_mean": self._global_mean,
            "weights": self._weights,
        }
        with open(path, "wb") as f:
            pickle.dump(state, f)

    @classmethod
    def load(cls, path: str | Path) -> "NaiveBaselineRecommender":
        """Load a fitted model from disk.

        Args:
            path: File path to load the model from.

        Returns:
            The loaded recommender instance.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(path, "rb") as f:
            state = pickle.load(f)
        model = cls(strategy=state["strategy"])
        model._X = state["X"]
        model._metadata = state["metadata"]
        model._global_mean = state["global_mean"]
        model._weights = state["weights"]
        model.is_fitted = True
        return model

src/brewmatch/models/classical.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""Classical ML recommender using KNN and cosine similarity."""
+import pickle
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import StandardScaler
+from .base import BaseRecommender
+class ClassicalMLRecommender(BaseRecommender):
+    """Classical ML recommender using KNN or cosine similarity.
+    Finds coffees with taste profiles most similar to user preferences
+    using either KNN with Euclidean distance or cosine similarity ranking.
+    Attributes:
+        method: Similarity method ('knn' or 'cosine').
+        n_neighbors: Maximum neighbors for KNN (used for internal index).
+        normalize: Whether to standardize features before similarity computation.
+    """
+    def __init__(
+        self,
+        method: str = "knn",
+        n_neighbors: int = 50,
+        normalize: bool = True,
+    ) -> None:
+        """Initialize the classical ML recommender.
+        Args:
+            method: Similarity method. One of:
+                - 'knn': K-nearest neighbors with Euclidean distance
+                - 'cosine': Cosine similarity ranking
+            n_neighbors: Maximum neighbors to index for KNN. Actual k in
+                recommend() can be smaller.
+            normalize: Whether to standardize features (recommended for
+                Euclidean distance).
+        """
+        super().__init__()
+        if method not in ("knn", "cosine"):
+            raise ValueError(f"Unknown method: {method}")
+        self.method = method
+        self.n_neighbors = n_neighbors
+        self.normalize = normalize
+        self._X: np.ndarray | None = None
+        self._X_normalized: np.ndarray | None = None
+        self._scaler: StandardScaler | None = None
+        self._knn: NearestNeighbors | None = None
+    def fit(self, X: np.ndarray, metadata: pd.DataFrame) -> "ClassicalMLRecommender":
+        """Fit the recommender to coffee taste profiles.
+        Args:
+            X: Feature matrix of shape (n_samples, 9).
+            metadata: DataFrame with coffee metadata.
+        Returns:
+            self: The fitted recommender.
+        """
+        X = np.asarray(X, dtype=np.float32)
+        if X.shape[1] != 9:
+            raise ValueError(f"Expected 9 features, got {X.shape[1]}")
+        self._X = X
+        self._metadata = metadata.copy()
+        if self.normalize:
+            self._scaler = StandardScaler()
+            self._X_normalized = self._scaler.fit_transform(X).astype(np.float32)
+        else:
+            self._X_normalized = X
+        if self.method == "knn":
+            # Build KNN index
+            n_neighbors = min(self.n_neighbors, X.shape[0])
+            self._knn = NearestNeighbors(
+                n_neighbors=n_neighbors,
+                metric="euclidean",
+                algorithm="auto",
+            )
+            self._knn.fit(self._X_normalized)
+        self.is_fitted = True
+        return self
+    def recommend(
+        self, preferences: np.ndarray, k: int = 5
+    ) -> list[dict[str, Any]]:
+        """Find coffees most similar to user preferences.
+        Args:
+            preferences: User taste preferences of shape (9,).
+            k: Number of recommendations.
+        Returns:
+            List of k recommendation dictionaries.
+        """
+        self._validate_fitted()
+        preferences = self._validate_preferences(preferences)
+        n_samples = self._X.shape[0]
+        k = min(k, n_samples)
+        # Normalize preferences if needed
+        if self.normalize:
+            pref_normalized = self._scaler.transform(
+                preferences.reshape(1, -1)
+            ).astype(np.float32)
+        else:
+            pref_normalized = preferences.reshape(1, -1)
+        if self.method == "knn":
+            indices, scores = self._recommend_knn(pref_normalized, k)
+        else:
+            indices, scores = self._recommend_cosine(pref_normalized, k)
+        recommendations = []
+        for idx, score in zip(indices, scores):
+            rec = self._format_recommendation(idx, score, self._X[idx])
+            recommendations.append(rec)
+        return recommendations
+    def _recommend_knn(
+        self, pref_normalized: np.ndarray, k: int
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Find k nearest neighbors using KNN index."""
+        k = min(k, self._knn.n_neighbors)
+        distances, indices = self._knn.kneighbors(pref_normalized, n_neighbors=k)
+        # Convert distance to similarity (higher is better)
+        # Using inverse distance with offset
+        scores = 1.0 / (1.0 + distances.squeeze())
+        return indices.squeeze(), scores
+    def _recommend_cosine(
+        self, pref_normalized: np.ndarray, k: int
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """Rank all coffees by cosine similarity."""
+        # Compute cosine similarity
+        pref_norm = pref_normalized / (
+            np.linalg.norm(pref_normalized) + 1e-8
+        )
+        X_norms = np.linalg.norm(self._X_normalized, axis=1, keepdims=True) + 1e-8
+        X_normalized_unit = self._X_normalized / X_norms
+        similarities = (X_normalized_unit @ pref_norm.T).squeeze()
+        # Get top k
+        indices = np.argsort(similarities)[::-1][:k]
+        scores = similarities[indices]
+        # Shift to [0, 1] range (cosine similarity is in [-1, 1])
+        scores = (scores + 1.0) / 2.0
+        return indices, scores
+    def save(self, path: str | Path) -> None:
+        """Save the fitted model to disk using pickle.
+        Args:
+            path: File path to save the model to.
+        """
+        self._validate_fitted()
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        state = {
+            "method": self.method,
+            "n_neighbors": self.n_neighbors,
+            "normalize": self.normalize,
+            "X": self._X,
+            "X_normalized": self._X_normalized,
+            "metadata": self._metadata,
+            "scaler": self._scaler,
+            "knn": self._knn,
+        }
+        with open(path, "wb") as f:
+            pickle.dump(state, f)
+    @classmethod
+    def load(cls, path: str | Path) -> "ClassicalMLRecommender":
+        """Load a fitted model from disk.
+        Args:
+            path: File path to load the model from.
+        Returns:
+            The loaded recommender instance.
+        """
+        with open(path, "rb") as f:
+            state = pickle.load(f)
+        model = cls(
+            method=state["method"],
+            n_neighbors=state["n_neighbors"],
+            normalize=state["normalize"],
+        )
+        model._X = state["X"]
+        model._X_normalized = state["X_normalized"]
+        model._metadata = state["metadata"]
+        model._scaler = state["scaler"]
+        model._knn = state["knn"]
+        model.is_fitted = True
+        return model

src/brewmatch/models/neural.py ADDED Viewed

	@@ -0,0 +1,431 @@

+"""Neural network recommender using learned coffee embeddings."""
+from pathlib import Path
+from typing import Any
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+from .base import BaseRecommender
+class TasteEncoder(nn.Module):
+    """Neural network that encodes taste profiles into embeddings.
+    Architecture: MLP with residual connections that maps 9 taste features
+    to a lower-dimensional embedding space.
+    """
+    def __init__(
+        self,
+        input_dim: int = 9,
+        hidden_dim: int = 64,
+        embedding_dim: int = 32,
+        dropout: float = 0.1,
+    ) -> None:
+        super().__init__()
+        self.input_proj = nn.Linear(input_dim, hidden_dim)
+        self.hidden1 = nn.Linear(hidden_dim, hidden_dim)
+        self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
+        self.output_proj = nn.Linear(hidden_dim, embedding_dim)
+        self.norm1 = nn.LayerNorm(hidden_dim)
+        self.norm2 = nn.LayerNorm(hidden_dim)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode taste profiles to embeddings.
+        Args:
+            x: Taste features of shape (batch_size, 9).
+        Returns:
+            Embeddings of shape (batch_size, embedding_dim).
+        """
+        # Project to hidden dimension
+        h = F.gelu(self.input_proj(x))
+        # Residual block 1
+        residual = h
+        h = self.norm1(h)
+        h = F.gelu(self.hidden1(h))
+        h = self.dropout(h)
+        h = h + residual
+        # Residual block 2
+        residual = h
+        h = self.norm2(h)
+        h = F.gelu(self.hidden2(h))
+        h = self.dropout(h)
+        h = h + residual
+        # Project to embedding space
+        embedding = self.output_proj(h)
+        # L2 normalize for cosine similarity
+        embedding = F.normalize(embedding, p=2, dim=-1)
+        return embedding
+class TripletDataset(Dataset):
+    """Dataset that generates triplets for contrastive learning.
+    For each anchor, samples a positive (similar coffee) and negative
+    (dissimilar coffee) based on taste profile distance.
+    """
+    def __init__(
+        self,
+        X: np.ndarray,
+        margin_quantile: float = 0.3,
+    ) -> None:
+        """Initialize triplet dataset.
+        Args:
+            X: Feature matrix of shape (n_samples, 9).
+            margin_quantile: Quantile for positive/negative threshold.
+                Coffees within this distance quantile are positives.
+        """
+        self.X = torch.tensor(X, dtype=torch.float32)
+        self.n_samples = X.shape[0]
+        # Precompute pairwise distances
+        X_tensor = self.X
+        self.distances = torch.cdist(X_tensor, X_tensor, p=2)
+        # Determine threshold for positive/negative
+        flat_distances = self.distances.flatten()
+        self.positive_threshold = torch.quantile(flat_distances, margin_quantile)
+    def __len__(self) -> int:
+        return self.n_samples
+    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Get a triplet (anchor, positive, negative).
+        Args:
+            idx: Anchor index.
+        Returns:
+            Tuple of (anchor, positive, negative) taste profiles.
+        """
+        anchor = self.X[idx]
+        # Get distances from anchor
+        dists = self.distances[idx]
+        # Find positives (close) and negatives (far), excluding self
+        mask = torch.arange(self.n_samples) != idx
+        positive_mask = mask & (dists <= self.positive_threshold)
+        negative_mask = mask & (dists > self.positive_threshold)
+        # Handle edge cases
+        if positive_mask.sum() == 0:
+            # Use closest non-self sample
+            dists_masked = dists.clone()
+            dists_masked[idx] = float("inf")
+            positive_idx = dists_masked.argmin().item()
+        else:
+            positive_indices = torch.where(positive_mask)[0]
+            positive_idx = positive_indices[
+                torch.randint(len(positive_indices), (1,))
+            ].item()
+        if negative_mask.sum() == 0:
+            # Use farthest sample
+            negative_idx = dists.argmax().item()
+        else:
+            negative_indices = torch.where(negative_mask)[0]
+            negative_idx = negative_indices[
+                torch.randint(len(negative_indices), (1,))
+            ].item()
+        return anchor, self.X[positive_idx], self.X[negative_idx]
+class NeuralRecommender(BaseRecommender):
+    """Neural recommender using learned coffee embeddings.
+    Uses contrastive learning with triplet loss to learn embeddings
+    that capture taste similarity. Similar coffees have nearby embeddings.
+    Attributes:
+        embedding_dim: Dimension of learned embeddings.
+        hidden_dim: Hidden layer dimension in encoder.
+        learning_rate: Learning rate for training.
+        margin: Triplet loss margin.
+        device: Torch device (cuda/cpu).
+    """
+    def __init__(
+        self,
+        embedding_dim: int = 32,
+        hidden_dim: int = 64,
+        learning_rate: float = 1e-3,
+        margin: float = 0.5,
+        device: str | None = None,
+    ) -> None:
+        """Initialize the neural recommender.
+        Args:
+            embedding_dim: Embedding dimension.
+            hidden_dim: Hidden layer dimension.
+            learning_rate: Learning rate.
+            margin: Triplet loss margin.
+            device: Torch device. Auto-detects CUDA if not specified.
+        """
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.learning_rate = learning_rate
+        self.margin = margin
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+        self._encoder: TasteEncoder | None = None
+        self._X: np.ndarray | None = None
+        self._embeddings: np.ndarray | None = None
+        self._feature_mean: np.ndarray | None = None
+        self._feature_std: np.ndarray | None = None
+    def fit(
+        self,
+        X: np.ndarray,
+        metadata: pd.DataFrame,
+        epochs: int = 100,
+        batch_size: int = 64,
+        verbose: bool = True,
+    ) -> "NeuralRecommender":
+        """Fit the neural recommender using contrastive learning.
+        Args:
+            X: Feature matrix of shape (n_samples, 9).
+            metadata: DataFrame with coffee metadata.
+            epochs: Number of training epochs.
+            batch_size: Training batch size.
+            verbose: Whether to print training progress.
+        Returns:
+            self: The fitted recommender.
+        """
+        X = np.asarray(X, dtype=np.float32)
+        if X.shape[1] != 9:
+            raise ValueError(f"Expected 9 features, got {X.shape[1]}")
+        self._X = X
+        self._metadata = metadata.copy()
+        # Normalize features
+        self._feature_mean = X.mean(axis=0)
+        self._feature_std = X.std(axis=0) + 1e-8
+        X_normalized = (X - self._feature_mean) / self._feature_std
+        # Create encoder
+        self._encoder = TasteEncoder(
+            input_dim=9,
+            hidden_dim=self.hidden_dim,
+            embedding_dim=self.embedding_dim,
+        ).to(self.device)
+        # Create dataset and dataloader
+        dataset = TripletDataset(X_normalized)
+        dataloader = DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=True,
+            drop_last=False,
+        )
+        # Training
+        optimizer = torch.optim.AdamW(
+            self._encoder.parameters(),
+            lr=self.learning_rate,
+            weight_decay=0.01,
+        )
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer, T_max=epochs
+        )
+        triplet_loss = nn.TripletMarginLoss(margin=self.margin, p=2)
+        self._encoder.train()
+        for epoch in range(epochs):
+            total_loss = 0.0
+            n_batches = 0
+            for anchor, positive, negative in dataloader:
+                anchor = anchor.to(self.device)
+                positive = positive.to(self.device)
+                negative = negative.to(self.device)
+                optimizer.zero_grad()
+                anchor_emb = self._encoder(anchor)
+                positive_emb = self._encoder(positive)
+                negative_emb = self._encoder(negative)
+                loss = triplet_loss(anchor_emb, positive_emb, negative_emb)
+                loss.backward()
+                optimizer.step()
+                total_loss += loss.item()
+                n_batches += 1
+            scheduler.step()
+            if verbose and (epoch + 1) % 10 == 0:
+                avg_loss = total_loss / n_batches
+                print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
+        # Compute embeddings for all coffees
+        self._encoder.eval()
+        with torch.no_grad():
+            X_tensor = torch.tensor(X_normalized, dtype=torch.float32).to(self.device)
+            self._embeddings = self._encoder(X_tensor).cpu().numpy()
+        self.is_fitted = True
+        return self
+    def recommend(
+        self, preferences: np.ndarray, k: int = 5
+    ) -> list[dict[str, Any]]:
+        """Find coffees with embeddings closest to user preferences.
+        Args:
+            preferences: User taste preferences of shape (9,).
+            k: Number of recommendations.
+        Returns:
+            List of k recommendation dictionaries.
+        """
+        self._validate_fitted()
+        preferences = self._validate_preferences(preferences)
+        n_samples = self._X.shape[0]
+        k = min(k, n_samples)
+        # Normalize and encode preferences
+        pref_normalized = (preferences - self._feature_mean) / self._feature_std
+        pref_tensor = torch.tensor(
+            pref_normalized, dtype=torch.float32
+        ).unsqueeze(0).to(self.device)
+        self._encoder.eval()
+        with torch.no_grad():
+            pref_embedding = self._encoder(pref_tensor).cpu().numpy()
+        # Find nearest embeddings using cosine similarity
+        # (embeddings are already L2 normalized)
+        similarities = (self._embeddings @ pref_embedding.T).squeeze()
+        # Get top k
+        indices = np.argsort(similarities)[::-1][:k]
+        scores = similarities[indices]
+        # Shift to [0, 1] range
+        scores = (scores + 1.0) / 2.0
+        recommendations = []
+        for idx, score in zip(indices, scores):
+            rec = self._format_recommendation(idx, score, self._X[idx])
+            recommendations.append(rec)
+        return recommendations
+    def get_embedding(self, preferences: np.ndarray) -> np.ndarray:
+        """Get the embedding for a taste profile.
+        Args:
+            preferences: Taste preferences of shape (9,) or (n, 9).
+        Returns:
+            Embedding(s) of shape (embedding_dim,) or (n, embedding_dim).
+        """
+        self._validate_fitted()
+        preferences = np.asarray(preferences, dtype=np.float32)
+        squeeze_output = False
+        if preferences.ndim == 1:
+            preferences = preferences.reshape(1, -1)
+            squeeze_output = True
+        pref_normalized = (preferences - self._feature_mean) / self._feature_std
+        pref_tensor = torch.tensor(pref_normalized, dtype=torch.float32).to(self.device)
+        self._encoder.eval()
+        with torch.no_grad():
+            embeddings = self._encoder(pref_tensor).cpu().numpy()
+        if squeeze_output:
+            return embeddings.squeeze(0)
+        return embeddings
+    def save(self, path: str | Path) -> None:
+        """Save the fitted model to disk.
+        Args:
+            path: File path to save the model to.
+        """
+        self._validate_fitted()
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        state = {
+            "embedding_dim": self.embedding_dim,
+            "hidden_dim": self.hidden_dim,
+            "learning_rate": self.learning_rate,
+            "margin": self.margin,
+            "encoder_state_dict": self._encoder.state_dict(),
+            "X": self._X,
+            "metadata": self._metadata,
+            "embeddings": self._embeddings,
+            "feature_mean": self._feature_mean,
+            "feature_std": self._feature_std,
+        }
+        torch.save(state, path)
+    @classmethod
+    def load(cls, path: str | Path, device: str | None = None) -> "NeuralRecommender":
+        """Load a fitted model from disk.
+        Args:
+            path: File path to load the model from.
+            device: Torch device. Auto-detects if not specified.
+        Returns:
+            The loaded recommender instance.
+        """
+        state = torch.load(path, map_location="cpu", weights_only=False)
+        model = cls(
+            embedding_dim=state["embedding_dim"],
+            hidden_dim=state["hidden_dim"],
+            learning_rate=state["learning_rate"],
+            margin=state["margin"],
+            device=device,
+        )
+        model._encoder = TasteEncoder(
+            input_dim=9,
+            hidden_dim=state["hidden_dim"],
+            embedding_dim=state["embedding_dim"],
+        ).to(model.device)
+        model._encoder.load_state_dict(state["encoder_state_dict"])
+        model._encoder.eval()
+        model._X = state["X"]
+        model._metadata = state["metadata"]
+        model._embeddings = state["embeddings"]
+        model._feature_mean = state["feature_mean"]
+        model._feature_std = state["feature_std"]
+        model.is_fitted = True
+        return model

src/brewmatch/train.py ADDED Viewed

	@@ -0,0 +1,479 @@

+"""Training script for all BrewMatch models.
+Supports hyperparameter tuning with Optuna:
+- `uv run train` - Train with defaults or previously tuned hyperparameters
+- `uv run train --tune` - Run Optuna tuning, save params, then train
+"""
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import numpy as np
+import optuna
+from optuna.samplers import TPESampler
+from optuna.pruners import MedianPruner
+import pandas as pd
+from brewmatch.config import (
+    CHECKPOINTS_DIR,
+    PROJECT_ROOT,
+    TASTE_FEATURES,
+)
+from brewmatch.data import load_processed_data
+from brewmatch.device import get_device, print_device_info
+from brewmatch.models import (
+    NaiveBaselineRecommender,
+    ClassicalMLRecommender,
+    NeuralRecommender,
+)
+from brewmatch.evaluation import evaluate_model
+# JSON file inside CHECKPOINTS_DIR that load_hyperparameters() reads and
+# save_hyperparameters() writes.
+HYPERPARAMS_FILE = CHECKPOINTS_DIR / "hyperparameters.json"
+# Default neural hyperparameters (used if no tuning has been done).
+# train_neural() passes embedding_dim, hidden_dim, learning_rate and margin
+# to the NeuralRecommender constructor and batch_size/epochs to fit().
+DEFAULT_NEURAL_PARAMS = {
+    "embedding_dim": 32,
+    "hidden_dim": 64,
+    "learning_rate": 0.001,
+    "margin": 0.5,
+    "batch_size": 32,
+    "epochs": 100,
+}
+# Default classical hyperparameters; train_classical() passes all three to
+# the ClassicalMLRecommender constructor.
+DEFAULT_CLASSICAL_PARAMS = {
+    "method": "knn",
+    "n_neighbors": 50,
+    "normalize": True,
+}
+def load_hyperparameters() -> dict[str, Any]:
+    """Load saved hyperparameters if they exist."""
+    if HYPERPARAMS_FILE.exists():
+        with open(HYPERPARAMS_FILE) as f:
+            return json.load(f)
+    return {}
+def save_hyperparameters(params: dict[str, Any]) -> None:
+    """Save hyperparameters for future runs."""
+    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
+    with open(HYPERPARAMS_FILE, "w") as f:
+        json.dump(params, f, indent=2)
+    print(f"Hyperparameters saved to {HYPERPARAMS_FILE}")
+def get_neural_params(saved: dict[str, Any]) -> dict[str, Any]:
+    """Get neural network params (saved or defaults)."""
+    if "neural" in saved:
+        print("Using tuned neural hyperparameters")
+        return {**DEFAULT_NEURAL_PARAMS, **saved["neural"]}
+    print("Using default neural hyperparameters")
+    return DEFAULT_NEURAL_PARAMS.copy()
+def get_classical_params(saved: dict[str, Any]) -> dict[str, Any]:
+    """Get classical ML params (saved or defaults)."""
+    if "classical" in saved:
+        print("Using tuned classical hyperparameters")
+        return {**DEFAULT_CLASSICAL_PARAMS, **saved["classical"]}
+    print("Using default classical hyperparameters")
+    return DEFAULT_CLASSICAL_PARAMS.copy()
+# =============================================================================
+# Training Functions
+# =============================================================================
+def train_baseline(train_df: pd.DataFrame) -> NaiveBaselineRecommender:
+    """Train the naive baseline model."""
+    print("Training Naive Baseline Model...")
+    X_train = train_df[TASTE_FEATURES].values
+    model = NaiveBaselineRecommender(strategy="mean")
+    model.fit(X_train, train_df)
+    print(f"  Strategy: {model.strategy}")
+    print(f"  Coffees indexed: {len(model._X)}")
+    return model
+def train_classical(train_df: pd.DataFrame, params: dict[str, Any]) -> ClassicalMLRecommender:
+    """Train the classical ML model with given hyperparameters."""
+    print("Training Classical ML Model...")
+    print(f"  Params: {params}")
+    X_train = train_df[TASTE_FEATURES].values
+    model = ClassicalMLRecommender(
+        method=params["method"],
+        n_neighbors=params["n_neighbors"],
+        normalize=params["normalize"],
+    )
+    model.fit(X_train, train_df)
+    print(f"  Coffees indexed: {len(model._X)}")
+    return model
+def train_neural(
+    train_df: pd.DataFrame,
+    params: dict[str, Any],
+    device: str,
+) -> NeuralRecommender:
+    """Train the neural network model with given hyperparameters."""
+    print("Training Neural Network Model...")
+    print(f"  Params: {params}")
+    X_train = train_df[TASTE_FEATURES].values
+    model = NeuralRecommender(
+        embedding_dim=params["embedding_dim"],
+        hidden_dim=params["hidden_dim"],
+        learning_rate=params["learning_rate"],
+        margin=params["margin"],
+        device=device,
+    )
+    model.fit(
+        X=X_train,
+        metadata=train_df,
+        epochs=params.get("epochs", 100),
+        batch_size=params["batch_size"],
+        verbose=True,
+    )
+    return model
+def save_models(
+    baseline: NaiveBaselineRecommender | None,
+    classical: ClassicalMLRecommender | None,
+    neural: NeuralRecommender | None,
+    params: dict[str, Any],
+) -> None:
+    """Save all trained models."""
+    print("\nSaving models...")
+    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
+    if baseline:
+        baseline.save(CHECKPOINTS_DIR / "baseline.pkl")
+        print(f"  Baseline: {CHECKPOINTS_DIR / 'baseline.pkl'}")
+    if classical:
+        classical.save(CHECKPOINTS_DIR / "classical.pkl")
+        print(f"  Classical: {CHECKPOINTS_DIR / 'classical.pkl'}")
+    if neural:
+        neural.save(CHECKPOINTS_DIR / "neural.pt")
+        print(f"  Neural: {CHECKPOINTS_DIR / 'neural.pt'}")
+    # Save model metadata
+    model_info = {
+        "models": ["baseline", "classical", "neural"],
+        "taste_features": TASTE_FEATURES,
+        "hyperparameters": params,
+    }
+    with open(CHECKPOINTS_DIR / "model_info.json", "w") as f:
+        json.dump(model_info, f, indent=2)
+# =============================================================================
+# Optuna Hyperparameter Tuning
+# =============================================================================
+def create_cv_splits(
+    df: pd.DataFrame,
+    n_folds: int = 3,
+    seed: int = 42,
+) -> list[tuple[pd.DataFrame, pd.DataFrame]]:
+    """Create cross-validation splits."""
+    np.random.seed(seed)
+    indices = np.random.permutation(len(df))
+    fold_size = len(df) // n_folds
+    splits = []
+    for i in range(n_folds):
+        start = i * fold_size
+        end = start + fold_size if i < n_folds - 1 else len(df)
+        val_idx = indices[start:end]
+        train_idx = np.concatenate([indices[:start], indices[end:]])
+        splits.append((
+            df.iloc[train_idx].reset_index(drop=True),
+            df.iloc[val_idx].reset_index(drop=True),
+        ))
+    return splits
+def tune_neural(
+    train_df: pd.DataFrame,
+    device: str,
+    n_trials: int = 50,
+    n_folds: int = 3,
+) -> dict[str, Any]:
+    """Tune neural network hyperparameters with Optuna."""
+    print(f"\n{'='*60}")
+    print("TUNING NEURAL NETWORK HYPERPARAMETERS")
+    print(f"{'='*60}")
+    print(f"Trials: {n_trials}, CV Folds: {n_folds}")
+    splits = create_cv_splits(train_df, n_folds)
+    def objective(trial: optuna.Trial) -> float:
+        params = {
+            "embedding_dim": trial.suggest_int("embedding_dim", 16, 128, step=16),
+            "hidden_dim": trial.suggest_int("hidden_dim", 32, 256, step=32),
+            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
+            "margin": trial.suggest_float("margin", 0.1, 1.0),
+            "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64, 128]),
+        }
+        scores = []
+        for fold_idx, (fold_train, fold_val) in enumerate(splits):
+            model = NeuralRecommender(
+                embedding_dim=params["embedding_dim"],
+                hidden_dim=params["hidden_dim"],
+                learning_rate=params["learning_rate"],
+                margin=params["margin"],
+                device=device,
+            )
+            model.fit(
+                X=fold_train[TASTE_FEATURES].values,
+                metadata=fold_train,
+                epochs=30,  # Reduced for tuning speed
+                batch_size=params["batch_size"],
+                verbose=False,
+            )
+            metrics = evaluate_model(
+                model,
+                {"X": fold_val[TASTE_FEATURES].values, "metadata": fold_val},
+                k_values=[5],
+            )
+            score = metrics.get("precision@k", {}).get(5, 0.0)
+            scores.append(score)
+            trial.report(np.mean(scores), fold_idx)
+            if trial.should_prune():
+                raise optuna.TrialPruned()
+        return np.mean(scores)
+    study = optuna.create_study(
+        direction="maximize",
+        sampler=TPESampler(seed=42),
+        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1),
+    )
+    # Suppress Optuna logging
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
+    print(f"\nBest Precision@5: {study.best_value:.4f}")
+    print(f"Best params: {study.best_params}")
+    return study.best_params
def tune_classical(
    train_df: pd.DataFrame,
    n_trials: int = 30,
    n_folds: int = 3,
) -> dict[str, Any]:
    """Tune classical ML hyperparameters with Optuna.

    Each trial samples ``method`` ("knn" or "cosine"), ``n_neighbors``
    (5 to 100) and ``normalize`` (True or False), fits a
    ``ClassicalMLRecommender`` on every cross-validation fold and is scored
    by the mean Precision@5 over the folds.

    Args:
        train_df: Training data; the ``TASTE_FEATURES`` columns are used as
            the feature matrix and the frame itself as metadata.
        n_trials: Number of Optuna trials to run.
        n_folds: Number of cross-validation folds.

    Returns:
        The best hyperparameters found (``study.best_params``).
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print("TUNING CLASSICAL ML HYPERPARAMETERS")
    print(banner)
    print(f"Trials: {n_trials}, CV Folds: {n_folds}")

    splits = create_cv_splits(train_df, n_folds)

    def objective(trial: optuna.Trial) -> float:
        params = {
            "method": trial.suggest_categorical("method", ["knn", "cosine"]),
            "n_neighbors": trial.suggest_int("n_neighbors", 5, 100),
            "normalize": trial.suggest_categorical("normalize", [True, False]),
        }
        scores = []
        for fold_train, fold_val in splits:
            model = ClassicalMLRecommender(**params)
            model.fit(fold_train[TASTE_FEATURES].values, fold_train)
            metrics = evaluate_model(
                model,
                {"X": fold_val[TASTE_FEATURES].values, "metadata": fold_val},
                k_values=[5],
            )
            scores.append(metrics.get("precision@k", {}).get(5, 0.0))
        # Return a plain float so the value matches the declared return type.
        return float(np.mean(scores))

    # Lower the log level before the study exists; otherwise Optuna's
    # study-creation message is still emitted at INFO level.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\nBest Precision@5: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")
    return study.best_params
+# =============================================================================
+# Main Entry Point
+# =============================================================================
def main():
    """Run the training CLI: parse options, optionally tune, train and save.

    Steps, in order: select the device, expand ``--models``, load the
    processed train/test frames, load saved hyperparameters (re-tuning and
    saving them when ``--tune`` is given), train each selected model, and
    pass the models plus the hyperparameters in use to ``save_models``.
    """
    rule = "=" * 60

    cli = argparse.ArgumentParser(
        description="Train BrewMatch recommendation models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  uv run train                    # Train with defaults or saved hyperparameters
  uv run train --tune             # Tune hyperparameters, then train
  uv run train --models neural    # Train only neural network
  uv run train --tune --neural-trials 100
        """,
    )
    cli.add_argument(
        "--models",
        nargs="+",
        choices=["baseline", "classical", "neural", "all"],
        default=["all"],
        help="Which models to train (default: all)",
    )
    cli.add_argument(
        "--tune",
        action="store_true",
        help="Run Optuna hyperparameter tuning before training",
    )
    # The three integer options share the same shape, so register them together.
    for flag, default, help_text in (
        ("--neural-trials", 50, "Number of Optuna trials for neural network (default: 50)"),
        ("--classical-trials", 30, "Number of Optuna trials for classical ML (default: 30)"),
        ("--cv-folds", 3, "Cross-validation folds for tuning (default: 3)"),
    ):
        cli.add_argument(flag, type=int, default=default, help=help_text)
    cli.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to train on (cuda/mps/cpu, auto-detected if not specified)",
    )
    args = cli.parse_args()

    # Device selection
    device = get_device(args.device)
    print_device_info()
    print()

    # "all" is shorthand for every model family.
    selected = args.models
    if "all" in selected:
        selected = ["baseline", "classical", "neural"]
    print(f"Models to train: {selected}")

    # Load data
    print("\nLoading processed data...")
    data = load_processed_data()
    train_df = data["train_df"]
    test_df = data["test_df"]
    print(f"  Train: {len(train_df)} samples")
    print(f"  Test: {len(test_df)} samples")

    # Start from the saved hyperparameters; tuning overwrites the entries
    # of the selected models and saves the result.
    saved_params = load_hyperparameters()
    if args.tune:
        print(f"\n{rule}")
        print("HYPERPARAMETER TUNING WITH OPTUNA")
        print(rule)
        if "neural" in selected:
            saved_params["neural"] = tune_neural(
                train_df,
                device=str(device),
                n_trials=args.neural_trials,
                n_folds=args.cv_folds,
            )
        if "classical" in selected:
            saved_params["classical"] = tune_classical(
                train_df,
                n_trials=args.classical_trials,
                n_folds=args.cv_folds,
            )
        save_hyperparameters(saved_params)

    # Hyperparameters actually used for the final fit.
    neural_params = get_neural_params(saved_params)
    classical_params = get_classical_params(saved_params)

    print(f"\n{rule}")
    print("TRAINING MODELS")
    print(rule)

    # Models that were not selected stay None and are passed on as such.
    baseline_model = classical_model = neural_model = None
    if "baseline" in selected:
        baseline_model = train_baseline(train_df)
        print()
    if "classical" in selected:
        classical_model = train_classical(train_df, classical_params)
        print()
    if "neural" in selected:
        neural_model = train_neural(train_df, neural_params, str(device))
        print()

    save_models(
        baseline_model,
        classical_model,
        neural_model,
        {"neural": neural_params, "classical": classical_params},
    )
    print("\nTraining complete!")


if __name__ == "__main__":
    main()

src/brewmatch/tuning.py ADDED Viewed

	@@ -0,0 +1,550 @@

+"""
+Hyperparameter Tuning with Optuna
+This module provides automated hyperparameter optimization for all BrewMatch models
+using Optuna's Bayesian optimization framework.
+Optimizes:
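+- Neural network: embedding_dim, hidden_dim, learning_rate, margin, batch_size
+  (dropout is also sampled, but is not currently passed to the model)
+- Classical ML: n_neighbors, method (knn/cosine), normalization
+- Baseline: not tuned; fitted with the fixed "mean" strategy for comparison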
+Uses cross-validation for robust evaluation and early pruning for efficiency.
+"""
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+import numpy as np
+import optuna
+from optuna.pruners import MedianPruner
+from optuna.samplers import TPESampler
+import pandas as pd
+import torch
+from brewmatch.config import (
+    CHECKPOINTS_DIR,
+    K_VALUES,
+    PROJECT_ROOT,
+    TASTE_FEATURES,
+)
+from brewmatch.data import load_processed_data
+from brewmatch.models import (
+    NaiveBaselineRecommender,
+    ClassicalMLRecommender,
+    NeuralRecommender,
+)
+from brewmatch.evaluation import evaluate_model
+TUNING_DIR = PROJECT_ROOT / "tuning"
def create_cross_validation_splits(
    df: pd.DataFrame,
    n_folds: int = 5,
    random_state: int = 42,
) -> list[tuple[pd.DataFrame, pd.DataFrame]]:
    """Create shuffled k-fold cross-validation splits.

    Row positions are shuffled once and cut into ``n_folds`` contiguous
    validation folds of ``len(df) // n_folds`` rows each; the last fold also
    takes the remainder. The splits are NOT stratified.

    Note:
        This seeds NumPy's global random generator with ``random_state``
        as a side effect.

    Args:
        df: Frame to split.
        n_folds: Number of folds; must be between 2 and ``len(df)``.
        random_state: Seed for the shuffle.

    Returns:
        One ``(train_df, val_df)`` pair per fold, each with a fresh index.

    Raises:
        ValueError: If ``n_folds`` is below 2 or exceeds the number of rows.
    """
    n_rows = len(df)
    if n_folds < 2:
        raise ValueError(f"n_folds must be at least 2, got {n_folds}")
    if n_folds > n_rows:
        # Otherwise fold_size would be 0 and most validation folds empty.
        raise ValueError(
            f"n_folds ({n_folds}) cannot exceed the number of rows ({n_rows})"
        )

    np.random.seed(random_state)
    indices = np.random.permutation(n_rows)
    fold_size = n_rows // n_folds

    splits = []
    for fold in range(n_folds):
        start = fold * fold_size
        # The final fold absorbs the rows left over by integer division.
        end = start + fold_size if fold < n_folds - 1 else n_rows
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        splits.append(
            (
                df.iloc[train_indices].reset_index(drop=True),
                df.iloc[val_indices].reset_index(drop=True),
            )
        )
    return splits
def objective_neural(
    trial: optuna.Trial,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    device: str,
) -> float:
    """Score one Optuna trial of the neural recommender on a single split.

    Samples the hyperparameters from ``trial``, fits a ``NeuralRecommender``
    on ``train_df`` for 50 epochs and returns its Precision@5 on ``val_df``
    (0.0 when the metric is missing from the evaluation result).
    """
    # Sampling order is kept fixed so the search space is unchanged.
    sampled = {
        "embedding_dim": trial.suggest_int("embedding_dim", 16, 128, step=16),
        "hidden_dim": trial.suggest_int("hidden_dim", 32, 256, step=32),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "margin": trial.suggest_float("margin", 0.1, 1.0),
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64, 128]),
        # NOTE(review): dropout is sampled but never passed to the model
        # below, so it has no effect on the score - confirm whether
        # NeuralRecommender should receive it.
        "dropout": trial.suggest_float("dropout", 0.0, 0.5),
    }

    model = NeuralRecommender(
        embedding_dim=sampled["embedding_dim"],
        hidden_dim=sampled["hidden_dim"],
        learning_rate=sampled["learning_rate"],
        margin=sampled["margin"],
        device=device,
    )
    model.fit(
        X=train_df[TASTE_FEATURES].values,
        metadata=train_df,
        epochs=50,  # Reduced for faster tuning
        batch_size=sampled["batch_size"],
        verbose=False,
    )

    metrics = evaluate_model(
        model,
        {"X": val_df[TASTE_FEATURES].values, "metadata": val_df},
        k_values=[5],
    )
    # Primary tuning metric: Precision@5.
    return metrics.get("precision@k", {}).get(5, 0.0)
def objective_classical(
    trial: optuna.Trial,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
) -> float:
    """Score one Optuna trial of the classical recommender on a single split.

    Samples ``method``, ``n_neighbors`` and ``normalize`` from ``trial``,
    fits a ``ClassicalMLRecommender`` on ``train_df`` and returns its
    Precision@5 on ``val_df`` (0.0 when the metric is missing).
    """
    model = ClassicalMLRecommender(
        method=trial.suggest_categorical("method", ["knn", "cosine"]),
        n_neighbors=trial.suggest_int("n_neighbors", 5, 100),
        normalize=trial.suggest_categorical("normalize", [True, False]),
    )
    model.fit(train_df[TASTE_FEATURES].values, train_df)

    metrics = evaluate_model(
        model,
        {"X": val_df[TASTE_FEATURES].values, "metadata": val_df},
        k_values=[5],
    )
    return metrics.get("precision@k", {}).get(5, 0.0)
def tune_neural(
    train_df: pd.DataFrame,
    n_trials: int = 50,
    n_folds: int = 3,
    device: str = "cuda",
    study_name: str = "neural_tuning",
) -> dict[str, Any]:
    """
    Run an Optuna study over the neural network hyperparameters.

    Every trial is evaluated with ``objective_neural`` on each
    cross-validation fold. The running mean score is reported after every
    fold so the median pruner can stop a trial early.

    Args:
        train_df: Training data
        n_trials: Number of optimization trials
        n_folds: Number of cross-validation folds
        device: PyTorch device
        study_name: Name for the Optuna study

    Returns:
        Dictionary with keys ``best_params``, ``best_value``, ``n_trials``
        and ``study_name``
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("NEURAL NETWORK HYPERPARAMETER TUNING")
    print(rule)
    print(f"Trials: {n_trials}, CV Folds: {n_folds}, Device: {device}")

    folds = create_cross_validation_splits(train_df, n_folds=n_folds)

    def cv_objective(trial: optuna.Trial) -> float:
        """Mean score over the folds, with per-fold pruning checks."""
        fold_scores = []
        for step, (fold_train, fold_val) in enumerate(folds):
            fold_scores.append(objective_neural(trial, fold_train, fold_val, device))
            # The running mean is the intermediate value the pruner sees.
            trial.report(np.mean(fold_scores), step)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return np.mean(fold_scores)

    study = optuna.create_study(
        study_name=study_name,
        direction="maximize",
        sampler=TPESampler(seed=42),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1),
    )
    study.optimize(
        cv_objective,
        n_trials=n_trials,
        show_progress_bar=True,
        gc_after_trial=True,
    )

    best = study.best_trial
    print("\nBest trial:")
    print(f"  Value (Precision@5): {best.value:.4f}")
    print(f"  Params: {best.params}")

    return {
        "best_params": best.params,
        "best_value": best.value,
        "n_trials": len(study.trials),
        "study_name": study_name,
    }
def tune_classical(
    train_df: pd.DataFrame,
    n_trials: int = 30,
    n_folds: int = 3,
    study_name: str = "classical_tuning",
) -> dict[str, Any]:
    """
    Run an Optuna study over the classical ML hyperparameters.

    Every trial is scored as the mean of ``objective_classical`` across the
    cross-validation folds; no pruner is configured for this study.

    Args:
        train_df: Training data
        n_trials: Number of optimization trials
        n_folds: Number of cross-validation folds
        study_name: Name for the Optuna study

    Returns:
        Dictionary with keys ``best_params``, ``best_value``, ``n_trials``
        and ``study_name``
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("CLASSICAL ML HYPERPARAMETER TUNING")
    print(rule)
    print(f"Trials: {n_trials}, CV Folds: {n_folds}")

    folds = create_cross_validation_splits(train_df, n_folds=n_folds)

    def cv_objective(trial: optuna.Trial) -> float:
        return np.mean(
            [
                objective_classical(trial, fold_train, fold_val)
                for fold_train, fold_val in folds
            ]
        )

    study = optuna.create_study(
        study_name=study_name,
        direction="maximize",
        sampler=TPESampler(seed=42),
    )
    study.optimize(
        cv_objective,
        n_trials=n_trials,
        show_progress_bar=True,
    )

    best = study.best_trial
    print("\nBest trial:")
    print(f"  Value (Precision@5): {best.value:.4f}")
    print(f"  Params: {best.params}")

    return {
        "best_params": best.params,
        "best_value": best.value,
        "n_trials": len(study.trials),
        "study_name": study_name,
    }
def train_with_best_params(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    neural_params: dict[str, Any] | None,
    classical_params: dict[str, Any] | None,
    device: str,
) -> dict[str, Any]:
    """Train final models with best hyperparameters and evaluate on test set.

    A model family is trained only when its parameter dict is provided; the
    "mean" baseline is always trained for comparison. Every trained model is
    evaluated on ``test_df`` at ``K_VALUES`` and saved under
    ``CHECKPOINTS_DIR``.

    Args:
        train_df: Training data.
        test_df: Held-out test data.
        neural_params: Tuned neural hyperparameters (``embedding_dim``,
            ``hidden_dim``, ``learning_rate``, ``margin``, ``batch_size``),
            or None to skip the neural model.
        classical_params: Tuned classical hyperparameters (``method``,
            ``n_neighbors``, ``normalize``), or None to skip it.
        device: Device passed to ``NeuralRecommender``.

    Returns:
        Mapping of model name to ``{"params": ..., "metrics": ...}``; the
        ``"baseline"`` entry has only ``"metrics"``.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print("TRAINING FINAL MODELS WITH BEST PARAMETERS")
    print(banner)

    # Create the checkpoint directory up front. Every branch below saves into
    # it, but it used to be created only inside the neural branch, so a
    # classical-only or baseline-only run could save into a missing directory.
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)

    results = {}
    X_train = train_df[TASTE_FEATURES].values
    test_data = {
        "X": test_df[TASTE_FEATURES].values,
        "metadata": test_df,
    }

    # Train neural with best params
    if neural_params:
        print("\nTraining Neural Network with tuned hyperparameters...")
        model = NeuralRecommender(
            embedding_dim=neural_params["embedding_dim"],
            hidden_dim=neural_params["hidden_dim"],
            learning_rate=neural_params["learning_rate"],
            margin=neural_params["margin"],
            device=device,
        )
        model.fit(
            X=X_train,
            metadata=train_df,
            epochs=100,  # Full training
            batch_size=neural_params["batch_size"],
            verbose=True,
        )
        results["neural"] = {
            "params": neural_params,
            "metrics": evaluate_model(model, test_data, k_values=K_VALUES),
        }
        model.save(CHECKPOINTS_DIR / "neural.pt")
        print(f"Saved tuned neural model to {CHECKPOINTS_DIR / 'neural.pt'}")

    # Train classical with best params
    if classical_params:
        print("\nTraining Classical ML with tuned hyperparameters...")
        model = ClassicalMLRecommender(
            method=classical_params["method"],
            n_neighbors=classical_params["n_neighbors"],
            normalize=classical_params["normalize"],
        )
        model.fit(X_train, train_df)
        results["classical"] = {
            "params": classical_params,
            "metrics": evaluate_model(model, test_data, k_values=K_VALUES),
        }
        model.save(CHECKPOINTS_DIR / "classical.pkl")
        print(f"Saved tuned classical model to {CHECKPOINTS_DIR / 'classical.pkl'}")

    # Also train baseline for comparison
    print("\nTraining Baseline for comparison...")
    baseline = NaiveBaselineRecommender(strategy="mean")
    baseline.fit(X_train, train_df)
    results["baseline"] = {
        "metrics": evaluate_model(baseline, test_data, k_values=K_VALUES)
    }
    baseline.save(CHECKPOINTS_DIR / "baseline.pkl")
    return results
+def generate_tuning_report(
+    neural_results: dict[str, Any] | None,
+    classical_results: dict[str, Any] | None,
+    final_results: dict[str, Any],
+    output_dir: Path,
+) -> str:
+    """Generate a comprehensive tuning report."""
+    report = []
+    report.append("=" * 60)
+    report.append("HYPERPARAMETER TUNING REPORT")
+    report.append("=" * 60)
+    report.append("")
+    if neural_results:
+        report.append("NEURAL NETWORK")
+        report.append("-" * 40)
+        report.append(f"Trials completed: {neural_results['n_trials']}")
+        report.append(f"Best CV Precision@5: {neural_results['best_value']:.4f}")
+        report.append("Best hyperparameters:")
+        for param, value in neural_results["best_params"].items():
+            report.append(f"  - {param}: {value}")
+        report.append("")
+    if classical_results:
+        report.append("CLASSICAL ML")
+        report.append("-" * 40)
+        report.append(f"Trials completed: {classical_results['n_trials']}")
+        report.append(f"Best CV Precision@5: {classical_results['best_value']:.4f}")
+        report.append("Best hyperparameters:")
+        for param, value in classical_results["best_params"].items():
+            report.append(f"  - {param}: {value}")
+        report.append("")
+    report.append("FINAL TEST SET PERFORMANCE")
+    report.append("-" * 40)
+    for model_name, result in final_results.items():
+        metrics = result["metrics"]
+        p5 = metrics.get("precision@k", {}).get(5, 0)
+        ndcg5 = metrics.get("ndcg@k", {}).get(5, 0)
+        report.append(f"{model_name.upper()}:")
+        report.append(f"  Precision@5: {p5:.4f}")
+        report.append(f"  NDCG@5: {ndcg5:.4f}")
+    report.append("")
+    # Improvement analysis
+    if "baseline" in final_results and "neural" in final_results:
+        baseline_p5 = final_results["baseline"]["metrics"].get("precision@k", {}).get(5, 0)
+        neural_p5 = final_results["neural"]["metrics"].get("precision@k", {}).get(5, 0)
+        if baseline_p5 > 0:
+            improvement = (neural_p5 - baseline_p5) / baseline_p5 * 100
+            report.append(f"Neural improvement over baseline: {improvement:+.1f}%")
+    report_text = "\n".join(report)
+    # Save report
+    output_dir.mkdir(parents=True, exist_ok=True)
+    with open(output_dir / "tuning_report.txt", "w") as f:
+        f.write(report_text)
+    return report_text
def main():
    """Run the tuning CLI: tune the selected models, retrain, and report.

    Tunes the neural and/or classical recommender with Optuna, writes each
    study summary to ``<output-dir>/<model>_tuning.json``, retrains the final
    models with the best parameters, stores their test metrics in
    ``final_results.json`` and prints the generated text report.
    """
    cli = argparse.ArgumentParser(
        description="Tune BrewMatch model hyperparameters with Optuna"
    )
    cli.add_argument(
        "--models",
        nargs="+",
        choices=["neural", "classical", "all"],
        default=["all"],
        help="Which models to tune",
    )
    # The three integer options share the same shape, so register them together.
    for flag, default, help_text in (
        ("--neural-trials", 50, "Number of trials for neural network tuning"),
        ("--classical-trials", 30, "Number of trials for classical ML tuning"),
        ("--cv-folds", 3, "Number of cross-validation folds"),
    ):
        cli.add_argument(flag, type=int, default=default, help=help_text)
    cli.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device for neural network training",
    )
    cli.add_argument(
        "--output-dir",
        type=str,
        default=str(TUNING_DIR),
        help="Directory to save tuning results",
    )
    args = cli.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # "all" is shorthand for both tunable model families.
    selected = args.models
    if "all" in selected:
        selected = ["neural", "classical"]

    print("HYPERPARAMETER TUNING WITH OPTUNA")
    print("=" * 60)
    print(f"Models to tune: {selected}")
    print(f"Neural trials: {args.neural_trials}")
    print(f"Classical trials: {args.classical_trials}")
    print(f"CV folds: {args.cv_folds}")
    print(f"Device: {args.device}")
    print(f"Output: {output_dir}")

    print("\nLoading data...")
    data = load_processed_data()
    train_df = data["train_df"]
    test_df = data["test_df"]
    print(f"Train: {len(train_df)}, Test: {len(test_df)}")

    # Each study summary is saved as soon as that study finishes.
    neural_results = None
    classical_results = None
    if "neural" in selected:
        neural_results = tune_neural(
            train_df=train_df,
            n_trials=args.neural_trials,
            n_folds=args.cv_folds,
            device=args.device,
        )
        with open(output_dir / "neural_tuning.json", "w") as f:
            json.dump(neural_results, f, indent=2)
    if "classical" in selected:
        classical_results = tune_classical(
            train_df=train_df,
            n_trials=args.classical_trials,
            n_folds=args.cv_folds,
        )
        with open(output_dir / "classical_tuning.json", "w") as f:
            json.dump(classical_results, f, indent=2)

    # Retrain with the winning parameters; an untuned family is passed as None.
    final_results = train_with_best_params(
        train_df=train_df,
        test_df=test_df,
        neural_params=neural_results["best_params"] if neural_results else None,
        classical_params=classical_results["best_params"] if classical_results else None,
        device=args.device,
    )

    with open(output_dir / "final_results.json", "w") as f:
        # The k values used as keys inside each metric dict are stringified
        # here before dumping.
        serializable = {
            model_name: {
                "params": result.get("params", {}),
                "metrics": {
                    metric: (
                        {str(k): score for k, score in value.items()}
                        if isinstance(value, dict)
                        else value
                    )
                    for metric, value in result["metrics"].items()
                },
            }
            for model_name, result in final_results.items()
        }
        json.dump(serializable, f, indent=2)

    report = generate_tuning_report(
        neural_results=neural_results,
        classical_results=classical_results,
        final_results=final_results,
        output_dir=output_dir,
    )
    print("\n" + report)
    print(f"\nResults saved to {output_dir}")


if __name__ == "__main__":
    main()

src/brewmatch/utils.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""Utility functions for BrewMatch."""
+import json
+import pickle
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+import numpy as np
+import torch
def save_pickle(obj: Any, path: Union[str, Path]) -> None:
    """Pickle ``obj`` to ``path``, creating missing parent directories."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("wb") as handle:
        pickle.dump(obj, handle)
def load_pickle(path: Union[str, Path]) -> Any:
    """Return the object unpickled from the file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be used on files from a trusted source.
    """
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
def save_json(obj: Dict, path: Union[str, Path], indent: int = 2) -> None:
    """Write ``obj`` as JSON to ``path``, creating missing parent directories.

    Args:
        obj: JSON-serializable dict.
        path: Destination file.
        indent: Indentation width passed to ``json.dump``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(obj, handle, indent=indent)
def load_json(path: Union[str, Path]) -> Dict:
    """Parse the JSON file at ``path`` and return its contents."""
    with open(path, "r") as handle:
        parsed = json.load(handle)
    return parsed
def set_seed(seed: int) -> None:
    """Set random seeds for reproducibility.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (plus the CUDA generators when CUDA is available).

    Args:
        seed: Seed value applied to every generator.
    """
    # Imported locally so this function does not depend on a module-level
    # import. The stdlib generator was previously left unseeded.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Cosine similarity between query vector(s) and a set of reference rows.

    Args:
        a: Query vector(s) of shape (d,) or (n, d)
        b: Reference vectors of shape (m, d)

    Returns:
        Similarity scores of shape (m,) for a 1-D query, otherwise (n, m)
    """
    single_query = a.ndim == 1
    if single_query:
        a = a.reshape(1, -1)

    # The small constant keeps all-zero rows from dividing by zero.
    a_unit = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
    b_unit = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)

    scores = np.dot(a_unit, b_unit.T)
    return scores.squeeze(0) if single_query else scores
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Euclidean distance between query vector(s) and a set of reference rows.

    Args:
        a: Query vector(s) of shape (d,) or (n, d)
        b: Reference vectors of shape (m, d)

    Returns:
        Distance scores of shape (m,) for a 1-D query, otherwise (n, m)
    """
    single_query = a.ndim == 1
    if single_query:
        a = a.reshape(1, -1)

    # Expand ||a - b||^2 as ||a||^2 + ||b||^2 - 2*a.b so that all pairs are
    # computed with one matrix product.
    a_norms = np.sum(a ** 2, axis=1, keepdims=True)
    b_norms = np.sum(b ** 2, axis=1).reshape(1, -1)
    squared = a_norms + b_norms - 2 * np.dot(a, b.T)

    # Negative values are clamped to zero before the square root.
    distances = np.sqrt(np.maximum(squared, 0))
    return distances.squeeze(0) if single_query else distances
def normalize_preferences(
    preferences: Dict[str, float],
    feature_names: list,
    scaler: Optional[Any] = None,
) -> np.ndarray:
    """
    Turn a preferences dict into a feature vector ordered by ``feature_names``.

    Args:
        preferences: Dict mapping feature names to values (0-10 scale)
        feature_names: List of feature names in order
        scaler: Optional object with a ``transform`` method, applied to the
            vector as a single-row 2-D array

    Returns:
        Feature vector with size-one dimensions squeezed out
    """
    # Features missing from the dict fall back to 5.0.
    ordered = [preferences.get(name, 5.0) for name in feature_names]
    row = np.array(ordered).reshape(1, -1)
    if scaler is not None:
        row = scaler.transform(row)
    return row.squeeze()
def format_recommendations(
    indices: np.ndarray,
    similarities: np.ndarray,
    metadata: Any,
    feature_names: list,
    features: np.ndarray,
) -> list:
    """
    Format recommendation results for API response.

    Args:
        indices: Array of recommended coffee indices
        similarities: Similarity scores for recommendations
        metadata: DataFrame or dict with coffee metadata. A dict may provide
            "countries" and "processing_methods" sequences indexed by coffee
            index; a missing key yields "Unknown".
        feature_names: List of taste feature names
        features: Feature matrix for coffees

    Returns:
        List of recommendation dicts with "id", "similarity" and "scores",
        plus "country" / "processing_method" (and "total_cup_points" for
        DataFrame metadata) when metadata is a DataFrame or dict
    """

    def lookup(values: Any, idx: Any) -> Any:
        # The old fallback built a list of len(indices) and indexed it by
        # coffee index, which raised IndexError whenever the key was missing
        # and the index was >= len(indices).
        return "Unknown" if values is None else values[idx]

    recommendations = []
    for idx, sim in zip(indices, similarities):
        rec = {
            "id": int(idx),
            "similarity": float(sim),
            "scores": {
                name: float(features[idx, i])
                for i, name in enumerate(feature_names)
            },
        }
        # Add metadata if available
        if hasattr(metadata, "iloc"):
            row = metadata.iloc[idx]
            rec["country"] = str(row.get("Country of Origin", "Unknown"))
            rec["processing_method"] = str(row.get("Processing Method", "Unknown"))
            rec["total_cup_points"] = float(row.get("Total Cup Points", 0))
        elif isinstance(metadata, dict):
            rec["country"] = lookup(metadata.get("countries"), idx)
            rec["processing_method"] = lookup(metadata.get("processing_methods"), idx)
        recommendations.append(rec)
    return recommendations

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff